def test_get_partial_table_wrong_input():
    delay = 10
    data = ML_prepare(delay)
    section = "blah"
    with pytest.raises(AssertionError):
        t = data.get_partial_table(x_section=section, y_labels=True)
def test_get_columns_of_sections():
    delay = 2
    data = ML_prepare(delay)
    res = data.get_columns_of_sections()
    assert isinstance(res, tuple)
    for i in range(3):
        assert isinstance(res[i], list)
def confusion_matrix_SVC(label: str, section: str, delay: int):
    """
    Train a LinearSVC pipeline on the chosen section and label, then plot the
    confusion matrix (without normalization) and save it under figures/SVC/.
    """
    data = ML_prepare(delay)
    table_xy = data.get_partial_table(x_section=section, y_labels=True)
    X = table_xy.loc[:, 'x']
    y = table_xy['y', label]
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.25, random_state=42)
    clf = make_pipeline(
        StandardScaler(), LinearSVC(random_state=0, tol=1e-5, max_iter=100000))
    y_pred = clf.fit(X_train, y_train).predict(X_test)

    # encode the string labels as integers before building the confusion matrix
    le = preprocessing.LabelEncoder()
    le.fit(list(y_pred))
    y_pred = le.transform(y_pred)
    y_test = le.transform(np.array(y_test))
    cnf_matrix = confusion_matrix(y_test, y_pred)

    plt.figure()
    np.set_printoptions(precision=2)
    plot_confusion_matrix(
        cnf_matrix,
        classes=le.classes_,
        normalize=False,
        title='Confusion matrix, without normalization')
    plt.savefig(
        "figures/SVC/LinearSVC_Confusion matrix, without normalization.png",
        bbox_inches="tight")
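# Hedged usage sketch (the helper name and argument values below are illustrative,
# not part of the original code): save the LinearSVC confusion matrix for the
# filaments section predicting SVI_label with a 3-day delay. Assumes the
# figures/SVC/ directory exists.
def _example_confusion_matrix_SVC():
    confusion_matrix_SVC(label="SVI_label", section="filaments", delay=3)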
def test_get_partial_table_no_nans():
    delay = 10
    data = ML_prepare(delay)
    sections = ["all", "total_counts", "filaments", "various"]
    for section in sections:
        t = data.get_partial_table(x_section=section)
        assert not t.isnull().values.any()
def test_get_partial_table_x_sections_lengths():
    delay = 6
    data = ML_prepare(delay)
    sections = {"all": 27, "total_counts": 9, "filaments": 9, "various": 18}
    for section in sections:
        t = data.get_partial_table(x_section=section)
        assert t["x"].shape[1] == sections[section]
def test_get_partial_table_y_labels():
    delay = 0
    data = ML_prepare(delay)
    t1 = data.get_partial_table(x_section="all", y_labels=True)
    assert t1["y"].columns.to_list() == ["SV_label", "SVI_label"]
    t2 = data.get_partial_table(x_section="all", y_labels=False)
    assert t2["y"].columns.to_list() == ["Settling_velocity", "SVI"]
def test_find_closest_date_out_of_range():
    delay = 8
    data = ML_prepare(delay)
    date0 = data._svi_lst[0].index[-1]  # this date is out of range
    res = data._find_closest_date(bio_reactor_i=0, date0=date0)
    assert not res  # assert res is False
def test_regr_model_func():
    data = ML_prepare(1)
    table_xy = data.get_partial_table(x_section="various", y_labels=False)
    X, y = table_xy.loc[:, "x"], table_xy.loc[:, "y"]
    models_dict = create_models_dict()
    regr_model = list(models_dict.keys())[0]
    score, fitted_model = regr_model_func(X, y, regr_model)
    assert isinstance(score, float)
    # the fitted model should come from sklearn
    module = getattr(fitted_model, "__module__", "")
    assert "sklearn" in module
def test_find_closest_date_missing_date():
    # create a missing date in the svi data
    delay = 8
    data = ML_prepare(delay)
    date0 = data._micro_lst[0].index[0]
    missing_date = date0 + timedelta(days=data.delay)
    # remove the date
    data._svi_lst[0].drop(missing_date, inplace=True)
    # the closest date found should be one day before the missing date
    found_date = data._find_closest_date(bio_reactor_i=0, date0=date0)
    assert found_date + timedelta(days=1) == missing_date
def test_loop_over_sections_and_y():
    data = ML_prepare(7)
    models_dict = create_models_dict()
    regr_model = list(models_dict.keys())[0]
    res = loop_over_sections_and_y(data, regr_model)
    assert isinstance(res, tuple)
    assert len(res) == 8
def choose_k_value(section: str, label: str, delay: int):
    """
    Plot a graph of scores_k by k_range for the KNeighbors model.
    The user needs to choose the k value based on the resulting graph.

    Parameters
    ----------
    section : str
        microorganisms section - should be: all, total_counts, filaments or various
    label : str
        label - should be: SV_label or SVI_label
    delay : int
        delay in days between the microscopic test and the SVI test

    return
    ----------
    k : int
        k chosen by the user
    """
    if section not in {'all', 'total_counts', 'filaments', 'various'}:
        raise ValueError(
            "Please supply a valid section value: 'all', 'total_counts', 'filaments', 'various'"
        )
    if label not in {'SV_label', 'SVI_label'}:
        raise ValueError(
            "Please supply a valid label value: 'SV_label', 'SVI_label'")

    data = ML_prepare(delay)
    table_xy = data.get_partial_table(x_section=section, y_labels=True)
    X = table_xy.loc[:, 'x']
    y = table_xy['y', label]
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.25, random_state=42)
    scores_data = check_K_values(20, X_train, y_train, X_test, y_test)

    sns.set()
    sns.stripplot(data=scores_data, x='k', y='score', size=10)
    plt.ylim(0, 1)
    print("Please look at the graph and choose a k value. Press enter to continue.")
    input()
    plt.show()
    print("Please insert the k value:")
    k = int(input())
    return k
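# Hedged usage sketch (the helper name and argument values are illustrative):
# interactively pick k for the filaments / SVI_label combination with a 3-day delay;
# the returned k can then be passed to create_score_list_Knn defined further down.
def _example_choose_k_value():
    k = choose_k_value(section="filaments", label="SVI_label", delay=3)
    return k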
def test_init_types():
    delay = 3
    data = ML_prepare(delay)
    # check types and lengths
    assert isinstance(data, ML_prepare)
    assert data._delay == delay
    assert isinstance(data._svi_lst, list)
    assert isinstance(data._micro_lst, list)
def test_init_df_lists():
    delay = 8
    data = ML_prepare(delay)
    assert len(data._svi_lst) == 4
    assert len(data._micro_lst) == 4
    for i in range(4):
        assert isinstance(data._svi_lst[i], pd.DataFrame)
        assert isinstance(data._micro_lst[i], pd.DataFrame)
def get_day3_filaments_svi_data():
    '''
    Generates the data that produced the most promising regression results:
    filaments microscopic data, paired with SVI results from 3 days later.

    return
    --------
    filaments_x: pd.DataFrame
        microscopic measurements of all filament organisms
    filaments_svi: pd.Series
        matching SVI results from 3 days later
    '''
    data = ML_prepare(delay=3)
    filaments_table = data.get_partial_table(x_section="filaments", y_labels=False)
    filaments_x = filaments_table.loc[:, "x"]
    filaments_svi = filaments_table.loc[:, ("y", "SVI")]
    return filaments_x, filaments_svi
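# Hedged usage sketch (illustrative only): fit a plain LinearRegression on the
# day-3 filaments data. LinearRegression is an assumption here; the original
# pipeline selects its regressors via create_models_dict().
def _example_day3_filaments_fit():
    from sklearn.linear_model import LinearRegression

    filaments_x, filaments_svi = get_day3_filaments_svi_data()
    reg = LinearRegression().fit(filaments_x, filaments_svi)
    return reg.score(filaments_x, filaments_svi)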
def create_score_list_Knn(labels: list, sections: list, delay_lst: list, k: int) -> list:
    """
    Create a KNeighbors score list of all the predictions, by label type,
    microorganism section and delay.

    Parameters
    ----------
    labels : list
        list of labels: SV and SVI
    sections : list
        list of microorganism sections: all, total_counts, filaments and various
    delay_lst : list
        list of delays in days between the microscopic test and the SVI test
    k : int
        k value for the KNeighbors model

    return
    ----------
    score_delay : list
        list of score predictions for each combination of label, section and delay
    """
    if k <= 0:
        raise ValueError("Please supply a k value > 0")

    score_delay = []
    for label in labels:
        for section in sections:
            for delay in delay_lst:
                data = ML_prepare(delay)
                table_xy = data.get_partial_table(x_section=section, y_labels=True)
                X = table_xy.loc[:, 'x']
                y = table_xy['y', label]
                X_train, X_test, y_train, y_test = train_test_split(
                    X, y, test_size=0.25, random_state=42)
                knn = KNeighborsClassifier(n_neighbors=k)
                knn.fit(X_train, y_train)
                y_predict = knn.predict(X_test)
                score_label = score_by_label(y_test, y_predict)
                score = knn.score(X_test, y_test)
                score_label.append(score)
                score_delay.append(score_label)
    return score_delay
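# Hedged usage sketch (helper name and argument values are illustrative): score KNN
# predictions for both labels, all four sections, and delays of 1-12 days. k=5 is an
# assumption; in the original workflow it would come from choose_k_value above.
def _example_create_score_list_Knn(k: int = 5):
    labels = ["SV_label", "SVI_label"]
    sections = ["all", "total_counts", "filaments", "various"]
    delay_lst = list(range(1, 13))
    return create_score_list_Knn(labels, sections, delay_lst, k)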
def create_score_list_SVC(labels: list, sections: list, delay_lst: list) -> list:
    """
    Create a LinearSVC score list of all the predictions, by label type,
    microorganism section and delay.

    Parameters
    ----------
    labels : list
        list of labels: SV and SVI
    sections : list
        list of microorganism sections: all, total_counts, filaments and various
    delay_lst : list
        list of delays in days between the microscopic test and the SVI test

    return
    ----------
    score_delay : list
        list of score predictions for each combination of label, section and delay
    """
    score_delay = []
    for label in labels:
        for section in sections:
            for delay in delay_lst:
                data = ML_prepare(delay)
                table_xy = data.get_partial_table(x_section=section, y_labels=True)
                X = table_xy.loc[:, 'x']
                y = table_xy['y', label]
                X_train, X_test, y_train, y_test = train_test_split(
                    X, y, test_size=0.25, random_state=42)
                clf = make_pipeline(
                    StandardScaler(),
                    LinearSVC(random_state=0, tol=1e-5, max_iter=100000))
                clf.fit(X_train, y_train)
                y_predict = clf.predict(X_test)
                score_label = score_by_label(y_test, y_predict)
                score = clf.score(X_test, y_test)
                score_label.append(score)
                score_delay.append(score_label)
                # print(f"result for delay= {delay}, section= {section}, label={label} : score_bad={score_label[0]}, score_reasonable={score_label[1]}, score_good={score_label[2]}, score={score_label[3]}")
    return score_delay
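# Hedged usage sketch (helper name and argument values are illustrative): the
# LinearSVC analogue of the KNN example above, over the same labels, sections
# and delays.
def _example_create_score_list_SVC():
    labels = ["SV_label", "SVI_label"]
    sections = ["all", "total_counts", "filaments", "various"]
    delay_lst = list(range(1, 13))
    return create_score_list_SVC(labels, sections, delay_lst)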
def test_indirect_join_x_y():
    delay = 3
    data = ML_prepare(delay)
    x1 = data.delay_table.loc[:, "micro"].reset_index(drop=True)
    x2 = data.x.reset_index(drop=True)
    y1 = data.delay_table.loc[:, "svi"].reset_index(drop=True)
    y2 = data.y.reset_index(drop=True)
    assert x1.equals(x2)
    assert y1.equals(y2)
def test_indirect_create_x_y_bioreactor():
    delay = 10
    data = ML_prepare(delay)
    delay_time = timedelta(days=data.delay)
    # assert that the date difference is correct
    assert data._y.loc["3", :].index[0] - data._x.loc["3", :].index[0] == delay_time
    # assert that out-of-range dates were removed from x as well
    assert data._micro_lst[3].shape[0] == data._x.loc["4", :].shape[0] + 1
def test_create_list_of_tidy_df_by_day_types():
    data = ML_prepare(4)
    models_dict = create_models_dict()
    delay_range = range(8, 14)
    scores_models_dict = get_scores_of_all_models(
        models_dict, delay_range=delay_range, print_flag=False
    )
    days_df_dict = create_list_of_tidy_df_by_day(scores_models_dict, delay_range)
    assert isinstance(days_df_dict, dict)
    assert set(days_df_dict.keys()) == set(delay_range)
    assert isinstance(days_df_dict[8], pd.DataFrame)
def test_create_list_of_tidy_df_by_day_values():
    data = ML_prepare(4)
    models_dict = create_models_dict()
    delay_range = range(1, 13)
    scores_models_dict = get_scores_of_all_models(
        models_dict, delay_range=delay_range, print_flag=False
    )
    days_df_dict = create_list_of_tidy_df_by_day(scores_models_dict, delay_range)
    df4 = days_df_dict[4]
    assert df4.shape == (40, 4)
    assert all(df4["model"].value_counts() == 8)
    assert all(df4["sv_svi"].value_counts() == 20)
    assert all(df4["section"].value_counts() == 10)
def get_scores_of_model(regr_model, model_name: str, delay_range, print_flag: bool = True):
    '''
    Generates, for a given model, a dictionary with scores for all delay options
    and all 8 "x" and "y" combinations, using helper functions.

    Parameters
    ---------
    regr_model: sklearn model object
    model_name: str
    delay_range: range
    print_flag: bool
        whether to print all results

    return
    ---------
    scores_by_delay_dict: dict
    '''
    scores_by_delay_dict = {}
    for delay in delay_range:
        data = ML_prepare(delay=delay)
        scores_by_delay_dict[delay] = loop_over_sections_and_y(
            data=data, regr_model=regr_model)

    if print_flag:
        print(f"\nmodel: {model_name}")
        for delay in scores_by_delay_dict:
            tup_delay = scores_by_delay_dict[delay]
            max_score = max(tup_delay)
            name = [
                tup_delay._fields[i]
                for i in range(len(tup_delay))
                if tup_delay[i] == max_score
            ]
            print(f"max score for delay {delay}\t {max_score:.2f}, for {name[0]}")

    return scores_by_delay_dict
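# Hedged usage sketch (helper name and model choice are assumptions; the original
# pipeline takes its models from create_models_dict()): score one regressor over
# delays of 1-12 days and print the best section per delay.
def _example_get_scores_of_model():
    from sklearn.linear_model import LinearRegression

    scores = get_scores_of_model(
        regr_model=LinearRegression(),
        model_name="LinearRegression",
        delay_range=range(1, 13),
        print_flag=True,
    )
    return scores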
def create_section_and_PCA(data: ML_prepare, labled: bool = False):
    """
    Creates a PCA plot for every section (organism group) of the data:
    "all", "filaments", "total_counts", "various", using the helper function
    "pca_plot". Plots are colored by the "y" results, either labeled or not.

    Parameters
    ----------
    data: ML_prepare
    labled: bool
    """
    section_lst = ["all", "filaments", "total_counts", "various"]
    fig, ax = plt.subplots(4, 2)
    for i in range(len(section_lst)):
        table_xy = data.get_partial_table(x_section=section_lst[i], y_labels=labled)
        y_cols = table_xy.loc[:, "y"].columns.tolist()
        for j in range(2):
            # model on y = y_cols[j]
            pca_plot(table_xy, color_col=y_cols[j], section=section_lst[i], ax_i=ax[i, j])

    fig.set_figheight(15)
    fig.set_figwidth(15)
    fig.suptitle(
        f"PCA of groups, colored by output, delay = {data.delay} days",
        fontsize=20,
        y=1.02,
    )
    plt.tight_layout()

    fig_name = "PCA_by_groups"
    if labled:
        fig_name = fig_name + "_labled"
    fig.savefig("figures/" + fig_name + ".png", dpi=150, bbox_inches="tight")
    plt.show()
def loop_over_sections_and_y(data: ML_prepare, regr_model):
    '''
    Generates, for a given model and a given delayed dataset, scores for all 8
    combinations: "all_sv", "all_svi", "filaments_sv", "filaments_svi",
    "total_counts_sv", "total_counts_svi", "various_sv", "various_svi".
    Saves the results in a namedtuple.

    Parameters
    ---------
    data: ML_prepare
        already generated with the chosen delay
    regr_model: sklearn model object

    return
    ---------
    tup_scores: namedtuple
    '''
    scores_lst = []
    section_lst = ["all", "filaments", "total_counts", "various"]
    for i in range(len(section_lst)):
        table_xy = data.get_partial_table(x_section=section_lst[i], y_labels=False)
        y_cols = table_xy.loc[:, "y"].columns.tolist()
        for j in range(2):
            X = table_xy.loc[:, "x"]
            y = table_xy.loc[:, ("y", y_cols[j])]
            score, _ = regr_model_func(X, y, regr_model)
            scores_lst.append(score)

    tup_scores = insert_scores_to_namedtuple(scores_lst)
    return tup_scores
        fontsize=20, y=1.00)
    sns.set()
    g = sns.stripplot(data=df_weights, x="organism", y="weight", size=10)
    g.set_xlabel("Organism", fontsize=20)
    g.set_ylabel("weight in model", fontsize=20)
    plt.xticks(rotation=90)
    fig3.savefig("figures/Last_model_coefs.png", dpi=150, bbox_inches="tight")
    plt.show()
    return df_weights


if __name__ == "__main__":
    # PCA for a first glance, using a 4-day delay
    data = ML_prepare(delay=4)
    create_section_and_PCA(data, labled=True)
    create_section_and_PCA(data, labled=False)

    # create the models and the desired delay range
    models_dict = create_models_dict()
    delay_range = range(1, 13)

    # get scores for all models for all sections, and plot them by days
    scores_models_dict = get_scores_of_all_models(
        models_dict, delay_range=delay_range, print_flag=False)
    days_df_dict = create_list_of_tidy_df_by_day(scores_models_dict, delay_range)
    days_swarmplot(days_df_dict)
def test_indirect_read_and_index_svi_tables():
    delay = 2
    data = ML_prepare(delay)
    for i in range(4):
        assert isinstance(data._svi_lst[i].index, pd.DatetimeIndex)
def test_indirect_create_x_y_delayed_length():
    delay = 8
    data = ML_prepare(delay)
    assert data._x.shape[0] == data._y.shape[0]
def test_init_df_xy():
    delay = 5
    data = ML_prepare(delay)
    assert isinstance(data._x, pd.DataFrame)
    assert isinstance(data._y, pd.DataFrame)
    assert isinstance(data.delay_table, pd.DataFrame)