Esempio n. 1
0
def test_get_partial_table_wrong_input():
    """An unrecognised ``x_section`` value must raise ``AssertionError``."""
    prepared = ML_prepare(10)
    bad_section = "blah"

    with pytest.raises(AssertionError):
        prepared.get_partial_table(x_section=bad_section, y_labels=True)
Esempio n. 2
0
def test_get_columns_of_sections():
    """The result is a tuple whose first three items are lists."""
    columns = ML_prepare(2).get_columns_of_sections()

    assert isinstance(columns, tuple)
    for position in (0, 1, 2):
        assert isinstance(columns[position], list)
Esempio n. 3
0
def confusion_matrix_SVC(label: str, section: str, delay: int):
    """
    Fit a LinearSVC pipeline and save its confusion matrix as a figure.

    The data is split 75/25 (fixed random state), a
    ``StandardScaler`` + ``LinearSVC`` pipeline is fitted on the training
    part, and the confusion matrix of the test part is drawn with
    ``plot_confusion_matrix`` and written to ``figures/SVC/``.

    Parameters
    ----------
    label : str
        name of the "y" column to predict
    section : str
        microorganisms section passed to ``get_partial_table``
    delay : int
        delay passed to ``ML_prepare``

    Notes
    -----
    The output file name does not depend on the arguments, so every call
    overwrites the same image.
    """
    data = ML_prepare(delay)
    table_xy = data.get_partial_table(x_section=section, y_labels=True)
    X = table_xy.loc[:, 'x']
    y = table_xy['y', label]
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.25,
                                                        random_state=42)
    clf = make_pipeline(StandardScaler(),
                        LinearSVC(random_state=0, tol=1e-5, max_iter=100000))
    y_pred = np.asarray(clf.fit(X_train, y_train).predict(X_test))
    y_true = np.asarray(y_test)

    # Fit the encoder on the union of true and predicted labels: a class
    # that occurs in the test split but is never predicted would otherwise
    # make ``transform`` raise ValueError on the true labels.
    le = preprocessing.LabelEncoder()
    le.fit(np.concatenate([y_true, y_pred]))
    y_pred = le.transform(y_pred)
    y_test = le.transform(y_true)

    cnf_matrix = confusion_matrix(y_test, y_pred)
    plt.figure()
    np.set_printoptions(precision=2)
    plot_confusion_matrix(cnf_matrix,
                          classes=le.classes_,
                          normalize=False,
                          title='Confusion matrix, without normalization')
    plt.savefig(
        "figures/SVC/LinearSVC_Confusion matrix, without normalization.png",
        bbox_inches="tight")
Esempio n. 4
0
def test_get_partial_table_no_nans():
    """No section's partial table may contain missing values."""
    prepared = ML_prepare(10)

    for name in ("all", "total_counts", "filaments", "various"):
        table = prepared.get_partial_table(x_section=name)
        assert not table.isnull().values.any()
Esempio n. 5
0
def test_get_partial_table_x_sections_lengths():
    """The "x" part of each section's table has the expected column count."""
    prepared = ML_prepare(6)
    expected_widths = {
        "all": 27,
        "total_counts": 9,
        "filaments": 9,
        "various": 18,
    }

    for name, width in expected_widths.items():
        table = prepared.get_partial_table(x_section=name)
        assert table["x"].shape[1] == width
Esempio n. 6
0
def test_get_partial_table_y_labels():
    """``y_labels`` selects between label columns and raw-value columns."""
    prepared = ML_prepare(0)
    expected_by_flag = (
        (True, ["SV_label", "SVI_label"]),
        (False, ["Settling_velocity", "SVI"]),
    )

    for flag, expected_columns in expected_by_flag:
        table = prepared.get_partial_table(x_section="all", y_labels=flag)
        assert table["y"].columns.to_list() == expected_columns
Esempio n. 7
0
def test_find_closest_date_out_of_range():
    """Searching from the last SVI date of reactor 0 yields a falsy result."""
    prepared = ML_prepare(8)

    # Start from the very last SVI date, so the delayed date falls out of range.
    last_svi_date = prepared._svi_lst[0].index[-1]
    closest = prepared._find_closest_date(bio_reactor_i=0,
                                          date0=last_svi_date)

    assert not closest  # i.e. the lookup reported failure
Esempio n. 8
0
def test_regr_model_func():
    """``regr_model_func`` returns a float score and an sklearn-based model."""
    prepared = ML_prepare(1)
    table = prepared.get_partial_table(x_section="various", y_labels=False)
    features = table.loc[:, "x"]
    targets = table.loc[:, "y"]

    # Use the first key of the models dict as the model to exercise.
    available_models = create_models_dict()
    first_model = list(available_models.keys())[0]

    score, fitted_model = regr_model_func(features, targets, first_model)

    assert isinstance(score, float)
    assert "sklearn" in getattr(fitted_model, "__module__", None)
Esempio n. 9
0
def test_find_closest_date_missing_date():
    """When the exact delayed SVI date is missing, the day before is found."""
    prepared = ML_prepare(8)
    first_micro_date = prepared._micro_lst[0].index[0]
    expected_svi_date = first_micro_date + timedelta(days=prepared.delay)

    # Delete the exact delayed date from the first SVI table.
    prepared._svi_lst[0].drop(expected_svi_date, inplace=True)

    closest = prepared._find_closest_date(bio_reactor_i=0,
                                          date0=first_micro_date)

    # The lookup should have stepped back exactly one day.
    assert closest + timedelta(days=1) == expected_svi_date
Esempio n. 10
0
def test_loop_over_sections_and_y():
    """The scores come back as a tuple with eight entries."""
    prepared = ML_prepare(7)
    first_model = list(create_models_dict().keys())[0]

    scores = loop_over_sections_and_y(prepared, first_model)

    assert isinstance(scores, tuple)
    assert len(scores) == 8
Esempio n. 11
0
def choose_k_value(section: str, label: str, delay: int):
    """
    Plot KNeighbors scores per k and let the user type in the k to use.

    The scores come from ``check_K_values`` on a 75/25 train/test split and
    are drawn as a seaborn strip plot ("k" against "score"). The function is
    interactive: it waits for Enter, shows the plot, then reads k from stdin.

    Parameters
    ----------
    section : str
        microorganisms section - one of: all, total_counts, filaments, various
    label : str
        label - one of: SV_label, SVI_label
    delay : int
        delay between the microscopic test and the SVI test

    return
    ----------
    k : int
        k chosen by the user

    Raises
    ------
    ValueError
        if ``section`` or ``label`` is not one of the accepted values, or if
        the typed k cannot be converted to ``int``
    """
    valid_sections = {'all', 'total_counts', 'filaments', 'various'}
    valid_labels = {'SV_label', 'SVI_label'}
    if section not in valid_sections:
        raise ValueError(
            "Please supply a valid section value: 'all','total_counts', 'filaments', 'various' "
        )
    if label not in valid_labels:
        raise ValueError(
            "Please supply a valid label value: 'SV_label', 'SVI_label' ")

    prepared = ML_prepare(delay)
    table = prepared.get_partial_table(x_section=section, y_labels=True)
    features = table.loc[:, 'x']
    target = table['y', label]
    split = train_test_split(features,
                             target,
                             test_size=0.25,
                             random_state=42)
    X_train, X_test, y_train, y_test = split

    # NOTE(review): 20 is presumably the upper bound of the k values tried
    # by check_K_values - confirm against that helper.
    scores_by_k = check_K_values(20, X_train, y_train, X_test, y_test)

    sns.set()
    sns.stripplot(data=scores_by_k, x='k', y='score', size=10)
    plt.ylim(0, 1)

    print(
        "please look at the graph and choose k value. press enter to continue")
    input()
    plt.show()

    print("please insert k value")
    return int(input())
Esempio n. 12
0
def test_init_types():
    """A fresh ``ML_prepare`` stores the delay and two list attributes."""
    chosen_delay = 3
    prepared = ML_prepare(chosen_delay)

    assert isinstance(prepared, ML_prepare)
    assert prepared._delay == chosen_delay

    # Both table containers must be plain lists.
    svi_tables = prepared._svi_lst
    assert isinstance(svi_tables, list)
    micro_tables = prepared._micro_lst
    assert isinstance(micro_tables, list)
Esempio n. 13
0
def test_init_df_lists():
    """Both table lists hold four ``DataFrame`` objects."""
    prepared = ML_prepare(8)
    svi_tables = prepared._svi_lst
    micro_tables = prepared._micro_lst

    assert len(svi_tables) == 4
    assert len(micro_tables) == 4

    for position in (0, 1, 2, 3):
        assert isinstance(svi_tables[position], pd.DataFrame)
        assert isinstance(micro_tables[position], pd.DataFrame)
Esempio n. 14
0
def get_day3_filaments_svi_data():
    '''
    Build the filaments / SVI data set for a 3-day delay.

    Prepares the data with ``delay=3``, takes the "filaments" partial table
    with unlabeled y values, and splits it into its "x" columns and the
    ("y", "SVI") column.

    return
    --------
    filaments_x: pd.DataFrame
        the "x" part of the filaments table
    filaments_svi: pd.Series
        the ("y", "SVI") column of the same table
    '''
    table = ML_prepare(delay=3).get_partial_table(x_section="filaments",
                                                  y_labels=False)
    return table.loc[:, "x"], table.loc[:, ("y", "SVI")]
Esempio n. 15
0
def create_score_list_Knn(labels: list, sections: list, delay_lst: list,
                          k: int) -> list:
    """
    Create the KNeighbors score list for every label / section / delay
    combination.

    For each combination the data is split 75/25 (fixed random state), a
    ``KNeighborsClassifier`` with ``k`` neighbors is fitted on the training
    part, and the scores on the test part are collected.

    Parameters
    ----------
    labels : list
        list of "y" label column names
    sections : list
        list of microorganisms sections: all, total_counts, filaments and various
    delay_lst : list
        list of delays between the microscopic test and the SVI test
    k : int
        k value for the KNeighbors model, must be > 0

    return
    ----------
    score_delay : list
        one entry per combination, ordered by label, then section, then
        delay; each entry is the list returned by ``score_by_label`` with
        the classifier's overall ``score`` appended

    Raises
    ------
    ValueError
        if ``k`` is not positive
    """

    if k <= 0:
        raise ValueError("Please supply k value > 0 ")

    # The prepared data depends only on the delay, so build it once per
    # delay (lazily) instead of once per label/section/delay combination.
    prepared_by_delay = {}
    score_delay = []
    for label in labels:
        for section in sections:
            for delay in delay_lst:
                if delay not in prepared_by_delay:
                    prepared_by_delay[delay] = ML_prepare(delay)
                data = prepared_by_delay[delay]
                table_xy = data.get_partial_table(x_section=section,
                                                  y_labels=True)
                X = table_xy.loc[:, 'x']
                y = table_xy['y', label]
                X_train, X_test, y_train, y_test = train_test_split(
                    X, y, test_size=0.25, random_state=42)
                knn = KNeighborsClassifier(n_neighbors=k)
                knn.fit(X_train, y_train)
                y_predict = knn.predict(X_test)
                score_label = score_by_label(y_test, y_predict)
                score_label.append(knn.score(X_test, y_test))
                score_delay.append(score_label)
    return score_delay
Esempio n. 16
0
def create_score_list_SVC(labels: list, sections: list,
                          delay_lst: list) -> list:
    """
    Create the LinearSVC score list for every label / section / delay
    combination.

    For each combination the data is split 75/25 (fixed random state), a
    ``StandardScaler`` + ``LinearSVC`` pipeline is fitted on the training
    part, and the scores on the test part are collected.

    Parameters
    ----------
    labels : list
        list of "y" label column names
    sections : list
        list of microorganisms sections: all, total_counts, filaments and various
    delay_lst : list
        list of delays between the microscopic test and the SVI test

    return
    ----------
    score_delay : list
        one entry per combination, ordered by label, then section, then
        delay; each entry is the list returned by ``score_by_label`` with
        the pipeline's overall ``score`` appended
    """

    # The prepared data depends only on the delay, so build it once per
    # delay (lazily) instead of once per label/section/delay combination.
    prepared_by_delay = {}
    score_delay = []
    for label in labels:
        for section in sections:
            for delay in delay_lst:
                if delay not in prepared_by_delay:
                    prepared_by_delay[delay] = ML_prepare(delay)
                data = prepared_by_delay[delay]
                table_xy = data.get_partial_table(x_section=section,
                                                  y_labels=True)
                X = table_xy.loc[:, 'x']
                y = table_xy['y', label]
                X_train, X_test, y_train, y_test = train_test_split(
                    X, y, test_size=0.25, random_state=42)
                clf = make_pipeline(
                    StandardScaler(),
                    LinearSVC(random_state=0, tol=1e-5, max_iter=100000))
                clf.fit(X_train, y_train)
                y_predict = clf.predict(X_test)
                score_label = score_by_label(y_test, y_predict)
                score_label.append(clf.score(X_test, y_test))
                score_delay.append(score_label)
    return score_delay
Esempio n. 17
0
def test_indirect_join_x_y():
    """``delay_table`` holds the same values as ``x`` ("micro") and ``y`` ("svi")."""
    prepared = ML_prepare(3)

    # Indices are reset so only the values (not the index labels) are compared.
    micro_from_join = prepared.delay_table.loc[:, "micro"].reset_index(
        drop=True)
    x_direct = prepared.x.reset_index(drop=True)

    svi_from_join = prepared.delay_table.loc[:, "svi"].reset_index(drop=True)
    y_direct = prepared.y.reset_index(drop=True)

    assert micro_from_join.equals(x_direct)
    assert svi_from_join.equals(y_direct)
Esempio n. 18
0
def test_indirect_create_x_y_bioreactor():
    """Check the x/y date offset and the row count for single bioreactors."""
    delay = 10
    data = ML_prepare(delay)

    delay_time = timedelta(days=data.delay)

    # The first y date of bioreactor "3" must be exactly ``delay`` days after
    # its first x date. The original comparison lacked ``assert`` and so
    # never checked anything.
    first_y_date = data._y.loc["3", :].index[0]
    first_x_date = data._x.loc["3", :].index[0]
    assert first_y_date - first_x_date == delay_time

    # assert that out_of_range dates were removed from x as well
    assert data._micro_lst[3].shape[0] == data._x.loc["4", :].shape[0] + 1
Esempio n. 19
0
def test_create_list_of_tidy_df_by_day_types():
    """The result is a dict keyed by delay whose values are DataFrames."""
    # The unused ``ML_prepare(4)`` instance the test used to build was dropped.
    models_dict = create_models_dict()
    delay_range = range(8, 14)
    scores_models_dict = get_scores_of_all_models(
        models_dict, delay_range=delay_range, print_flag=False
    )
    days_df_dict = create_list_of_tidy_df_by_day(scores_models_dict, delay_range)

    assert isinstance(days_df_dict, dict)
    assert set(days_df_dict.keys()) == set(delay_range)
    assert isinstance(days_df_dict[8], pd.DataFrame)
Esempio n. 20
0
def test_create_list_of_tidy_df_by_day_values():
    """The tidy frame for delay 4 has the expected shape and group sizes."""
    # The unused ``ML_prepare(4)`` instance the test used to build was dropped.
    models_dict = create_models_dict()
    delay_range = range(1, 13)
    scores_models_dict = get_scores_of_all_models(
        models_dict, delay_range=delay_range, print_flag=False
    )
    days_df_dict = create_list_of_tidy_df_by_day(scores_models_dict, delay_range)

    df4 = days_df_dict[4]
    # 40 rows, with every model appearing 8 times, every sv/svi value 20
    # times and every section 10 times.
    assert df4.shape == (40, 4)
    assert all(df4["model"].value_counts() == 8)
    assert all(df4["sv_svi"].value_counts() == 20)
    assert all(df4["section"].value_counts() == 10)
Esempio n. 21
0
def get_scores_of_model(regr_model,
                        model_name: str,
                        delay_range,
                        print_flag: bool = True):
    '''
    Collect, for one model, the scores of every delay in ``delay_range``.

    For each delay the data is prepared with ``ML_prepare`` and scored with
    ``loop_over_sections_and_y``. Optionally prints, per delay, the highest
    score and the name of the first field that reaches it.

    Parameters
    ---------
    regr_model: sklearn model object
    model_name: str
        used only in the printed header
    delay_range: range
    print_flag: bool
        Print or not the best result of every delay

    return
    ---------
    scores_by_delay_dict: dict
        maps each delay to the namedtuple returned by
        ``loop_over_sections_and_y``
    '''
    scores_by_delay_dict = {
        delay: loop_over_sections_and_y(data=ML_prepare(delay=delay),
                                        regr_model=regr_model)
        for delay in delay_range
    }

    if not print_flag:
        return scores_by_delay_dict

    print(f"\nmodel: {model_name}")
    for delay, scores in scores_by_delay_dict.items():
        best_score = max(scores)
        # Names of all fields that reach the best score; the first is shown.
        best_fields = [
            field for field, value in zip(scores._fields, scores)
            if value == best_score
        ]
        print(
            f"max score for delay {delay}\t {best_score:.2f}, for {best_fields[0]}"
        )

    return scores_by_delay_dict
Esempio n. 22
0
def create_section_and_PCA(data: ML_prepare, labled: bool = False):
    """
    Draw a 4x2 grid of PCA plots, one row per section (organism group):
    "all", "filaments", "total_counts", "various", and one column per
    "y" output column.

    Each cell is drawn by the helper function "pca_plot", colored by the
    matching "y" column (labeled or not). The figure is saved under
    "figures/" and then shown.

    Parameters
    ----------
    data: ML_prepare
    labled: bool
        passed on as ``y_labels``; also adds "_labled" to the file name
    """
    fig, ax = plt.subplots(4, 2)
    sections = ("all", "filaments", "total_counts", "various")
    for row, section in enumerate(sections):
        table_xy = data.get_partial_table(x_section=section, y_labels=labled)
        y_cols = table_xy.loc[:, "y"].columns.tolist()
        for col in (0, 1):
            # one plot per y column, placed in the matching grid cell
            pca_plot(table_xy,
                     color_col=y_cols[col],
                     section=section,
                     ax_i=ax[row, col])

    fig.set_figheight(15)
    fig.set_figwidth(15)
    fig.suptitle(
        f"PCA of groups, colored by output, delay = {data.delay} days",
        fontsize=20,
        y=1.02,
    )
    plt.tight_layout()

    fig_name = "PCA_by_groups_labled" if labled else "PCA_by_groups"

    plt.tight_layout()
    fig.savefig(f"figures/{fig_name}.png", dpi=150, bbox_inches="tight")
    plt.show()
Esempio n. 23
0
def loop_over_sections_and_y(data: ML_prepare, regr_model):
    '''
    Score one model on every section / output combination of the given data.

    Sections are visited in the order "all", "filaments", "total_counts",
    "various"; for each one the first two "y" columns of the unlabeled
    partial table are used as targets, giving 8 scores in total:
            "all_sv",
            "all_svi",
            "filaments_sv",
            "filaments_svi",
            "total_counts_sv",
            "total_counts_svi",
            "various_sv",
            "various_svi"
    The scores are packed by ``insert_scores_to_namedtuple``.

    Parameters
    ---------
    data: ML_prepare
        already generated with chosen delay
    regr_model: sklearn model object

    return
    ---------
    tup_scores: namedtuple
    '''
    collected = []
    for section in ("all", "filaments", "total_counts", "various"):
        table = data.get_partial_table(x_section=section, y_labels=False)
        target_names = table.loc[:, "y"].columns.tolist()
        for position in (0, 1):
            features = table.loc[:, "x"]
            target = table.loc[:, ("y", target_names[position])]
            # only the score is kept; the fitted model is discarded
            score, _ = regr_model_func(features, target, regr_model)
            collected.append(score)

    return insert_scores_to_namedtuple(collected)
Esempio n. 24
0
                  fontsize=20,
                  y=1.00)
    sns.set()
    g = sns.stripplot(data=df_weights, x="organism", y="weight", size=10)
    g.set_xlabel("Organism", fontsize=20)
    g.set_ylabel("weight in model", fontsize=20)
    plt.xticks(rotation=90)
    fig3.savefig("figures/Last_model_coefs.png", dpi=150, bbox_inches="tight")
    plt.show()

    return df_weights


if __name__ == "__main__":
    # First glance at the data: PCA plots on a 4-day delay,
    # once with labeled outputs and once with the raw ones.
    data = ML_prepare(delay=4)
    for labled in (True, False):
        create_section_and_PCA(data, labled=labled)

    # Models to compare and the delays (in days) to try.
    models_dict = create_models_dict()
    delay_range = range(1, 13)

    # Score every model on every section, reshape the scores into one tidy
    # table per day, and plot them.
    scores_models_dict = get_scores_of_all_models(
        models_dict,
        delay_range=delay_range,
        print_flag=False,
    )
    days_df_dict = create_list_of_tidy_df_by_day(
        scores_models_dict,
        delay_range,
    )
    days_swarmplot(days_df_dict)
Esempio n. 25
0
def test_indirect_read_and_index_svi_tables():
    """Every one of the four SVI tables is indexed by a ``DatetimeIndex``."""
    delay = 2
    data = ML_prepare(delay)
    for svi_table in data._svi_lst[:4]:
        # NOTE(review): this assertion used to be written twice, verbatim.
        # The copy may have been meant for ``_micro_lst`` - confirm and add
        # that check if so.
        assert isinstance(svi_table.index, pd.DatetimeIndex)
    # Keep the original guarantee that four tables were actually checked.
    assert len(data._svi_lst) >= 4
Esempio n. 26
0
def test_indirect_create_x_y_delayed_length():
    """The x and y tables must have the same number of rows."""
    prepared = ML_prepare(8)

    x_rows = prepared._x.shape[0]
    y_rows = prepared._y.shape[0]
    assert x_rows == y_rows
Esempio n. 27
0
def test_init_df_xy():
    """``_x``, ``_y`` and ``delay_table`` are all pandas DataFrames."""
    prepared = ML_prepare(5)

    x_table = prepared._x
    assert isinstance(x_table, pd.DataFrame)
    y_table = prepared._y
    assert isinstance(y_table, pd.DataFrame)
    joined_table = prepared.delay_table
    assert isinstance(joined_table, pd.DataFrame)