예제 #1
0
    def run_multi_type_regression(self, k_fold, test_size):
        """Run all regression types for every bacterium and conclude the results.

        The bacteria names are read from "bacteria.txt" and the name of the file
        that lists the per-bacterium X/y files is read from a pickle; both live
        under ``self.load_and_save_path``. For each (bacterium, path) pair the
        data is loaded with ``get_adapted_X_y_for_wanted_learning_task`` and
        handed to ``run_all_types_of_regression``; ``conclude_results`` is
        called once after the loop.

        :param k_fold: forwarded to the data loader and embedded in the output
            file names.
        :param test_size: forwarded to the data loader and embedded in the
            output file names.
        :return: None
        """
        task = "run_all_types_of_regression"

        # NOTE: pickle can execute arbitrary code while loading - only point
        # this at files produced by a trusted run of the pipeline.
        pkl_path = os.path.join(self.load_and_save_path,
                                self.reg_X_y_files_pkl_path)
        with open(pkl_path, "rb") as pkl_file:  # closed deterministically
            X_y_files_path = pickle.load(pkl_file)

        file_prefix = task + "_" + str(k_fold) + "_fold_test_size_" + str(
            test_size)
        all_times_all_bact_results_path = os.path.join(
            self.load_and_save_path, file_prefix + "_results_df.csv")
        important_bacteria_reults_path = os.path.join(
            self.load_and_save_path,
            file_prefix + "_significant_bacteria_prediction_results_df.csv")
        conclusionss_path = os.path.join(
            self.load_and_save_path, file_prefix + "_conclusions.csv")

        # bacteria names, one per line
        with open(os.path.join(self.load_and_save_path, "bacteria.txt"),
                  "r") as b_file:
            bacteria = [b.rstrip() for b in b_file.readlines()]

        # create data frames
        create_data_frames(
            all_res_path=all_times_all_bact_results_path,
            important_bacteria_reults_path=important_bacteria_reults_path)

        # per-bacterium X/y file names, one per line
        with open(os.path.join(self.load_and_save_path, X_y_files_path),
                  "r") as file:
            paths = [p.strip('\n') for p in file.readlines()]

        for i, [bact, path] in enumerate(zip(bacteria, paths)):
            print(str(i) + " / " + str(len(bacteria)))
            # re-read the results file on every iteration before handing it
            # to the regression runner
            all_times_all_bacteria_all_models_results_df = pd.read_csv(
                all_times_all_bact_results_path)

            X_trains, X_tests, y_trains, y_tests, name = \
                get_adapted_X_y_for_wanted_learning_task(self.load_and_save_path, path, "regular", k_fold, test_size)
            run_all_types_of_regression(
                X_trains, X_tests, y_trains, y_tests, i,
                all_times_all_bacteria_all_models_results_df,
                all_times_all_bact_results_path, bact)

        # NOTE(review): when there are no bacteria/paths the loop never runs and
        # the data frame below is unbound - confirm whether that case can occur.
        conclude_results(all_times_all_bacteria_all_models_results_df, 0,
                         conclusionss_path)
def run_single_bacteria(tax, bacteria_sorted_by_mse, best_bacteria_path, X_y_files_list_path, NN_or_RNN, results_df_title):
    """Grid-search network hyper-parameters, predicting one bacterium at a time.

    For every combination of STRUCTURE / LEARNING_RATE / REGULARIZATION /
    DROPOUT in the grids below, each bacterium's
    "time_serie_X_y_for_bacteria_number_<i>.csv" is loaded and passed to
    ``run_RNN`` or ``run_NN``. The per-bacterium test/train loss, correlation
    and r2 are averaged, appended to a summary frame, and the summary is
    written to csv after every combination.

    :param tax: (string) dataset folder; inputs are read from it and results
        are written into it.
    :param bacteria_sorted_by_mse: currently unused (only referenced by the
        disabled code noted in the body).
    :param best_bacteria_path: currently unused (only referenced by the
        disabled code noted in the body).
    :param X_y_files_list_path: (string) file inside ``tax``; its first line is
        read but not used afterwards.
    :param NN_or_RNN: (string) "NN" or "RNN"; any other value trains nothing.
    :param results_df_title: (string) base name of the results csv.
    :return: None
    """
    with open(os.path.join(tax, X_y_files_list_path), "r") as file:
        # read for parity with the original flow; the value is not used below
        multi_path = file.readline()
        multi_path = multi_path.strip('\n')

    with open(os.path.join(tax, "bacteria.txt"), "r") as b_file:
        bacteria = [b.rstrip() for b in b_file.readlines()]

    # ------------------------------------ decide on mission ------------------------------------
    nni_ = False  # check model or run nni for real
    GPU = bool(nni_)
    report_loss = True
    report_correlation = not report_loss
    k_fold = False  # run k fold
    RNN = NN_or_RNN == "RNN"
    NN = NN_or_RNN == "NN"
    representing_bacteria = False

    if nni_:
        params = nni.get_next_parameter()
    else:
        # placeholder only: replaced by the grid-search values in the loop below
        params = {"NN_STRUCTURE": "002L050H050H",
                  "TRAIN_TEST_SPLIT": 0.7,
                  "EPOCHS": 200,
                  "LEARNING_RATE": 1e-3,
                  "OPTIMIZER": "Adam",
                  "REGULARIZATION": 0.05,
                  "DROPOUT": 0}

    # ------------------------------------ data loading ------------------------------------
    # run a prediction of a single bacteria at a time
    # consider the average loss and correlation of all runs as the performance measurement

    model_prefix = "NN_" if NN else "RNN_"
    if representing_bacteria:
        title = os.path.join(tax, model_prefix + "10_representing_bacteria_" + results_df_title + ".csv")
    else:
        title = os.path.join(tax, model_prefix + results_df_title + ".csv")

    result_columns = ["BACTERIA", "STRUCTURE", "LEARNING_RATE", "REGULARIZATION", "DROPOUT",
                      "TEST_MSE", "TEST_CORR", "TEST_R2",
                      "TRAIN_MSE", "TRAIN_CORR", "TRAIN_R2"]
    all_results_df = pd.DataFrame(columns=result_columns)

    # Disabled selection of 10 representing bacteria (kept for reference):
    #   previous_results = pd.read_csv(os.path.join(tax, bacteria_sorted_by_mse))
    #   sorted_df = pd.read_csv((os.path.join(tax, best_bacteria_path)))
    #   if representing_bacteria:
    #       indexes = get_10_representing_bacteria_indexes(previous_results)
    indexes = range(len(bacteria))

    for STRUCTURE in ["001L025H", "001L050H", "001L100H", "001L200H", "002L025H025H", "002L050H050H", "002L100H100H"]:
        for LEARNING_RATE in [1e-2]:
            for REGULARIZATION in [0, 0.001, 0.01, 0.1, 0.5, 1, 1.5, 2, 2.5]:
                for DROPOUT in [0, 0.001, 0.01, 0.1, 0.2]:
                    epochs = 70
                    params = {"STRUCTURE": STRUCTURE,
                              "TRAIN_TEST_SPLIT": 0.7,
                              "EPOCHS": epochs,
                              "LEARNING_RATE": LEARNING_RATE,
                              "OPTIMIZER": "Adam",
                              "REGULARIZATION": REGULARIZATION,
                              "DROPOUT": DROPOUT}
                    print(params)

                    # per-bacterium results for the current configuration
                    df = pd.DataFrame(columns=result_columns)

                    for i in indexes:  # for loop every single bacteria
                        print(str(i) + " / " + str(len(indexes)))
                        path = "time_serie_X_y_for_bacteria_number_" + str(i) + ".csv"
                        X, y, missing_values, name = get_adapted_X_y_for_wanted_learning_task(tax, path, "time_serie")
                        # X is indexed as a 3-D array; the axes are named by the variables below
                        NUMBER_OF_SAMPLES = X.shape[0]
                        NUMBER_OF_TIME_POINTS = X.shape[1]
                        NUMBER_OF_BACTERIA = X.shape[2]

                        # ------------------------------------ send to network ------------------------------------
                        if RNN:
                            res_map = run_RNN(X, y, missing_values, name, tax, params,
                                              NUMBER_OF_SAMPLES, NUMBER_OF_TIME_POINTS, NUMBER_OF_BACTERIA,
                                              GPU_flag=GPU, task_id=str(i))
                            rnn_loss = res_map["TEST"]["loss"]
                            rnn_corr = res_map["TEST"]["corr"]
                            rnn_r2 = res_map["TEST"]["r2"]

                            t_rnn_loss = res_map["TRAIN"]["loss"]
                            t_rnn_corr = res_map["TRAIN"]["corr"]
                            t_rnn_r2 = res_map["TRAIN"]["r2"]

                            # report the RNN metrics (the nn_* names only exist in the NN branch)
                            print("loss=" + str(rnn_loss))
                            print("corr=" + str(rnn_corr))
                            print("r2=" + str(rnn_r2))

                            df.loc[len(df)] = [int(i),
                                               STRUCTURE, LEARNING_RATE, REGULARIZATION, DROPOUT,
                                               rnn_loss, rnn_corr, rnn_r2,
                                               t_rnn_loss, t_rnn_corr, t_rnn_r2]

                        if NN:
                            # flatten (samples, time points) into a single axis for the feed-forward net
                            flat_time_points_values_num = NUMBER_OF_SAMPLES * NUMBER_OF_TIME_POINTS

                            X = X.reshape(flat_time_points_values_num, NUMBER_OF_BACTERIA)
                            y = y.reshape(flat_time_points_values_num)
                            missing_values = missing_values.reshape(flat_time_points_values_num)

                            # for every sample, the list of its row indexes in the flattened arrays
                            person_indexes = np.linspace(0, flat_time_points_values_num - 1, flat_time_points_values_num).\
                                reshape(NUMBER_OF_SAMPLES, NUMBER_OF_TIME_POINTS).astype(int).tolist()

                            res_map = run_NN(X, y, missing_values, params, name, tax,
                                             NUMBER_OF_SAMPLES, NUMBER_OF_TIME_POINTS, NUMBER_OF_BACTERIA,
                                             GPU_flag=GPU, k_fold=k_fold, task_id=str(i), person_indexes=person_indexes)
                            nn_loss = res_map["TEST"]["loss"]
                            nn_corr = res_map["TEST"]["corr"]
                            nn_r2 = res_map["TEST"]["r2"]

                            t_nn_loss = res_map["TRAIN"]["loss"]
                            t_nn_corr = res_map["TRAIN"]["corr"]
                            t_nn_r2 = res_map["TRAIN"]["r2"]

                            print(int(i))
                            df.loc[len(df)] = [int(i),
                                               STRUCTURE, LEARNING_RATE, REGULARIZATION, DROPOUT,
                                               nn_loss, nn_corr, nn_r2,
                                               t_nn_loss, t_nn_corr, t_nn_r2]

                    if nni_:
                        if report_loss:
                            nni.report_final_result(df["TEST_MSE"].mean())
                        elif report_correlation:
                            nni.report_final_result(df["TEST_CORR"].mean())

                    # one summary row per configuration; saved right away so partial runs keep their results
                    all_results_df.loc[len(all_results_df)] = ["average", STRUCTURE, LEARNING_RATE, REGULARIZATION, DROPOUT,
                                                               df["TEST_MSE"].mean(), df["TEST_CORR"].mean(), df["TEST_R2"].mean(),
                                                               df["TRAIN_MSE"].mean(), df["TRAIN_CORR"].mean(), df["TRAIN_R2"].mean()]
                    all_results_df.to_csv(title, index=False)

    print(not all_results_df.empty)
    if len(all_results_df) > 0:
        # NOTE(review): `title` already starts with `tax`, so this join gives
        # tax/NNI/tax/... for a relative `tax` (and plain `title` for an
        # absolute one) - confirm the intended output location.
        all_results_df.to_csv(os.path.join(tax, "NNI", title), index=False)
예제 #3
0
def predict_interaction_network_structure_using_change_in_data(
        bacteria_list, folder, CHANGE=0.5):
    """
    Record how each regressor's predictions react to a shift in one input bacterium.

    For every regressor in ``reg_name_to_func_map`` a csv named
    "<regressor>_interaction_network_change_in_data_df.csv" is created in
    ``folder``. For every predicted bacterium, every bacterium to change and
    every train/test split, the regressor is called three times: with the
    untouched test set, with the chosen column raised by ``CHANGE`` and with it
    lowered by ``CHANGE``. Each call's last return value is stored as a
    space-separated string in the "Y" column.

    The csv is rewritten after each predicted bacterium, so the statistical
    comparison of the stored prediction distributions is left to a later step.

    :param bacteria_list: (list) list of bacteria names.
    :param folder: (string) main dataset folder "DATASET/tax=x"
    :param CHANGE: (float) size of change in bacteria values.
    :return: None, the results are written to csv files.
    """
    k_fold = 1
    test_size = 0.3
    bacteria_number_list = list(range(len(bacteria_list)))
    result_columns = ["BACTERIA", "CHANGED_BACTERIA", "CHANGE", "Y"]
    total_bacteria = str(len(bacteria_list))

    for regressor_name, regressor_clf in reg_name_to_func_map.items():
        print(regressor_name)
        csv_name = regressor_name.replace(" ", "_") + \
            "_interaction_network_change_in_data_df.csv"
        df_title = os.path.join(folder, csv_name)
        # start from an empty results file holding only the header
        pd.DataFrame(columns=result_columns).to_csv(df_title, index=False)

        for b_i, bacteria_num in enumerate(bacteria_number_list):
            print(str(b_i) + " / " + total_bacteria)
            df = pd.read_csv(df_title)
            path = "X_y_for_bacteria_number_" + str(bacteria_num) + ".csv"
            X_trains, X_tests, y_trains, y_tests, name = \
                get_adapted_X_y_for_wanted_learning_task(folder, path, "regular", k_fold, test_size)

            for bacteria_to_change_num in bacteria_number_list:
                splits = zip(X_trains, X_tests, y_trains, y_tests)
                for X_train, X_test, y_train, y_test in splits:
                    # two independent copies of the test set, shifted up / down in one column
                    X_positive_change = copy.deepcopy(X_test)
                    X_negative_change = copy.deepcopy(X_test)

                    for row_idx, _ in enumerate(X_positive_change):
                        X_positive_change[row_idx][
                            bacteria_to_change_num] += CHANGE
                    for row_idx, _ in enumerate(X_negative_change):
                        X_negative_change[row_idx][
                            bacteria_to_change_num] -= CHANGE

                    # (input fed to the regressor, CHANGED_BACTERIA value, CHANGE label)
                    variants = [
                        (X_test, int(-1), "no change"),
                        (X_positive_change, int(bacteria_to_change_num),
                         "plus " + str(CHANGE)),
                        (X_negative_change, int(bacteria_to_change_num),
                         "minus " + str(CHANGE)),
                    ]

                    # regression
                    for X_variant, changed_num, change_label in variants:
                        _, _, _, _, _, _, _, y_pred = regressor_clf(
                            X_train, X_variant, y_train, y_test)
                        y_str = "".join(str(val) + " " for val in y_pred)
                        df.loc[len(df)] = [
                            int(bacteria_num), changed_num, change_label,
                            y_str
                        ]

            df.to_csv(df_title, index=False)
예제 #4
0
def predict_interaction_network_structure_using_change_in_data_auc_calc_trail(
        bacteria_list, folder, data_set_name, CHANGE=0.5):
    """
    Score the change-in-data interaction network with ROC AUC.

    For every regressor in ``reg_name_to_func_map`` and every pair
    (predicted bacterium, changed bacterium), using the first split only:

    1. "TRAIN" pass - ``regressor_clf(X_train, X, y_train, y_test)`` is called
       with the untouched test set and with the changed column shifted by
       +/- ``CHANGE``. The predictions are stored in the per-regressor csv, and
       the pair is labelled 1 when both Mann-Whitney U p-values are below
       0.01, otherwise 0.
    2. "TEST" pass - the same with the roles of the train and test sets
       swapped; ``(1 / U_plus, 1 / U_minus)`` is kept as the continuous score.

    The ROC AUC of the labels against the scores is appended, once for the
    positive and once for the negative change, to "all_Networks_AUC.csv" in
    the current working directory. If the AUC step fails, the error and some
    diagnostics are printed and the next regressor is processed.

    :param bacteria_list: (list) list of bacteria names.
    :param folder: (string) main dataset folder.
    :param data_set_name: (string) dataset name written to the AUC file.
    :param CHANGE: (float) size of change in bacteria values.
    :return: None
    """
    k_fold = 1
    test_size = 0.3
    p_value = 0.01
    bacteria_number_list = list(range(len(bacteria_list)))
    banner = "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!"

    for regressor_name, regressor_clf in reg_name_to_func_map.items():
        # create a df that saves a binary value 1/0 => interaction/no interaction according to the train set
        train_binary_significant_df = pd.DataFrame(columns=bacteria_list)
        # create a df that saves the continuous b value of each bacteria according to the test set
        test_b_df = pd.DataFrame(columns=bacteria_list)

        print(regressor_name)
        df_title = os.path.join(
            folder,
            regressor_name.replace(" ", "_") +
            "_interaction_network_change_in_data_df.csv")
        df = pd.DataFrame(
            columns=["BACTERIA", "CHANGED_BACTERIA", "CHANGE", "Y"])
        df.to_csv(df_title, index=False)

        for b_i, bacteria_num in enumerate(
                bacteria_number_list):  # for each bacteria
            print(str(b_i) + " / " + str(len(bacteria_list)))
            train_binary_significant_for_b_i = []
            test_1_u_score_for_b_i = []
            df = pd.read_csv(df_title)
            path = "X_y_for_bacteria_number_" + str(bacteria_num) + ".csv"
            X_trains, X_tests, y_trains, y_tests, name = \
                get_adapted_X_y_for_wanted_learning_task(folder, path, "regular", k_fold, test_size)

            # only the first split is used; it does not depend on the changed bacterium
            X_train, X_test, y_train, y_test = \
                X_trains[0], X_tests[0], y_trains[0], y_tests[0]

            for bacteria_to_change_num in bacteria_number_list:
                # TRAIN
                X_test_positive_change = copy.deepcopy(X_test)
                X_test_negative_change = copy.deepcopy(X_test)

                for s_i, _ in enumerate(X_test_positive_change):
                    X_test_positive_change[s_i][
                        bacteria_to_change_num] += CHANGE
                for s_i, _ in enumerate(X_test_negative_change):
                    X_test_negative_change[s_i][
                        bacteria_to_change_num] -= CHANGE

                # regression
                _, _, _, _, _, _, _, y_pred_no_change = regressor_clf(
                    X_train, X_test, y_train, y_test)
                df.loc[len(df)] = [
                    int(bacteria_num),
                    int(-1), "no change",
                    "".join(str(val) + " " for val in y_pred_no_change)
                ]
                _, _, _, _, _, _, _, y_pred_pos_change = regressor_clf(
                    X_train, X_test_positive_change, y_train, y_test)
                df.loc[len(df)] = [
                    int(bacteria_num),
                    int(bacteria_to_change_num), "plus " + str(CHANGE),
                    "".join(str(val) + " " for val in y_pred_pos_change)
                ]
                _, _, _, _, _, _, _, y_pred_neg_change = regressor_clf(
                    X_train, X_test_negative_change, y_train, y_test)
                df.loc[len(df)] = [
                    int(bacteria_num),
                    int(bacteria_to_change_num), "minus " + str(CHANGE),
                    "".join(str(val) + " " for val in y_pred_neg_change)
                ]

                pos_u, pos_u_test_p_val = mannwhitneyu(y_pred_no_change,
                                                       y_pred_pos_change)
                neg_u, neg_u_test_p_val = mannwhitneyu(y_pred_no_change,
                                                       y_pred_neg_change)

                # an interaction is recorded only when both directions are significant
                both_significant = (pos_u_test_p_val < p_value
                                    and neg_u_test_p_val < p_value)
                train_binary_significant_for_b_i.append(
                    1 if both_significant else 0)

                # TEST
                X_train_positive_change = copy.deepcopy(X_train)
                X_train_negative_change = copy.deepcopy(X_train)

                for s_i, _ in enumerate(X_train_positive_change):
                    X_train_positive_change[s_i][
                        bacteria_to_change_num] += CHANGE
                for s_i, _ in enumerate(X_train_negative_change):
                    X_train_negative_change[s_i][
                        bacteria_to_change_num] -= CHANGE

                # regression, with the roles of the train and test sets swapped
                _, _, _, _, _, _, _, y_pred_no_change = regressor_clf(
                    X_test, X_train, y_test, y_train)

                _, _, _, _, _, _, _, y_pred_pos_change = regressor_clf(
                    X_test, X_train_positive_change, y_test, y_train)

                _, _, _, _, _, _, _, y_pred_neg_change = regressor_clf(
                    X_test, X_train_negative_change, y_test, y_train)

                pos_u, pos_u_test_p_val = mannwhitneyu(y_pred_no_change,
                                                       y_pred_pos_change)
                neg_u, neg_u_test_p_val = mannwhitneyu(y_pred_no_change,
                                                       y_pred_neg_change)

                # continuous score: the inverse of the U statistic
                test_1_u_score_for_b_i.append((1 / pos_u, 1 / neg_u))

            # save bacteria b_i results
            df.to_csv(df_title, index=False)
            train_binary_significant_df.loc[
                len(train_binary_significant_df
                    )] = train_binary_significant_for_b_i
            test_b_df.loc[len(test_b_df)] = test_1_u_score_for_b_i

        # calculate AUC on the flatten data frame
        # every cell holds a (positive change, negative change) tuple
        pos_b = np.array(
            [float(val[0]) for row in test_b_df.values for val in row])
        neg_b = np.array(
            [float(val[1]) for row in test_b_df.values for val in row])

        # rebuild the array from a plain list, as the original loop did
        train_binary_significant_values = np.array(
            list(np.array(train_binary_significant_df.values).flatten()))
        try:
            pos_auc = roc_auc_score(train_binary_significant_values, pos_b)
            neg_auc = roc_auc_score(train_binary_significant_values, neg_b)

            Networks_AUC_df = pd.read_csv("all_Networks_AUC.csv")
            Networks_AUC_df.loc[len(Networks_AUC_df)] = [
                "positive change", regressor_name, data_set_name, test_size,
                k_fold, pos_auc,
                datetime.utcnow().strftime("%d/%m/%Y %H:%M:%S")
            ]
            Networks_AUC_df.loc[len(Networks_AUC_df)] = [
                "negative change", regressor_name, data_set_name, test_size,
                k_fold, neg_auc,
                datetime.utcnow().strftime("%d/%m/%Y %H:%M:%S")
            ]
            Networks_AUC_df.to_csv("all_Networks_AUC.csv", index=False)

        except Exception as err:
            # Best effort: report why the AUC step failed and continue with
            # the next regressor. KeyboardInterrupt / SystemExit are no longer
            # swallowed here.
            print(banner)
            print("AUC calculation failed: " + repr(err))
            print(len(train_binary_significant_values))
            print(set(train_binary_significant_values))
            print(len(pos_b))
            print(len(set(pos_b)))
            print(len(neg_b))
            print(len(set(neg_b)))
            print(banner)
예제 #5
0
def run_regression_coef_net(reg_type, k_fold, test_size, folder,
                            data_set_name, number_of_bacteria=100):
    """
    Build the coefficient-based interaction network and record its AUC.

    For every bacterium index the file "X_y_for_bacteria_number_<i>.csv" is
    loaded from ``folder`` and passed to
    ``predict_interaction_network_structure_using_coeffs``. The binary labels
    and continuous scores it returns are flattened over all bacteria and
    compared with ``roc_auc_score``.

    :param reg_type: (string) regression type name; spaces are replaced by "_"
    in the output file names and the name is forwarded to the predictor.
    :param k_fold: (int) returns K fold of the split to train and test
    :param test_size: (float) the size of the test for the split to train and test
    :param folder: (string) files folder
    :param data_set_name: (string) dataset name - same as in folders.
    :param number_of_bacteria: (int) how many bacteria files to process;
    defaults to 100, the value that used to be hard-coded.
    :return: no object is returned, AUC of the net is saved to "all_Networks_AUC.csv" file in the main folder
    """
    task = "interaction_network_structure_coef"
    file_prefix = reg_type.replace(" ", "_") + "_" + task + "_" + str(
        k_fold) + "_fold_test_size_" + str(test_size)
    all_times_all_bact_results_path = os.path.join(
        folder, file_prefix + "_results_df.csv")
    important_bacteria_reults_path = os.path.join(
        folder,
        file_prefix + "_significant_bacteria_prediction_results_df.csv")

    # bacteria are identified by their running index
    bacteria = list(range(number_of_bacteria))

    create_data_frames(
        all_res_path=all_times_all_bact_results_path,
        important_bacteria_reults_path=important_bacteria_reults_path)

    paths = ["X_y_for_bacteria_number_" + str(i) + ".csv" for i in bacteria]

    train_binary_significant_from_all_bacteria = []
    test_b_list_from_all_bacteria = []

    for i, [bact, path] in enumerate(zip(bacteria, paths)):
        print(str(i) + " / " + str(len(bacteria)))

        # re-read both result files on every iteration before passing them on
        all_times_all_bacteria_all_models_results_df = pd.read_csv(
            all_times_all_bact_results_path)
        important_bacteria_reults_df = pd.read_csv(
            important_bacteria_reults_path)
        X_trains, X_tests, y_trains, y_tests, name = \
            get_adapted_X_y_for_wanted_learning_task(folder, path, "regular", k_fold, test_size)

        results_df, train_binary_significant_list, test_b_list = \
            predict_interaction_network_structure_using_coeffs(X_trains, X_tests, y_trains, y_tests, i,
                                                               all_times_all_bacteria_all_models_results_df,
                                                               all_times_all_bact_results_path,
                                                               important_bacteria_reults_df,
                                                               important_bacteria_reults_path, bact, bacteria,
                                                               reg_type)
        # save the flattened labels (y true) and scores (y pred) of this bacterium
        train_binary_significant_from_all_bacteria.append(
            list(np.array(train_binary_significant_list).flat))
        test_b_list_from_all_bacteria.append(list(np.array(test_b_list).flat))

    # flatten over all bacteria before computing a single AUC
    train_binary_significant_from_all_bacteria = list(
        np.array(train_binary_significant_from_all_bacteria).flat)
    test_b_list_from_all_bacteria = list(
        np.array(test_b_list_from_all_bacteria).flat)
    total_auc = roc_auc_score(
        y_true=train_binary_significant_from_all_bacteria,
        y_score=test_b_list_from_all_bacteria)

    # append one row to the shared AUC file in the current working directory
    Networks_AUC_df = pd.read_csv("all_Networks_AUC.csv")
    Networks_AUC_df.loc[len(Networks_AUC_df)] = [
        "coefficients", reg_type, data_set_name, test_size, k_fold, total_auc,
        datetime.utcnow().strftime("%d/%m/%Y %H:%M:%S")
    ]
    Networks_AUC_df.to_csv("all_Networks_AUC.csv", index=False)
예제 #6
0
def _nn_predictions_to_str(predictions):
    """Serialize predictions for the CSV log: first component of each prediction, each followed by one space."""
    return "".join(str(val.detach().numpy()[0]) + " " for val in predictions)


def _nn_predict_with_shift(clf, X, bacterium_idx, delta):
    """Return ``clf`` predictions for a copy of ``X`` in which entry ``bacterium_idx`` of every row is shifted by ``delta``."""
    X_shifted = copy.deepcopy(X)  # never modify the caller's data
    for row in X_shifted:
        row[bacterium_idx] += delta
    return clf.predict(torch.FloatTensor(X_shifted))


def _nn_fit_and_load_base_model(X, y, params, params_tag, name, folder, b_i,
                                GPU, k_fold):
    """Run ``run_NN`` on the unmodified (X, y) with ``save_model=True`` and load the base model of bacterium ``b_i``."""
    n_samples = X.shape[0]
    n_bacteria = X.shape[1]
    # One flag per sample, all set to 1.
    missing_values = np.ones(n_samples, dtype=int)
    run_NN(X,
           y,
           missing_values,
           params,
           name,
           folder,
           n_samples,
           None,  # NUMBER_OF_TIME_POINTS: not used for the "regular" task
           n_bacteria,
           save_model=True,
           GPU_flag=GPU,
           k_fold=k_fold,
           task_id="base_" + str(b_i) + "_model",
           person_indexes=None)

    # NOTE(review): assumes run_NN stores the model under this exact path - confirm.
    model_path = os.path.join(
        folder, "trained_models",
        params_tag + "_base_" + str(b_i) + "_model_model")
    out_dim = 1 if len(y.shape) == 1 else y.shape[1]
    clf = bacteria_network_clf({
        "NN_input_dim": n_bacteria,
        # characters 4..6 of STRUCTURE hold the first hidden size ("001L200H" -> 200)
        "NN_hidden_dim_1": int(params["STRUCTURE"][4:7]),
        "NN_output_dim": out_dim
    })
    clf.load(model_path)
    return clf


def run_nn_bacteria_network(tax, data_set_name):
    """
    Infer a bacteria interaction network from neural-network predictions.

    A neural network has no visible coefficients, so the influence of one
    bacterium on another is estimated by perturbing the input:

    For every bacterium ``b_i`` a model is trained (``run_NN``) on the data
    loaded from "X_y_for_bacteria_number_<b_i>.csv" and loaded back as a
    classifier. The model predicts once on the unmodified input and then, for
    every bacterium, on a copy of the input where that bacterium's value is
    shifted by ``+CHANGE`` and by ``-CHANGE`` (``CHANGE`` is a module-level
    constant, you can alter it). Each shifted prediction set is compared with
    the unmodified one using the Mann-Whitney U test.

    * Train split: a pair is labelled 1 (interaction) when both the plus and
      the minus test have a p-value below 0.001, otherwise 0. All train
      predictions are also logged to the results csv.
    * Test split: the pair is scored with ``(1 / U_plus, 1 / U_minus)``.

    Finally the train labels are used as ``y_true`` and the test scores as
    ``y_score`` of ``roc_auc_score``; one "positive change" row and one
    "negative change" row are appended to "all_Networks_AUC.csv" (relative
    path). If that final step fails, the error and some diagnostics are
    printed and the function returns normally.

    :param tax: (string) main dataset folder "DATASET/tax=x"; must contain
    "bacteria.txt". Results go to the sub folder "interaction_network".
    :param data_set_name: (string) written as-is to the AUC summary rows
    :return: doesn't return an object, creates a csv file with all the results
    csv name = "interaction_network_" + params.__str__().replace(" ", "").replace("'", "") + "_df.csv"
    To determine which bacteria have interactions and create a visual graph use "built_network_form_file.py"
    and send it the results csv path
    """

    # sub folder path for network results
    folder = os.path.join(tax, "interaction_network")
    os.makedirs(folder, exist_ok=True)

    # ------------------------------------ decide on mission ------------------------------------
    nni_ = False  # True: take the hyper-parameters from nni, False: use the fixed set below
    GPU = nni_
    k_fold = False  # forwarded to run_NN and recorded in the AUC summary
    p_value = 0.001  # significance threshold of the U tests on the train split
    test_size = 0.3  # only recorded in the AUC summary; not passed to the data loader here

    if nni_:
        params = nni.get_next_parameter()
    else:
        params = {
            "STRUCTURE": "001L200H",
            "TRAIN_TEST_SPLIT": 0.7,
            "EPOCHS": 70,
            "LEARNING_RATE": 1e-3,
            "OPTIMIZER": "Adam",
            "REGULARIZATION": 0.01,
            "DROPOUT": 0.1
        }
    # compact text form of the parameters, part of every output file name
    params_tag = str(params).replace(" ", "").replace("'", "")

    with open(os.path.join(tax, "bacteria.txt"), "r") as b_file:
        bacteria = [line.rstrip() for line in b_file]

    bacteria_number_list = range(len(bacteria))

    # ------------------------------------ data loading ------------------------------------
    df_title = os.path.join(folder,
                            "interaction_network_" + params_tag + "_df.csv")

    # start from an empty log file; it is re-read and re-written once per bacterium
    pd.DataFrame(columns=["BACTERIA", "CHANGED_BACTERIA", "CHANGE", "Y"
                          ]).to_csv(df_title, index=False)

    # binary value 1/0 => interaction/no interaction according to the train set
    train_binary_significant_df = pd.DataFrame(columns=bacteria)
    # (1 / U_plus, 1 / U_minus) score of each bacteria pair according to the test set
    test_b_df = pd.DataFrame(columns=bacteria)

    for b_i in bacteria_number_list:  # for each predicted bacterium
        df = pd.read_csv(df_title)
        path = "X_y_for_bacteria_number_" + str(b_i) + ".csv"
        X_trains, X_tests, y_trains, y_tests, name = \
            get_adapted_X_y_for_wanted_learning_task(tax, path, "regular", k_fold=1)
        X_train, X_test = X_trains[0], X_tests[0]
        y_train, y_test = y_trains[0], y_tests[0]

        # ------------------------------------ TRAIN ------------------------------------
        clf = _nn_fit_and_load_base_model(X_train, y_train, params,
                                          params_tag, name, folder, b_i, GPU,
                                          k_fold)
        y_pred_no_change = clf.predict(torch.FloatTensor(X_train))
        df.loc[len(df)] = [
            b_i, -1, "no change",
            _nn_predictions_to_str(y_pred_no_change)
        ]
        baseline = y_pred_no_change.detach().numpy()

        train_binary_significant_for_b_i = []
        for changed_idx in bacteria_number_list:  # change each bacterium
            y_pred_pos_change = _nn_predict_with_shift(clf, X_train,
                                                       changed_idx, CHANGE)
            df.loc[len(df)] = [
                b_i, changed_idx, "plus " + str(CHANGE),
                _nn_predictions_to_str(y_pred_pos_change)
            ]

            y_pred_neg_change = _nn_predict_with_shift(clf, X_train,
                                                       changed_idx, -CHANGE)
            df.loc[len(df)] = [
                b_i, changed_idx, "minus " + str(CHANGE),
                _nn_predictions_to_str(y_pred_neg_change)
            ]

            _, pos_u_test_p_val = mannwhitneyu(
                baseline, y_pred_pos_change.detach().numpy())
            _, neg_u_test_p_val = mannwhitneyu(
                baseline, y_pred_neg_change.detach().numpy())

            # interaction only when BOTH directions change the prediction significantly
            both_significant = (pos_u_test_p_val < p_value
                                and neg_u_test_p_val < p_value)
            train_binary_significant_for_b_i.append(
                1 if both_significant else 0)

        # ------------------------------------ TEST ------------------------------------
        # NOTE(review): run_NN is called again on the test split with the same
        # task_id, so the scores below come from a model fitted on the test
        # data - confirm this is intended.
        clf = _nn_fit_and_load_base_model(X_test, y_test, params, params_tag,
                                          name, folder, b_i, GPU, k_fold)
        baseline = clf.predict(torch.FloatTensor(X_test)).detach().numpy()

        test_1_u_score_for_b_i = []
        for changed_idx in bacteria_number_list:  # change each bacterium
            y_pred_pos_change = _nn_predict_with_shift(clf, X_test,
                                                       changed_idx, CHANGE)
            y_pred_neg_change = _nn_predict_with_shift(clf, X_test,
                                                       changed_idx, -CHANGE)

            pos_u, _ = mannwhitneyu(baseline,
                                    y_pred_pos_change.detach().numpy())
            neg_u, _ = mannwhitneyu(baseline,
                                    y_pred_neg_change.detach().numpy())

            test_1_u_score_for_b_i.append((1 / pos_u, 1 / neg_u))

        # save bacteria b_i results
        df.to_csv(df_title, index=False)
        train_binary_significant_df.loc[len(
            train_binary_significant_df)] = train_binary_significant_for_b_i
        test_b_df.loc[len(test_b_df)] = test_1_u_score_for_b_i

    # ------------------------------------ AUC on the flattened frames ------------------------------------
    # every cell of test_b_df holds a (positive change, negative change) pair
    score_pairs = [pair for row in test_b_df.values for pair in row]
    pos_b = np.array([float(pair[0]) for pair in score_pairs])
    neg_b = np.array([float(pair[1]) for pair in score_pairs])

    # rebuilt from a plain list so NumPy infers the dtype from the values
    train_binary_significant_values = np.array(
        list(np.array(train_binary_significant_df.values).flatten()))

    try:
        pos_auc = roc_auc_score(train_binary_significant_values, pos_b)
        neg_auc = roc_auc_score(train_binary_significant_values, neg_b)

        Networks_AUC_df = pd.read_csv("all_Networks_AUC.csv")
        for change_label, auc in (("positive change", pos_auc),
                                  ("negative change", neg_auc)):
            Networks_AUC_df.loc[len(Networks_AUC_df)] = [
                change_label, "neural network " + str(params), data_set_name,
                test_size, k_fold, auc,
                datetime.utcnow().strftime("%d/%m/%Y %H:%M:%S")
            ]
        Networks_AUC_df.to_csv("all_Networks_AUC.csv", index=False)

    except Exception as err:
        # Best effort: the per-bacterium csv is already written, so report the
        # failure with diagnostics instead of aborting. Unlike a bare
        # ``except:`` this lets KeyboardInterrupt / SystemExit propagate.
        print(
            "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!"
        )
        print("AUC step failed: " + repr(err))
        print(len(train_binary_significant_values))
        print(set(train_binary_significant_values))
        print(len(pos_b))
        print(len(set(pos_b)))
        print(len(neg_b))
        print(len(set(neg_b)))
        print(
            "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!"
        )
예제 #7
0
    def run_regression_coef_net(self, reg_type, k_fold, test_size):
        """
        Score a coefficient-based interaction network over all bacteria.

        For every (bacterium, X/y file) pair the data is loaded with
        ``get_adapted_X_y_for_wanted_learning_task`` and passed to
        ``predict_interaction_network_structure_using_coeffs``. The second and
        third values it returns are flattened and pooled over all bacteria;
        a single ROC AUC is computed with the pooled second values as
        ``y_true`` and the pooled third values as ``y_score``. One row with
        that AUC is appended to "all_Networks_AUC.csv" (relative path).

        :param reg_type: (string) regression type; forwarded to the predictor
        and used (spaces replaced by "_") as prefix of the result file names
        :param k_fold: forwarded to the data loader, part of the result file
        names and recorded in the AUC row
        :param test_size: forwarded to the data loader, part of the result
        file names and recorded in the AUC row
        :return: doesn't return an object
        """
        task = "interaction_network_structure_coef"
        base_dir = self.load_and_save_path

        # Close the pickle file deterministically instead of leaking the handle.
        # NOTE(review): unpickling can execute arbitrary code; presumably this
        # file is produced by the project itself - confirm.
        with open(os.path.join(base_dir, self.reg_X_y_files_pkl_path),
                  "rb") as pkl_file:
            X_y_files_path = pickle.load(pkl_file)

        # common prefix of both result files
        file_prefix = (reg_type.replace(" ", "_") + "_" + task + "_" +
                       str(k_fold) + "_fold_test_size_" + str(test_size))
        all_times_all_bact_results_path = os.path.join(
            base_dir, file_prefix + "_results_df.csv")
        important_bacteria_reults_path = os.path.join(
            base_dir,
            file_prefix + "_significant_bacteria_prediction_results_df.csv")

        with open(os.path.join(base_dir, "bacteria.txt"), "r") as b_file:
            bacteria = [line.rstrip() for line in b_file]

        create_data_frames(
            all_res_path=all_times_all_bact_results_path,
            important_bacteria_reults_path=important_bacteria_reults_path)

        with open(os.path.join(base_dir, X_y_files_path), "r") as paths_file:
            paths = [line.strip('\n') for line in paths_file]

        # pooled over all bacteria, kept flat so that bacteria contributing a
        # different number of values cannot produce a ragged array
        train_binary_significant_from_all_bacteria = []
        test_b_list_from_all_bacteria = []

        for i, (bact, path) in enumerate(zip(bacteria, paths)):
            print(str(i) + " / " + str(len(bacteria)))

            # both frames are re-read from disk on every iteration
            all_times_all_bacteria_all_models_results_df = pd.read_csv(
                all_times_all_bact_results_path)
            important_bacteria_reults_df = pd.read_csv(
                important_bacteria_reults_path)
            X_trains, X_tests, y_trains, y_tests, _ = \
                get_adapted_X_y_for_wanted_learning_task(base_dir, path, "regular", k_fold, test_size)

            _, train_binary_significant_list, test_b_list = \
                predict_interaction_network_structure_using_coeffs(X_trains, X_tests, y_trains, y_tests, i,
                                                                   all_times_all_bacteria_all_models_results_df,
                                                                   all_times_all_bact_results_path,
                                                                   important_bacteria_reults_df,
                                                                   important_bacteria_reults_path, bact, bacteria,
                                                                   reg_type)
            # save bacteria y true and y pred
            train_binary_significant_from_all_bacteria.extend(
                np.array(train_binary_significant_list).flat)
            test_b_list_from_all_bacteria.extend(np.array(test_b_list).flat)

        total_auc = roc_auc_score(
            y_true=train_binary_significant_from_all_bacteria,
            y_score=test_b_list_from_all_bacteria)

        Networks_AUC_df = pd.read_csv("all_Networks_AUC.csv")
        # first component of the load/save path is recorded as the data set
        data_set = base_dir.split(os.path.sep)[0]
        Networks_AUC_df.loc[len(Networks_AUC_df)] = [
            "coefficients", reg_type, data_set, test_size, k_fold, total_auc,
            datetime.utcnow().strftime("%d/%m/%Y %H:%M:%S")
        ]
        Networks_AUC_df.to_csv("all_Networks_AUC.csv", index=False)