Example #1

import os
import pickle

import matplotlib.pyplot as plt
import numpy as np

# prep_data, compute_time_for_censored_using_similarity_matrix, stats_input,
# time_series_analysis_rnn and time_series_using_xgboost, as well as the
# module-level settings (USE_CENSORED, USE_SIMILARITY, use_recorded,
# record_inputs, RECORD, SCRIPT_DIR, n_components), are assumed to be
# defined elsewhere in the project.
def main(use_censored=USE_CENSORED, use_similarity=USE_SIMILARITY, grid_results_folder='rnn_grid_search_no_censored'):
    if not use_recorded:
        x_for_deep, y_for_deep, x_for_deep_censored, y_for_deep_censored, censored_data, not_censored, otu_after_pca_wo_taxonomy, OtuMf = prep_data(n_components)
    else:
        x_for_deep = pickle.load(open(os.path.join(SCRIPT_DIR, "x_for_deep.p"), "rb"))
        y_for_deep = pickle.load(open(os.path.join(SCRIPT_DIR, "y_for_deep.p"), "rb"))
        x_for_deep_censored = pickle.load(open(os.path.join(SCRIPT_DIR, "x_for_deep_censored.p"), "rb"))
        y_for_deep_censored = pickle.load(open(os.path.join(SCRIPT_DIR, "y_for_deep_censored.p"), "rb"))
        censored_data = pickle.load(open(os.path.join(SCRIPT_DIR, "censored_data.p"), "rb"))
        not_censored = pickle.load(open(os.path.join(SCRIPT_DIR, "not_censored.p"), "rb"))
        otu_after_pca_wo_taxonomy = pickle.load(open(os.path.join(SCRIPT_DIR, "otu_after_pca_wo_taxonomy.p"), "rb"))
        OtuMf = pickle.load(open(os.path.join(SCRIPT_DIR, "OtuMf.p"), "rb"))

    if record_inputs:
        pickle.dump(x_for_deep, open(os.path.join(SCRIPT_DIR, "x_for_deep.p"), "wb"))
        pickle.dump(y_for_deep, open(os.path.join(SCRIPT_DIR, "y_for_deep.p"), "wb"))
        pickle.dump(x_for_deep_censored, open(os.path.join(SCRIPT_DIR, "x_for_deep_censored.p"), "wb"))
        pickle.dump(y_for_deep_censored, open(os.path.join(SCRIPT_DIR, "y_for_deep_censored.p"), "wb"))
        pickle.dump(censored_data, open(os.path.join(SCRIPT_DIR, "censored_data.p"), "wb"))
        pickle.dump(not_censored, open(os.path.join(SCRIPT_DIR, "not_censored.p"), "wb"))
        pickle.dump(otu_after_pca_wo_taxonomy, open(os.path.join(SCRIPT_DIR, "otu_after_pca_wo_taxonomy.p"), "wb"))
        pickle.dump(OtuMf, open(os.path.join(SCRIPT_DIR, "OtuMf.p"), "wb"))

    if use_similarity:
        betas_list = [1, 10, 100]
    else:
        betas_list = [None]  # single-element list so the loop runs exactly once

    for beta in betas_list:
        censored_mse_fraction_factor = None

        if use_censored:
            # censored samples get a zero MSE coefficient so they do not
            # contribute to the MSE term
            y_for_deep_censored['mse_coeff'] = 0

        if use_similarity:
            censored_mse_fraction_factor = 2

            ##### Similarity algo ####
            not_censored_for_similarity = not_censored.join(otu_after_pca_wo_taxonomy)

            censored_data_with_time = compute_time_for_censored_using_similarity_matrix(not_censored_for_similarity,
                                                                                        censored_data,
                                                                                        n_components,
                                                                                        OtuMf,
                                                                                        otu_after_pca_wo_taxonomy,
                                                                                        beta=beta,
                                                                                        remove_outliers=True,
                                                                                        th_value=None)

            # merge the x_censored frames with the synthetic times
            censored_idx = censored_data_with_time['time_for_the_event'].index
            x_for_deep_censored.loc[censored_idx, 'time_for_the_event'] = \
                censored_data_with_time['time_for_the_event']
            y_for_deep_censored.loc[censored_idx, 'delta_time'] = \
                censored_data_with_time['time_for_the_event']

            # change the MSE coeff for the last sample of censored (just prep;
            # the actual value is set within the algo)
            y_for_deep_censored.loc[censored_idx, 'mse_coeff'] = 'last_censored'

            ##### END Similarity algo ####


        # the PCA component columns are integer-labeled starting at 0
        starting_col = np.argwhere(x_for_deep.columns == 0).tolist()[0][0]
        X = x_for_deep.iloc[:, starting_col:starting_col + n_components].copy()
        X['groupby'] = x_for_deep['PatientNumber210119']
        y = y_for_deep  # the full frame, not just ['delta_time']

        starting_col = np.argwhere(x_for_deep_censored.columns == 0).tolist()[0][0]
        X_train_censored = x_for_deep_censored.iloc[:, starting_col:starting_col + n_components].copy()
        X_train_censored['groupby'] = x_for_deep_censored['PatientNumber210119']
        y_train_censored = y_for_deep_censored
        number_samples_censored = y_train_censored.shape[0]
        print(f'Number of censored subjects: {number_samples_censored}')

        # remove outliers: drop samples whose delta_time exceeds 5 standard deviations
        before_removal = y.shape[0]
        std = y['delta_time'].values.std()
        th = std * 5

        outlier_mask = y['delta_time'] < th
        y = y.loc[outlier_mask]
        X = X.loc[outlier_mask]

        after_removal = y.shape[0]
        print(f'{before_removal - after_removal} outlier(s) removed')

        stats_input(y, y_train_censored)

        PLOT_INPUT_TO_NN_STATS = False
        if PLOT_INPUT_TO_NN_STATS:
            plt.hist(y['delta_time'].values, bins=150)
            med = np.median(y['delta_time'].values)
            std = y['delta_time'].values.std()
            mean = y['delta_time'].values.mean()

            plt.title(f'STD={std}, MED={med}, Mean={mean}')

        # hyper-parameter grid for the RNN search (earlier, coarser grids are
        # kept as comments)
        epochs_list = [1000]  # also tried: ['MAX', 20, 100, 1000]
        mse_factor_list = [0.1, 10, 1000]  # np.arange(0.005, 1, 0.005)
        l2_lambda_list = [0.1, 1, 10, 100]  # np.logspace(0, 2, 5); also tried: [1, 20]
        dropout_list = [0, 0.2, 0.6]  # np.arange(0, 0.8, 0.1)
        number_layers_list = [1, 2, 3]
        number_neurons_per_layer_list = [10, 30]  # also tried: [20, 50]

        if not use_similarity and not use_censored:
            mse_factor_list = [1]
            X_train_censored = None
            y_train_censored = None

        # run the grid search once per beta
        train_res, test_res = time_series_analysis_rnn(X, y,
                                                       n_components,
                                                       l2_lambda_list,
                                                       dropout_list,
                                                       mse_factor_list,
                                                       number_layers_list,
                                                       number_neurons_per_layer_list,
                                                       epochs_list,
                                                       cross_val_number=5,
                                                       X_train_censored=X_train_censored,
                                                       y_train_censored=y_train_censored,
                                                       record=RECORD,
                                                       grid_search_dir=grid_results_folder,
                                                       beta_for_similarity=beta,
                                                       censored_mse_fraction_factor=censored_mse_fraction_factor,
                                                       early_stop_fraction=None,  # also tried: 0.05
                                                       min_epochs=30)

    total_num_of_configs = len(epochs_list) * \
                           len(mse_factor_list) * \
                           len(dropout_list) * \
                           len(l2_lambda_list) * \
                           len(number_layers_list) * \
                           len(number_neurons_per_layer_list) * \
                           len(betas_list)
    print(f'Total number of configurations checked: {total_num_of_configs}')
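
A minimal invocation sketch for this RNN variant; it assumes the module-level settings listed at the top of the file (USE_CENSORED, USE_SIMILARITY, use_recorded, record_inputs, RECORD, SCRIPT_DIR, n_components) are defined elsewhere in the project, as the code implies.

# Hypothetical entry point, kept as a comment so it does not collide with the
# second main() variant in Example #2 below:
#
#   if __name__ == '__main__':
#       main(use_censored=USE_CENSORED, use_similarity=USE_SIMILARITY,
#            grid_results_folder='rnn_grid_search_no_censored')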
Example #2

def main(use_similarity=USE_SIMILARITY,
         grid_results_folder='grid_search_xgboost_with_censored'):
    if not use_recorded:
        x_for_deep, y_for_deep, x_for_deep_censored, y_for_deep_censored, censored_data, not_censored, otu_after_pca_wo_taxonomy, OtuMf = prep_data(n_components)
    else:
        x_for_deep = pickle.load(open(os.path.join(SCRIPT_DIR, "x_for_deep.p"), "rb"))
        y_for_deep = pickle.load(open(os.path.join(SCRIPT_DIR, "y_for_deep.p"), "rb"))
        x_for_deep_censored = pickle.load(open(os.path.join(SCRIPT_DIR, "x_for_deep_censored.p"), "rb"))
        y_for_deep_censored = pickle.load(open(os.path.join(SCRIPT_DIR, "y_for_deep_censored.p"), "rb"))
        censored_data = pickle.load(open(os.path.join(SCRIPT_DIR, "censored_data.p"), "rb"))
        not_censored = pickle.load(open(os.path.join(SCRIPT_DIR, "not_censored.p"), "rb"))
        otu_after_pca_wo_taxonomy = pickle.load(open(os.path.join(SCRIPT_DIR, "otu_after_pca_wo_taxonomy.p"), "rb"))
        OtuMf = pickle.load(open(os.path.join(SCRIPT_DIR, "OtuMf.p"), "rb"))

    if record_inputs:
        pickle.dump(x_for_deep, open(os.path.join(SCRIPT_DIR, "x_for_deep.p"), "wb"))
        pickle.dump(y_for_deep, open(os.path.join(SCRIPT_DIR, "y_for_deep.p"), "wb"))
        pickle.dump(x_for_deep_censored, open(os.path.join(SCRIPT_DIR, "x_for_deep_censored.p"), "wb"))
        pickle.dump(y_for_deep_censored, open(os.path.join(SCRIPT_DIR, "y_for_deep_censored.p"), "wb"))
        pickle.dump(censored_data, open(os.path.join(SCRIPT_DIR, "censored_data.p"), "wb"))
        pickle.dump(not_censored, open(os.path.join(SCRIPT_DIR, "not_censored.p"), "wb"))
        pickle.dump(otu_after_pca_wo_taxonomy, open(os.path.join(SCRIPT_DIR, "otu_after_pca_wo_taxonomy.p"), "wb"))
        pickle.dump(OtuMf, open(os.path.join(SCRIPT_DIR, "OtuMf.p"), "wb"))

    if use_similarity:
        betas_list = [1, 10, 100]
    else:
        betas_list = [None]  # single-element list so the loop runs exactly once
    for beta in betas_list:
        if use_similarity:
            ##### Similarity algo ####
            not_censored_for_similarity = not_censored.join(otu_after_pca_wo_taxonomy)

            censored_data_with_time = compute_time_for_censored_using_similarity_matrix(
                not_censored_for_similarity,
                censored_data,
                n_components,
                OtuMf,
                otu_after_pca_wo_taxonomy,
                beta=beta,
                remove_outliers=True,
                th_value=None)

            # merge the x_censored frames with the synthetic times
            censored_idx = censored_data_with_time['time_for_the_event'].index
            x_for_deep_censored.loc[censored_idx, 'time_for_the_event'] = \
                censored_data_with_time['time_for_the_event']
            y_for_deep_censored.loc[censored_idx, 'delta_time'] = \
                censored_data_with_time['time_for_the_event']

            # change the MSE coeff for the last sample of censored
            y_for_deep_censored.loc[censored_idx, 'mse_coeff'] = 5

            ##### END Similarity algo ####

        # the PCA component columns are integer-labeled starting at 0
        starting_col = np.argwhere(x_for_deep.columns == 0).tolist()[0][0]
        X = x_for_deep.iloc[:, starting_col:starting_col + n_components]
        y = y_for_deep['delta_time']

        starting_col = np.argwhere(x_for_deep_censored.columns == 0).tolist()[0][0]
        X_train_censored = x_for_deep_censored.iloc[:, starting_col:starting_col + n_components]
        y_train_censored = y_for_deep_censored['delta_time']
        number_samples_censored = y_train_censored.shape[0]
        print(f'Number of censored subjects: {number_samples_censored}')

        # XGBoost hyper-parameter grid
        alpha_list = [0.01, 20, 50, 100]
        n_estimators_list = [5, 10, 20]
        min_child_weight_list = [0.1, 1, 10, 20]
        reg_lambda_list = [0, 10, 20]
        max_depth_list = [3, 5, 10]

        # reduced grid for quick runs (uncomment to check a single configuration):
        # alpha_list = [0.01]
        # n_estimators_list = [5]
        # min_child_weight_list = [0.1]
        # reg_lambda_list = [0]
        # max_depth_list = [3]

        if not use_similarity:
            X_train_censored = None
            y_train_censored = None

        train_res, test_res = time_series_using_xgboost(
            X,
            y,
            alpha_list,
            n_estimators_list,
            min_child_weight_list,
            reg_lambda_list,
            max_depth_list,
            cross_val_number=5,
            X_train_censored=X_train_censored,
            y_train_censored=y_train_censored,
            record=RECORD,
            grid_search_dir=grid_results_folder,
            deep_verbose=False,
            beta_for_similarity=beta,
            use_random_time=True)


    total_num_of_configs = len(alpha_list) * \
                           len(n_estimators_list) * \
                           len(min_child_weight_list) * \
                           len(reg_lambda_list) * \
                           len(max_depth_list) * \
                           len(betas_list)
    print(f'Total number of configurations checked: {total_num_of_configs}')
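
Both variants write the synthetic censored times back with DataFrame.loc rather than chained indexing. Below is a minimal, self-contained pandas sketch of that merge pattern; the data, index labels, and the helper name _demo_loc_merge are invented for illustration.

def _demo_loc_merge():
    import pandas as pd

    # Toy stand-ins for y_for_deep_censored and the similarity output.
    y_censored = pd.DataFrame({'delta_time': [7.0, 3.0, 9.0]},
                              index=['s1', 's2', 's3'])
    synthetic_time = pd.Series([4.5, 6.2], index=['s1', 's3'])

    # .loc with (row labels, column label) assigns in place and aligns on the
    # index; chained indexing (df['col'][idx] = ...) may write to a temporary
    # copy and triggers pandas' SettingWithCopyWarning.
    y_censored.loc[synthetic_time.index, 'delta_time'] = synthetic_time
    print(y_censored)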