Example 1
def main(use_censored=USE_CENSORED,
         use_similarity=USE_SIMILARITY,
         grid_results_folder='grid_search_no_censored'):

    print(f'use censored: {use_censored}, use similarity: {use_similarity}')

    if not use_recorded:
        x_for_deep, y_for_deep, x_for_deep_censored, y_for_deep_censored, censored_data, not_censored, otu_after_pca_wo_taxonomy, OtuMf = prep_data(
            n_components)
    else:
        x_for_deep = pickle.load(
            open(os.path.join(SCRIPT_DIR, "x_for_deep.p"), "rb"))
        y_for_deep = pickle.load(
            open(os.path.join(SCRIPT_DIR, "y_for_deep.p"), "rb"))
        x_for_deep_censored = pickle.load(
            open(os.path.join(SCRIPT_DIR, "x_for_deep_censored.p"), "rb"))
        y_for_deep_censored = pickle.load(
            open(os.path.join(SCRIPT_DIR, "y_for_deep_censored.p"), "rb"))
        censored_data = pickle.load(
            open(os.path.join(SCRIPT_DIR, "censored_data.p"), "rb"))
        not_censored = pickle.load(
            open(os.path.join(SCRIPT_DIR, "not_censored.p"), "rb"))
        otu_after_pca_wo_taxonomy = pickle.load(
            open(os.path.join(SCRIPT_DIR, "otu_after_pca_wo_taxonomy.p"),
                 "rb"))
        OtuMf = pickle.load(open(os.path.join(SCRIPT_DIR, "OtuMf.p"), "rb"))

    if record_inputs:
        pickle.dump(x_for_deep,
                    open(os.path.join(SCRIPT_DIR, "x_for_deep.p"), "wb"))
        pickle.dump(y_for_deep,
                    open(os.path.join(SCRIPT_DIR, "y_for_deep.p"), "wb"))
        pickle.dump(
            x_for_deep_censored,
            open(os.path.join(SCRIPT_DIR, "x_for_deep_censored.p"), "wb"))
        pickle.dump(
            y_for_deep_censored,
            open(os.path.join(SCRIPT_DIR, "y_for_deep_censored.p"), "wb"))
        pickle.dump(censored_data,
                    open(os.path.join(SCRIPT_DIR, "censored_data.p"), "wb"))
        pickle.dump(not_censored,
                    open(os.path.join(SCRIPT_DIR, "not_censored.p"), "wb"))
        pickle.dump(
            otu_after_pca_wo_taxonomy,
            open(os.path.join(SCRIPT_DIR, "otu_after_pca_wo_taxonomy.p"),
                 "wb"))
        pickle.dump(OtuMf, open(os.path.join(SCRIPT_DIR, "OtuMf.p"), "wb"))

    if use_similarity:
        betas_list = [1, 10, 100]
    else:
        betas_list = [None]  # a single-element list so the loop below runs only once

    for beta in betas_list:
        censored_mse_fraction_factor = None

        if use_censored:
            y_for_deep_censored['mse_coeff'] = 0

        if use_similarity:
            censored_mse_fraction_factor = 2

            ##### Similarity algo ####
            not_censored_for_similarity = not_censored.join(
                otu_after_pca_wo_taxonomy)

            censored_data_with_time = compute_time_for_censored_using_similarity_matrix(
                not_censored_for_similarity,
                censored_data,
                n_components,
                OtuMf,
                otu_after_pca_wo_taxonomy,
                beta=beta,
                remove_outliers=True,
                th_value=None)

            # combine the x_censored and the synthetic time (use .loc to avoid
            # pandas chained-assignment warnings)
            synthetic_time_index = censored_data_with_time['time_for_the_event'].index
            x_for_deep_censored.loc[synthetic_time_index, 'time_for_the_event'] = \
                censored_data_with_time['time_for_the_event']
            y_for_deep_censored.loc[synthetic_time_index, 'delta_time'] = \
                censored_data_with_time['time_for_the_event']

            # change the MSE coeff for the last sample of censored (it's just prep,
            # the actual value will be set within the algo)
            y_for_deep_censored.loc[synthetic_time_index, 'mse_coeff'] = 'last_censored'

            ##### END Similarity algo ####

        starting_col = np.argwhere(x_for_deep.columns == 0).tolist()[0][0]
        X = x_for_deep.iloc[:, starting_col:starting_col + n_components]
        y = y_for_deep  # ['delta_time']

        starting_col = np.argwhere(
            x_for_deep_censored.columns == 0).tolist()[0][0]
        X_train_censored = x_for_deep_censored.iloc[:,
                                                    starting_col:starting_col +
                                                    n_components]
        y_train_censored = y_for_deep_censored
        number_samples_censored = y_train_censored.shape[0]
        print(f'Number of censored subjects: {number_samples_censored}')

        # remove outliers
        before_removal = y.shape[0]
        std = y['delta_time'].values.std()
        th = std * 5

        outlier_mask = y['delta_time'] < th
        y = y.loc[outlier_mask]
        X = X.loc[outlier_mask]

        after_removal = y.shape[0]
        print(f'{before_removal-after_removal} outlier/s were removed')

        stats_input(y, y_train_censored)

        PLOT_INPUT_TO_NN_STATS = False
        if PLOT_INPUT_TO_NN_STATS:
            plt.hist(y['delta_time'].values, bins=150)
            b = y['delta_time'].values.copy()
            b.sort()
            med = b[int(len(b) / 2)]
            std = y['delta_time'].values.std()
            mean = y['delta_time'].values.mean()

            plt.title(f'STD={std}, MED={med}, Mean={mean}')

        epochs_list = [20, 80]  # [10, 50, 100] # list(range(10,100,20)) + list(range(100,200,30))
        mse_factor_list = [0.1, 1, 10, 100, 1000]  # np.arange(0.005, 1, 0.005)

        if not use_similarity:
            # mse_factor_list = [1]
            if not use_censored:
                mse_factor_list = [1]
                X_train_censored = None
                y_train_censored = None

        dropout_list = [0, 0.2, 0.6]  #np.arange(0, 0.8, 0.1)
        l2_lambda_list = [1, 10, 20, 100]
        #np.logspace(0, 2, 5) #  0.01, 0.1, 1, 10, 100
        number_layers_list = [1, 2, 3]
        number_neurons_per_layer_list = [20, 50]
        # Narrow the grids above to a single configuration (see best_config)
        epochs_list = [1000]  # [10, 50, 100] # list(range(10,100,20)) + list(range(100,200,30))

        best_config = 'l2=1^dropout=0.2^factor=1^epochs=1000^number_iterations=5^number_layers=1^neurons_per_layer=20'
        l2_lambda_list = [1]
        dropout_list = [0.2]
        number_layers_list = [2]
        number_neurons_per_layer_list = [50]

        train_res, test_res = time_series_analysis_tf(
            X,
            y,
            n_components,
            l2_lambda_list,
            dropout_list,
            mse_factor_list,
            number_layers_list,
            number_neurons_per_layer_list,
            epochs_list,
            cross_val_number=10,
            X_train_censored=X_train_censored,
            y_train_censored=y_train_censored,
            record=RECORD,
            grid_search_dir=grid_results_folder,
            beta_for_similarity=beta,
            censored_mse_fraction_factor=censored_mse_fraction_factor)

    total_num_of_configs = len(dropout_list) *\
                               len(l2_lambda_list) *\
                               len(number_layers_list) *\
                               len(number_neurons_per_layer_list) *\
                               len(betas_list)
    print(f'Total number of configurations that were checked: {total_num_of_configs}')
Example 2
def main(use_similarity=USE_SIMILARITY,
         grid_results_folder='grid_search_xgboost_with_censored'):
    if not use_recorded:
        x_for_deep, y_for_deep, x_for_deep_censored, y_for_deep_censored, censored_data, not_censored, otu_after_pca_wo_taxonomy, OtuMf = prep_data(
            n_components)
    else:
        x_for_deep = pickle.load(
            open(os.path.join(SCRIPT_DIR, "x_for_deep.p"), "rb"))
        y_for_deep = pickle.load(
            open(os.path.join(SCRIPT_DIR, "y_for_deep.p"), "rb"))
        x_for_deep_censored = pickle.load(
            open(os.path.join(SCRIPT_DIR, "x_for_deep_censored.p"), "rb"))
        y_for_deep_censored = pickle.load(
            open(os.path.join(SCRIPT_DIR, "y_for_deep_censored.p"), "rb"))
        censored_data = pickle.load(
            open(os.path.join(SCRIPT_DIR, "censored_data.p"), "rb"))
        not_censored = pickle.load(
            open(os.path.join(SCRIPT_DIR, "not_censored.p"), "rb"))
        otu_after_pca_wo_taxonomy = pickle.load(
            open(os.path.join(SCRIPT_DIR, "otu_after_pca_wo_taxonomy.p"),
                 "rb"))
        OtuMf = pickle.load(open(os.path.join(SCRIPT_DIR, "OtuMf.p"), "rb"))

    if record_inputs:
        pickle.dump(x_for_deep,
                    open(os.path.join(SCRIPT_DIR, "x_for_deep.p"), "wb"))
        pickle.dump(y_for_deep,
                    open(os.path.join(SCRIPT_DIR, "y_for_deep.p"), "wb"))
        pickle.dump(
            x_for_deep_censored,
            open(os.path.join(SCRIPT_DIR, "x_for_deep_censored.p"), "wb"))
        pickle.dump(
            y_for_deep_censored,
            open(os.path.join(SCRIPT_DIR, "y_for_deep_censored.p"), "wb"))
        pickle.dump(censored_data,
                    open(os.path.join(SCRIPT_DIR, "censored_data.p"), "wb"))
        pickle.dump(not_censored,
                    open(os.path.join(SCRIPT_DIR, "not_censored.p"), "wb"))
        pickle.dump(
            otu_after_pca_wo_taxonomy,
            open(os.path.join(SCRIPT_DIR, "otu_after_pca_wo_taxonomy.p"),
                 "wb"))
        pickle.dump(OtuMf, open(os.path.join(SCRIPT_DIR, "OtuMf.p"), "wb"))

    if use_similarity:
        betas_list = [1, 10, 100]
    else:
        betas_list = [None]  # a single-element list so the loop below runs only once
    for beta in betas_list:
        if use_similarity:
            ##### Similarity algo ####
            not_censored_for_similarity = not_censored.join(
                otu_after_pca_wo_taxonomy)

            censored_data_with_time = compute_time_for_censored_using_similarity_matrix(
                not_censored_for_similarity,
                censored_data,
                n_components,
                OtuMf,
                otu_after_pca_wo_taxonomy,
                beta=beta,
                remove_outliers=True,
                th_value=None)

            # combine the x_censored and the synthetic time (use .loc to avoid
            # pandas chained-assignment warnings)
            synthetic_time_index = censored_data_with_time['time_for_the_event'].index
            x_for_deep_censored.loc[synthetic_time_index, 'time_for_the_event'] = \
                censored_data_with_time['time_for_the_event']
            y_for_deep_censored.loc[synthetic_time_index, 'delta_time'] = \
                censored_data_with_time['time_for_the_event']

            # change the MSE coeff for the last sample of censored
            y_for_deep_censored.loc[synthetic_time_index, 'mse_coeff'] = 5

            ##### END Similarity algo ####

        starting_col = np.argwhere(x_for_deep.columns == 0).tolist()[0][0]
        X = x_for_deep.iloc[:, starting_col:starting_col + n_components]
        y = y_for_deep['delta_time']

        starting_col = np.argwhere(
            x_for_deep_censored.columns == 0).tolist()[0][0]
        X_train_censored = x_for_deep_censored.iloc[:,
                                                    starting_col:starting_col +
                                                    n_components]
        y_train_censored = y_for_deep_censored['delta_time']
        number_samples_censored = y_train_censored.shape[0]
        print(f'Number of censored subjects: {number_samples_censored}')

        if REMOVE_OUTLIERS:
            # remove outliers
            before_removal = y.shape[0]
            std = y.values.std()
            th = std * 5

            outlier_mask = y < th
            y = y.loc[outlier_mask]
            X = X.loc[outlier_mask]

            after_removal = y.shape[0]
            print(f'{before_removal - after_removal} outlier/s were removed')

        alpha_list = [0.01, 20, 50, 100]
        n_estimators_list = [5, 10, 20]
        min_child_weight_list = [0.1, 1, 10, 20]
        reg_lambda_list = [0, 10, 20]
        max_depth_list = [3, 5, 10]

        # alpha_list = [0.01]
        # n_estimators_list = [5]
        # min_child_weight_list = [0.1]
        # reg_lambda_list = [0]
        # max_depth_list = [3]

        if not use_similarity:
            X_train_censored = None
            y_train_censored = None

        train_res, test_res = time_series_using_xgboost(
            X,
            y,
            alpha_list,
            n_estimators_list,
            min_child_weight_list,
            reg_lambda_list,
            max_depth_list,
            cross_val_number=5,
            X_train_censored=X_train_censored,
            y_train_censored=y_train_censored,
            record=RECORD,
            grid_search_dir=grid_results_folder,
            deep_verbose=False,
            beta_for_similarity=beta,
            use_random_time=True)


    total_num_of_configs = len(alpha_list) *\
                           len(n_estimators_list) *\
                           len(min_child_weight_list) *\
                           len(reg_lambda_list) *\
                           len(max_depth_list) *\
                           len(betas_list)
    print(f'Total number of configurations that were checked: {total_num_of_configs}')