Example #1
def create_and_save_partitions(dataset,
                               study_name,
                               meta_label,
                               test_groups,
                               pretest_groups,
                               valid_groups,
                               save_text_files=True):
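    # split dataset into test, pretest, validation, and training partitions by matching
    # dataset.rowmeta[meta_label] against the given group lists, then save each partition
    # as a pickle and, optionally, as split text files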

    # determine dataset orientation
    orientation = 'skinny' if dataset.shape[0] > dataset.shape[1] else 'fat'

    # discard null categories
    tobediscarded = np.in1d(
        dataset.rowmeta[meta_label],
        ['-666', '', 'NA', 'N/A', 'na', 'n/a', 'NaN', 'NAN', 'nan'])
    dataset.discard(tobediscarded, 0)
    print('discarding {0!s} samples...'.format(tobediscarded.sum()),
          flush=True)
    print(dataset, flush=True)

    # partition the data
    tobepopped = np.in1d(dataset.rowmeta[meta_label], test_groups)
    dataset_test = dataset.pop(tobepopped, 0)
    print('    TEST', flush=True)
    print(dataset_test, flush=True)
    tobepopped = np.in1d(dataset.rowmeta[meta_label], pretest_groups)
    dataset_pretest = dataset.pop(tobepopped, 0)
    print('    PRETEST', flush=True)
    print(dataset_pretest, flush=True)
    tobepopped = np.in1d(dataset.rowmeta[meta_label], valid_groups)
    dataset_valid = dataset.pop(tobepopped, 0)
    print('    VALID', flush=True)
    print(dataset_valid, flush=True)
    dataset_train = dataset
    print('    TRAIN', flush=True)
    print(dataset_train, flush=True)

    # save data partitions
    savefolder = '../partitioned_data/{0}/{1}'.format(study_name, orientation)
    print('    SAVING PARTITIONS TO {0}'.format(savefolder), flush=True)
    os.makedirs(savefolder)
    datasetIO.save_datamatrix('{0}/test.pickle'.format(savefolder),
                              dataset_test)
    datasetIO.save_datamatrix('{0}/pretest.pickle'.format(savefolder),
                              dataset_pretest)
    datasetIO.save_datamatrix('{0}/valid.pickle'.format(savefolder),
                              dataset_valid)
    datasetIO.save_datamatrix('{0}/train.pickle'.format(savefolder),
                              dataset_train)
    if save_text_files:
        os.mkdir('{0}/test'.format(savefolder))
        datasetIO.save_splitdata('{0}/test'.format(savefolder), dataset_test)
        os.mkdir('{0}/pretest'.format(savefolder))
        datasetIO.save_splitdata('{0}/pretest'.format(savefolder),
                                 dataset_pretest)
        os.mkdir('{0}/valid'.format(savefolder))
        datasetIO.save_splitdata('{0}/valid'.format(savefolder), dataset_valid)
        os.mkdir('{0}/train'.format(savefolder))
        datasetIO.save_splitdata('{0}/train'.format(savefolder), dataset_train)
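
# A minimal usage sketch, not part of the original source: the input path, study name,
# metadata label, and group values below are hypothetical; it assumes a datamatrix loaded
# with datasetIO.load_datamatrix whose rowmeta contains the chosen meta_label.
#
#     dataset = datasetIO.load_datamatrix('../original_data/my_study/datamatrix.pickle')
#     create_and_save_partitions(dataset,
#                                study_name='my_study',
#                                meta_label='cohort',
#                                test_groups=['cohort_5'],
#                                pretest_groups=['cohort_4'],
#                                valid_groups=['cohort_3'],
#                                save_text_files=True)
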
def main(project_name, hyperparameters, evaluation_statistics, selection_criteria, sigma_multipliers):
    
    min_num_hp_combinations = 100
    num_gp_optimizer_restarts = 0 # 4
    outlier_sigma_multiplier = 6
    
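    # build a 100x100 grid over the unit square of the (normalized) hyperparameter search
    # domain; fxy = 2*x*y/(x + y) is the harmonic mean of the two coordinates (1e-6 avoids
    # division by zero), and the grid is sorted by fxy so that a lower index corresponds to
    # a simpler (more regularized) model, as the selection criteria below assume;
    # grid_indices restores the original row-major grid order for plotting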
    xline = np.linspace(0, 1, 100, dtype='float64')
    yline = np.linspace(0, 1, 100, dtype='float64')
    xmat, ymat = np.meshgrid(xline, yline)
    Xarr = np.append(xmat.reshape(-1,1), ymat.reshape(-1,1), 1)
    fxy = 2*Xarr[:,0]*Xarr[:,1]/(Xarr[:,0] + Xarr[:,1] + 1e-6)
    si = np.argsort(fxy)
    fxy = fxy[si]
    Xarr = Xarr[si,:]
    grid_indices = np.argsort(si)
    
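    # GP kernel: white noise + constant amplitude * anisotropic RBF over the two hyperparameter axes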
    kernel = SumKernel(
        WhiteKernel(noise_level=1.0, noise_level_bounds=(1e-6, 1e3)),
        ProductKernel(
            ConstantKernel(constant_value=1.0, constant_value_bounds=(1e-6, 1e3)),
            RBFKernel(length_scale=np.array([1.0, 1.0], dtype='float64'),
                      length_scale_bounds=(1e-2, 1e2))))
    
    project_folder = '../../hp_search/{0}'.format(project_name)
    print('project: {0}...'.format(project_name), flush=True)
    print('project_folder: {0}...'.format(project_folder), flush=True)
    
    search_folders = ['{0}/{1}'.format(project_folder, f) for f in os.listdir(project_folder) if f[:10] == 'hp_search_']
    search_ids = [int(f.rsplit('_', maxsplit=1)[-1]) for f in search_folders]
    print('found {0!s} search folders.'.format(len(search_folders)), flush=True)
        
    for search_id, search_folder in zip(search_ids, search_folders):
        print('working on search_folder: {0}...'.format(search_folder), flush=True)
        search_data_path = '{0}/hp_search_data.txt'.format(search_folder)
        search_data_path_with_stats = '{0}/hp_search_data_with_performance_stats.txt'.format(search_folder)
        print('search_data_path: {0}'.format(search_data_path), flush=True)
        if os.path.exists(search_data_path) and os.path.getsize(search_data_path) > 0:
            print('loading search data...', flush=True)
            df = pd.read_table(search_data_path, index_col=False)
            if df.shape[0] >= min_num_hp_combinations:
                print('appending performance stats...', flush=True)
                if os.path.exists(search_data_path_with_stats) and os.path.getsize(search_data_path_with_stats) > 0:
                    df = pd.read_table(search_data_path_with_stats, index_col=False)
                else:
                    for stage in ['validation', 'testing']:
                        print('working on {0} stage...'.format(stage), flush=True)
                        for rowidx, combination_id in enumerate(df.combination_id):
                            combination_folder = '{0}/hp_combination_{1!s}'.format(search_folder, combination_id)
                            performance_data_path = '{0}/stat_subset_datamatrix_{1}.txt.gz'.format(combination_folder, stage)
                            if os.path.exists(performance_data_path):
                                stat_subset = datasetIO.load_datamatrix(performance_data_path)
                                if 'stat_mat' not in locals():
                                    stat_mat = np.full((df.shape[0], stat_subset.size), np.nan, dtype='float64')
                                    stat_cols = (stage + '_' + stat_subset.rowlabels.reshape(-1,1) + '_' + stat_subset.columnlabels.reshape(1,-1)).reshape(-1)
                                stat_mat[rowidx,:] = stat_subset.matrix.reshape(-1)
                        stat_df = pd.DataFrame(data=stat_mat, columns=stat_cols)
                        stat_df['combination_id'] = df.combination_id.values
                        df = df.set_index('combination_id').join(stat_df.set_index('combination_id')).reset_index()
                        del stat_mat, stat_cols, stat_df
                    df.to_csv(search_data_path_with_stats, sep='\t', index=False)
                
                if '{0}_search_domain'.format(hyperparameters[0]) not in df.columns:
                    df['{0}_search_domain'.format(hyperparameters[0])] = 0.5
                if '{0}_search_domain'.format(hyperparameters[1]) not in df.columns:
                    df['{0}_search_domain'.format(hyperparameters[1])] = 0.5
                if '{0}_model_space'.format(hyperparameters[0]) not in df.columns:
                    df['{0}_model_space'.format(hyperparameters[0])] = 1
                if '{0}_model_space'.format(hyperparameters[1]) not in df.columns:
                    df['{0}_model_space'.format(hyperparameters[1])] = 1
                
                for evaluation_statistic in evaluation_statistics:
                    print('working on performance evaluation statistic: {0}...'.format(evaluation_statistic), flush=True)
                    
                    C = df['combination_id'].values
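                    # map the bounded evaluation statistics from (0, 1) onto the real line
                    # with a base-10 logit transform before fitting the Gaussian processes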
                    Y_fit = df['validation_{0}_fit'.format(evaluation_statistic)].values
                    Y_fit = np.log10(Y_fit/(1-Y_fit))
                    Y_predict = df['validation_{0}_predict'.format(evaluation_statistic)].values
                    Y_predict = np.log10(Y_predict/(1-Y_predict))
                    Y_diff = Y_fit - Y_predict
                    X_1 = df['{0}_search_domain'.format(hyperparameters[0])].values
                    X_2 = df['{0}_search_domain'.format(hyperparameters[1])].values
                    keep = np.isfinite(np.concatenate((Y_fit.reshape(-1,1), Y_predict.reshape(-1,1), Y_diff.reshape(-1,1), X_1.reshape(-1,1), X_2.reshape(-1,1)), 1)).all(1)
                    C = C[keep]
                    Y_fit = Y_fit[keep]
                    Y_predict = Y_predict[keep]
                    Y_diff = Y_diff[keep]
                    X_1 = X_1[keep]
                    X_2 = X_2[keep]
                    X = np.append(X_1.reshape(-1,1), X_2.reshape(-1,1), 1)
                    
                    print('fitting Y_predict...', flush=True)
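                    # outlier-robust fit: repeatedly fit the GP, flag points more than
                    # outlier_sigma_multiplier posterior standard deviations from the
                    # posterior mean, and refit without them until the outlier count stops growing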
                    is_outlier = np.zeros(Y_predict.size, dtype='bool')
                    prev_outliers = -1
                    curr_outliers = 0
                    num_fits = 0
                    while curr_outliers - prev_outliers > 0 and not is_outlier.all():
                        gp_predict = GaussianProcessRegressor(kernel=kernel, alpha=0, n_restarts_optimizer=num_gp_optimizer_restarts, normalize_y=True).fit(X[~is_outlier,:], Y_predict[~is_outlier])
                        Y_predict_hat_mean, Y_predict_hat_stdv = gp_predict.predict(X, return_std=True)
                        is_outlier = np.abs(Y_predict - Y_predict_hat_mean) > outlier_sigma_multiplier*Y_predict_hat_stdv
                        prev_outliers = curr_outliers
                        curr_outliers = is_outlier.sum()
                        num_fits += 1
                        print('num_fits', num_fits, 'curr_outliers', curr_outliers, 'prev_outliers', prev_outliers, flush=True)
                    Y_predict_hat_mean, Y_predict_hat_stdv = gp_predict.predict(Xarr, return_std=True)
                    plt.imsave('{0}/{1}_predict_hat_mean_4.png'.format(search_folder, evaluation_statistic), Y_predict_hat_mean[grid_indices].reshape(xmat.shape[0], xmat.shape[1]))
                    plt.imsave('{0}/{1}_predict_hat_stdv_4.png'.format(search_folder, evaluation_statistic), Y_predict_hat_stdv[grid_indices].reshape(xmat.shape[0], xmat.shape[1]))
                    
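                    # repeat the outlier-robust GP fit for Y_diff = Y_fit - Y_predict,
                    # the logit-scale gap between fitting and prediction performance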
                    print('fitting Y_diff...', flush=True)
                    is_outlier = np.zeros(Y_diff.size, dtype='bool')
                    prev_outliers = -1
                    curr_outliers = 0
                    num_fits = 0
                    while curr_outliers - prev_outliers > 0 and not is_outlier.all():
                        gp_diff = GaussianProcessRegressor(kernel=kernel, alpha=0, n_restarts_optimizer=num_gp_optimizer_restarts, normalize_y=True).fit(X[~is_outlier,:], Y_diff[~is_outlier])
                        Y_diff_hat_mean, Y_diff_hat_stdv = gp_diff.predict(X, return_std=True)
                        is_outlier = np.abs(Y_diff - Y_diff_hat_mean) > outlier_sigma_multiplier*Y_diff_hat_stdv
                        prev_outliers = curr_outliers
                        curr_outliers = is_outlier.sum()
                        num_fits += 1
                        print('num_fits', num_fits, 'curr_outliers', curr_outliers, 'prev_outliers', prev_outliers, flush=True)
                    Y_diff_hat_mean, Y_diff_hat_stdv = gp_diff.predict(Xarr, return_std=True)
                    plt.imsave('{0}/{1}_diff_hat_mean_4.png'.format(search_folder, evaluation_statistic), Y_diff_hat_mean[grid_indices].reshape(xmat.shape[0], xmat.shape[1]))
                    plt.imsave('{0}/{1}_diff_hat_stdv_4.png'.format(search_folder, evaluation_statistic), Y_diff_hat_stdv[grid_indices].reshape(xmat.shape[0], xmat.shape[1]))

                    for selection_criterion in selection_criteria:
                        print('working on selection criterion: {0}...'.format(selection_criterion), flush=True)
                        
                        for sigma_multiplier in sigma_multipliers:
                            print('working on sigma multiplier: {0}...'.format(sigma_multiplier), flush=True)
                            
                            if selection_criterion == 'optimistic_max':
                                # find hp combinations where Y_predict_hat_mean_max is within confidence interval of Y_predict_hat_mean
                                Y_predict_hat_mean_max = Y_predict_hat_mean.max()
                                Y_predict_hat_stdv_max = Y_predict_hat_stdv[Y_predict_hat_mean == Y_predict_hat_mean_max].mean()
                                hit = (Y_predict_hat_mean_max - Y_predict_hat_mean) <= (sigma_multiplier*Y_predict_hat_stdv + 1e-6)
                                # among these hits, find hp combinations where zero, or if no hits then Y_diff_hat_mean_min, is within confidence interval of Y_diff_hat_mean
                                Y_diff_hat_mean_min = np.min(np.abs(Y_diff_hat_mean[hit]))
                                Y_diff_hat_stdv_min = Y_diff_hat_stdv[hit][np.abs(Y_diff_hat_mean[hit]) == Y_diff_hat_mean_min].mean()
                                hit2 = np.logical_and(hit, np.abs(Y_diff_hat_mean) <= (sigma_multiplier*Y_diff_hat_stdv + 1e-6))
                                if not hit2.any():
                                    hit2 = np.logical_and(hit, (np.abs(Y_diff_hat_mean) - Y_diff_hat_mean_min) <= (sigma_multiplier*Y_diff_hat_stdv + 1e-6))
                                hit = hit2
                                # choose least regularized hp combination among the hits (***assumes lower index corresponds to simpler model***)
                                fxy_max = fxy[hit].max()
                                hit = np.logical_and(hit, (fxy_max - fxy) <= 1e-6)
                                hidx = hit.nonzero()[0][-1]
                            elif selection_criterion == 'conservative_max':
                                # find hp combinations where Y_predict_hat_mean_max is within confidence interval of Y_predict_hat_mean
                                Y_predict_hat_mean_max = Y_predict_hat_mean.max()
                                Y_predict_hat_stdv_max = Y_predict_hat_stdv[Y_predict_hat_mean == Y_predict_hat_mean_max].mean()
                                hit = (Y_predict_hat_mean_max - Y_predict_hat_mean) <= (sigma_multiplier*Y_predict_hat_stdv + 1e-6)
                                # among these hits, find hp combinations where zero, or if no hits then Y_diff_hat_mean_min, is within confidence interval of Y_diff_hat_mean
                                Y_diff_hat_mean_min = np.min(np.abs(Y_diff_hat_mean[hit]))
                                Y_diff_hat_stdv_min = Y_diff_hat_stdv[hit][np.abs(Y_diff_hat_mean[hit]) == Y_diff_hat_mean_min].mean()
                                hit2 = np.logical_and(hit, np.abs(Y_diff_hat_mean) <= (sigma_multiplier*Y_diff_hat_stdv + 1e-6))
                                if not hit2.any():
                                    hit2 = np.logical_and(hit, (np.abs(Y_diff_hat_mean) - Y_diff_hat_mean_min) <= (sigma_multiplier*Y_diff_hat_stdv + 1e-6))
                                hit = hit2
                                # choose simplest hp combination among the hits (***assumes lower index corresponds to simpler model***)
                                fxy_max = fxy[hit].max()
                                hit = np.logical_and(hit, (fxy_max - fxy) <= 1e-6)
                                hidx = hit.nonzero()[0][0]
                            elif selection_criterion == 'optimistic_match':
                                # find hp combinations where zero, or if no hits then Y_diff_hat_mean_min, is within confidence interval of Y_diff_hat_mean
                                Y_diff_hat_mean_min = np.min(np.abs(Y_diff_hat_mean))
                                Y_diff_hat_stdv_min = Y_diff_hat_stdv[np.abs(Y_diff_hat_mean) == Y_diff_hat_mean_min].mean()
                                hit = np.abs(Y_diff_hat_mean) <= (sigma_multiplier*Y_diff_hat_stdv + 1e-6)
                                if not hit.any():
                                    hit = (np.abs(Y_diff_hat_mean) - Y_diff_hat_mean_min) <= (sigma_multiplier*Y_diff_hat_stdv + 1e-6)
                                # among these hits, find hp combinations where Y_predict_hat_mean_max is within confidence interval of Y_predict_hat_mean
                                Y_predict_hat_mean_max = Y_predict_hat_mean[hit].max()
                                Y_predict_hat_stdv_max = Y_predict_hat_stdv[hit][Y_predict_hat_mean[hit] == Y_predict_hat_mean_max].mean()
                                hit = np.logical_and(hit, (Y_predict_hat_mean_max - Y_predict_hat_mean) <= (sigma_multiplier*Y_predict_hat_stdv + 1e-6))
                                # choose least regularized hp combination among the hits (***assumes lower index corresponds to simpler model***)
                                fxy_max = fxy[hit].max()
                                hit = np.logical_and(hit, (fxy_max - fxy) <= 1e-6)
                                hidx = hit.nonzero()[0][-1]
                            elif selection_criterion == 'conservative_match':
                                # find hp combinations where zero, or if no hits then Y_diff_hat_mean_min, is within confidence interval of Y_diff_hat_mean
                                Y_diff_hat_mean_min = np.min(np.abs(Y_diff_hat_mean))
                                Y_diff_hat_stdv_min = Y_diff_hat_stdv[np.abs(Y_diff_hat_mean) == Y_diff_hat_mean_min].mean()
                                hit = np.abs(Y_diff_hat_mean) <= (sigma_multiplier*Y_diff_hat_stdv + 1e-6)
                                if not hit.any():
                                    hit = (np.abs(Y_diff_hat_mean) - Y_diff_hat_mean_min) <= (sigma_multiplier*Y_diff_hat_stdv + 1e-6)
                                # among these hits, find hp combinations where Y_predict_hat_mean_max is within confidence interval of Y_predict_hat_mean
                                Y_predict_hat_mean_max = Y_predict_hat_mean[hit].max()
                                Y_predict_hat_stdv_max = Y_predict_hat_stdv[hit][Y_predict_hat_mean[hit] == Y_predict_hat_mean_max].mean()
                                hit = np.logical_and(hit, (Y_predict_hat_mean_max - Y_predict_hat_mean) <= (sigma_multiplier*Y_predict_hat_stdv + 1e-6))
                                # choose simplest hp combination among the hits (***assumes lower index corresponds to simpler model***)
                                fxy_max = fxy[hit].max()
                                hit = np.logical_and(hit, (fxy_max - fxy) <= 1e-6)
                                hidx = hit.nonzero()[0][0]
                            elif selection_criterion == 'optimistic_max_0':
                                # find hp combinations where Y_predict_hat_mean_max is within confidence interval of Y_predict_hat_mean
                                Y_predict_hat_mean_max = Y_predict_hat_mean.max()
                                Y_predict_hat_stdv_max = Y_predict_hat_stdv[Y_predict_hat_mean == Y_predict_hat_mean_max].mean()
                                hit = (Y_predict_hat_mean_max - Y_predict_hat_mean) <= (sigma_multiplier*Y_predict_hat_stdv + 1e-6)
                                # choose least regularized hp combination among the hits (***assumes lower index corresponds to simpler model***)
                                fxy_max = fxy[hit].max()
                                hit = np.logical_and(hit, (fxy_max - fxy) <= 1e-6)
                                hidx = hit.nonzero()[0][-1]
                            elif selection_criterion == 'conservative_max_0':
                                # find hp combinations where Y_predict_hat_mean_max is within confidence interval of Y_predict_hat_mean
                                Y_predict_hat_mean_max = Y_predict_hat_mean.max()
                                Y_predict_hat_stdv_max = Y_predict_hat_stdv[Y_predict_hat_mean == Y_predict_hat_mean_max].mean()
                                hit = (Y_predict_hat_mean_max - Y_predict_hat_mean) <= (sigma_multiplier*Y_predict_hat_stdv + 1e-6)
                                # choose simplest hp combination among the hits (***assumes lower index corresponds to simpler model***)
                                fxy_max = fxy[hit].max()
                                hit = np.logical_and(hit, (fxy_max - fxy) <= 1e-6)
                                hidx = hit.nonzero()[0][0]
                            elif selection_criterion == 'optimistic_match_0':
                                # find hp combinations where zero, or if no hits then Y_diff_hat_mean_min, is within confidence interval of Y_diff_hat_mean
                                Y_diff_hat_mean_min = np.min(np.abs(Y_diff_hat_mean))
                                Y_diff_hat_stdv_min = Y_diff_hat_stdv[np.abs(Y_diff_hat_mean) == Y_diff_hat_mean_min].mean()
                                hit = np.abs(Y_diff_hat_mean) <= sigma_multiplier*Y_diff_hat_stdv + 1e-6
                                if not hit.any():
                                    hit = (np.abs(Y_diff_hat_mean) - Y_diff_hat_mean_min) <= (sigma_multiplier*Y_diff_hat_stdv + 1e-6)
                                # choose least regularized hp combination among the hits (***assumes lower index corresponds to simpler model***)
                                fxy_max = fxy[hit].max()
                                hit = np.logical_and(hit, (fxy_max - fxy) <= 1e-6)
                                hidx = hit.nonzero()[0][-1]
                            elif selection_criterion == 'conservative_match_0':
                                # find hp combinations where zero, or if no hits then Y_diff_hat_mean_min, is within confidence interval of Y_diff_hat_mean
                                Y_diff_hat_mean_min = np.min(np.abs(Y_diff_hat_mean))
                                Y_diff_hat_stdv_min = Y_diff_hat_stdv[np.abs(Y_diff_hat_mean) == Y_diff_hat_mean_min].mean()
                                hit = np.abs(Y_diff_hat_mean) <= sigma_multiplier*Y_diff_hat_stdv + 1e-6
                                if not hit.any():
                                    hit = (np.abs(Y_diff_hat_mean) - Y_diff_hat_mean_min) <= (sigma_multiplier*Y_diff_hat_stdv + 1e-6)
                                # choose simplest hp combination among the hits (***assumes lower index corresponds to simpler model***)
                                fxy_max = fxy[hit].max()
                                hit = np.logical_and(hit, (fxy_max - fxy) <= 1e-6)
                                hidx = hit.nonzero()[0][0]
                            else:
                                raise ValueError('invalid selection_criterion')
                            
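                            # map the selected grid point back to the nearest evaluated
                            # hyperparameter combination (Euclidean distance in the search domain)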
                            X_1_hit, X_2_hit = Xarr[hidx,:]
                            d2 = (df['{0}_search_domain'.format(hyperparameters[0])].values - X_1_hit)**2 + (df['{0}_search_domain'.format(hyperparameters[1])].values - X_2_hit)**2
                            selidx = np.argmin(d2)
                            combination_id = df['combination_id'][selidx]
                            combination_folder = '{0}/hp_combination_{1!s}'.format(search_folder, combination_id)
                            selected_df = df[df.combination_id == combination_id].copy()
                            selected_df['search_id'] = search_id
                            selected_df['evaluation_statistic'] = evaluation_statistic
                            selected_df['selection_criterion'] = selection_criterion
                            selected_df['sigma_multiplier'] = sigma_multiplier
                            selected_df['Y_diff_hat_stdv_min'] = Y_diff_hat_stdv_min
                            selected_df['Y_diff_hat_mean_min'] = Y_diff_hat_mean_min
                            selected_df['Y_predict_hat_mean_max'] = Y_predict_hat_mean_max
                            selected_df['Y_predict_hat_stdv_max'] = Y_predict_hat_stdv_max
                            selected_df['Y_predict_hat_stdv_hit'] = Y_predict_hat_stdv[hidx]
                            selected_df['Y_predict_hat_mean_hit'] = Y_predict_hat_mean[hidx]
                            selected_df['Y_diff_hat_stdv_hit'] = Y_diff_hat_stdv[hidx]
                            selected_df['Y_diff_hat_mean_hit'] = Y_diff_hat_mean[hidx]
                            selected_df['X_1_hit'] = X_1_hit
                            selected_df['X_2_hit'] = X_2_hit
                            kernel_params = gp_predict.kernel_.get_params()
                            selected_df['kernel_noise_stdv'] = np.sqrt(kernel_params['k1__noise_level'])
                            selected_df['kernel_amplitude'] = kernel_params['k2__k1__constant_value']
                            selected_df['kernel_X_1_length_scale'], selected_df['kernel_X_2_length_scale'] = kernel_params['k2__k2__length_scale']
                            print('Y_predict_hat_mean_max: {0:1.3g}'.format(selected_df['Y_predict_hat_mean_max'].values[0]), flush=True)
                            print('Y_predict_hat_stdv_max: {0:1.3g}'.format(selected_df['Y_predict_hat_stdv_max'].values[0]), flush=True)
                            print('kernel_noise_stdv: {0:1.3g}'.format(selected_df['kernel_noise_stdv'].values[0]), flush=True)
                            print('kernel_amplitude: {0:1.3g}'.format(selected_df['kernel_amplitude'].values[0]), flush=True)
                            print('kernel_X_1_length_scale: {0:1.3g}'.format(selected_df['kernel_X_1_length_scale'].values[0]), flush=True)
                            print('kernel_X_2_length_scale: {0:1.3g}'.format(selected_df['kernel_X_2_length_scale'].values[0]), flush=True)
                            print('selected combination_id: {0!s}'.format(combination_id), flush=True)
                            print('selected combination_folder: {0}'.format(combination_folder), flush=True)
                            print('selected {0}_model_space: {1:1.3g}'.format(hyperparameters[0], selected_df['{0}_model_space'.format(hyperparameters[0])].values[0]), flush=True)
                            print('selected {0}_model_space: {1:1.3g}'.format(hyperparameters[1], selected_df['{0}_model_space'.format(hyperparameters[1])].values[0]), flush=True)
                            print('selected validation_{0}_fit: {1:1.3g}'.format(evaluation_statistic, selected_df['validation_{0}_fit'.format(evaluation_statistic)].values[0]), flush=True)
                            print('selected validation_{0}_predict: {1:1.3g}'.format(evaluation_statistic, selected_df['validation_{0}_predict'.format(evaluation_statistic)].values[0]), flush=True)
                            print('selected testing_{0}_fit: {1:1.3g}'.format(evaluation_statistic, selected_df['testing_{0}_fit'.format(evaluation_statistic)].values[0]), flush=True)
                            print('selected testing_{0}_predict: {1:1.3g}'.format(evaluation_statistic, selected_df['testing_{0}_predict'.format(evaluation_statistic)].values[0]), flush=True)
                            print('selected validation_ppv_fit: {0:1.3g}'.format(selected_df['validation_ppv_fit'].values[0]), flush=True)
                            print('selected validation_ppv_predict: {0:1.3g}'.format(selected_df['validation_ppv_predict'].values[0]), flush=True)
                            print('selected testing_ppv_fit: {0:1.3g}'.format(selected_df['testing_ppv_fit'].values[0]), flush=True)
                            print('selected testing_ppv_predict: {0:1.3g}'.format(selected_df['testing_ppv_predict'].values[0]), flush=True)
                            print('selected validation_tpr_fit: {0:1.3g}'.format(selected_df['validation_tpr_fit'].values[0]), flush=True)
                            print('selected validation_tpr_predict: {0:1.3g}'.format(selected_df['validation_tpr_predict'].values[0]), flush=True)
                            print('selected testing_tpr_fit: {0:1.3g}'.format(selected_df['testing_tpr_fit'].values[0]), flush=True)
                            print('selected testing_tpr_predict: {0:1.3g}'.format(selected_df['testing_tpr_predict'].values[0]), flush=True)
                            
                            feature_weights_path = '{0}/iter_feature_datamatrix.txt.gz'.format(combination_folder)
                            if os.path.exists(feature_weights_path) and os.path.getsize(feature_weights_path) > 0:
                                iter_feature = datasetIO.load_datamatrix(feature_weights_path)
                                iter_feature.rowmeta[iter_feature.rowname] = iter_feature.rowlabels.copy()
                                iter_feature.rowmeta['combination_id'] = selected_df['combination_id'].values.copy()
                                iter_feature.rowmeta['search_id'] = selected_df['search_id'].values.copy()
                                iter_feature.rowmeta['evaluation_statistic'] = selected_df['evaluation_statistic'].values.copy()
                                iter_feature.rowmeta['selection_criterion'] = selected_df['selection_criterion'].values.copy()
                                iter_feature.rowmeta['sigma_multiplier'] = selected_df['sigma_multiplier'].values.copy()
                                iter_feature.rowname = 'combination_id|search_id|evaluation_statistic|selection_criterion|sigma_multiplier'
                                iter_feature.rowlabels = np.array(['{0!s}|{1!s}|{2}|{3}|{4!s}'.format(ci, si, es, sc, sm) for ci, si, es, sc, sm in zip(iter_feature.rowmeta['combination_id'], iter_feature.rowmeta['search_id'], iter_feature.rowmeta['evaluation_statistic'], iter_feature.rowmeta['selection_criterion'], iter_feature.rowmeta['sigma_multiplier'])], dtype='object')
                                if 'feature_weights_dm' not in locals():
                                    feature_weights_dm = iter_feature
                                else:
                                    feature_weights_dm.append(iter_feature, 0)
                                del iter_feature
                            
                            if 'collected_df' not in locals():
                                collected_df = selected_df
                            else:
                                collected_df = pd.concat([collected_df, selected_df], ignore_index=True)  # DataFrame.append was removed in pandas 2.x
                            del selected_df
                
            else:
                print('missing combination data for search_id {0!s}. there are only {1!s} combinations'.format(search_id, df.shape[0]), flush=True)

        else:
            print('missing search data for search_id {0!s}'.format(search_id), flush=True)
                
        if np.mod(search_id, 10) == 0 and 'collected_df' in locals() and 'feature_weights_dm' in locals():  # avoid NameError before any selection has been made
            collected_df.to_csv('{0}_selected_hyperparameters_gp_multi_4.csv'.format(project_name), index=False)
            datasetIO.save_datamatrix('{0}_selected_hyperparameters_gp_multi_feature_weights_4.txt.gz'.format(project_name), feature_weights_dm)
            
    
    collected_df.to_csv('{0}_selected_hyperparameters_gp_multi_4.csv'.format(project_name), index=False)
    datasetIO.save_datamatrix('{0}_selected_hyperparameters_gp_multi_feature_weights_4.txt.gz'.format(project_name), feature_weights_dm)

    
    print('done select_hyperparameters_gp.py', flush=True)
Example #3
def main(visualizations_path):
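    # visualizations_path: tab-delimited file with one "design_path<tab>selected_step" pair
    # per line; for each design, the model variables saved at the selected training step are
    # reloaded and the embeddings, reconstructions, and figures are regenerated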

    # read visualizations
    print('reading visualizations...', flush=True)
    designpath_selectedstep = {}
    with open(visualizations_path,
              mode='rt',
              encoding='utf-8',
              errors='surrogateescape') as fr:
        for line in fr:
            design_path, selected_step = [x.strip() for x in line.split('\t')]
            designpath_selectedstep[design_path] = int(selected_step)
    print('found {0!s} visualizations...'.format(len(designpath_selectedstep)),
          flush=True)

    # make visualizations
    print('making visualizations...', flush=True)
    for didx, (design_path,
               selected_step) in enumerate(designpath_selectedstep.items()):
        print('working on {0}...'.format(design_path), flush=True)
        print('selected step:{0!s}...'.format(selected_step), flush=True)

        # load design
        print('loading design...', flush=True)
        with open(design_path,
                  mode='rt',
                  encoding='utf-8',
                  errors='surrogateescape') as fr:
            d = json.load(fr)
        if 'apply_activation_to_embedding' not in d:  # for legacy code
            d['apply_activation_to_embedding'] = True
        if 'use_batchnorm' not in d:  # for legacy code
            d['use_batchnorm'] = False
        if 'skip_layerwise_training' not in d:  # for legacy code
            d['skip_layerwise_training'] = False
        phase = d['training_schedule'][-1]
        d['current_hidden_layer'] = phase['hidden_layer']
        d['current_finetuning_run'] = phase['finetuning_run']
        d['current_epochs'] = phase['epochs']

        # load data
        if didx == 0:
            print('loading data...', flush=True)
            partitions = ['train', 'valid', 'test']
            dataset = {}
            for partition in partitions:
                dataset[partition] = datasetIO.load_datamatrix(
                    '{0}/{1}.pickle'.format(d['input_path'], partition))

        # finish configuration
        print('finishing configuration...', flush=True)

        # specify activation function
        if d['activation_function'] == 'tanh':
            activation_function = {'np': sdae_apply_functions.tanh}
        elif d['activation_function'] == 'relu':
            activation_function = {'np': sdae_apply_functions.relu}
        elif d['activation_function'] == 'elu':
            activation_function = {'np': sdae_apply_functions.elu}
        elif d['activation_function'] == 'sigmoid':
            activation_function = {'np': sdae_apply_functions.sigmoid}

        # initialize model architecture (number of layers and dimension of each layer)
        # dimensions of model up to current depth
        d['current_dimensions'] = d['all_dimensions'][:d['current_hidden_layer'] + 1]

        # specify embedding function for current training phase
        # we want the option of skipping the embedding activation function to apply only to the full model
        #        if not d['apply_activation_to_embedding'] and d['current_dimensions'] == d['all_dimensions']:
        #            d['current_apply_activation_to_embedding'] = False
        #        else:
        #            d['current_apply_activation_to_embedding'] = True
        if d['current_dimensions'] == d['all_dimensions']:
            if d['apply_activation_to_embedding']:
                d['current_apply_activation_to_embedding'] = True
                use_softmax = True
            else:
                d['current_apply_activation_to_embedding'] = False
                use_softmax = False
        else:
            d['current_apply_activation_to_embedding'] = True
            use_softmax = False
        print('current_apply_activation_to_embedding: {0!s}'.format(
            d['current_apply_activation_to_embedding']),
              flush=True)
        print('use_softmax: {0!s}'.format(use_softmax), flush=True)

        # specify rows and columns of figure showing data reconstructions
        d['reconstruction_rows'] = int(
            np.round(np.sqrt(np.min([100, dataset['valid'].shape[0]]) / 2)))
        d['reconstruction_cols'] = 2 * d['reconstruction_rows']
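        # rows*cols is approximately min(100, valid examples): about 100 reconstruction
        # panels arranged in a 1:2 grid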

        # load model variables
        print('loading model variables...', flush=True)
        with open(
                '{0}/intermediate_variables_layer{1!s}_finetuning{2!s}_step{3!s}.pickle'
                .format(d['output_path'], d['current_hidden_layer'],
                        d['current_finetuning_run'], selected_step),
                'rb') as fr:
            W, Be, Bd = pickle.load(fr)[1:]  # global_step, W, bencode, bdecode
        if d['use_batchnorm']:
            with open(
                    '{0}/intermediate_batchnorm_variables_layer{1!s}_finetuning{2!s}_step{3!s}.pickle'
                    .format(d['output_path'], d['current_hidden_layer'],
                            d['current_finetuning_run'], selected_step),
                    'rb') as fr:
                batchnorm_variables = pickle.load(
                    fr)  # gammas, betas, moving_means, moving_variances
            batchnorm_encode_variables, batchnorm_decode_variables = sdae_apply_functions.align_batchnorm_variables(
                batchnorm_variables,
                d['current_apply_activation_to_embedding'],
                d['apply_activation_to_output'])

        # compute embedding and reconstruction
        print('computing embedding and reconstruction...', flush=True)
        recon = {}
        embed = {}
        error = {}
        embed_preactivation = {}
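        # for each partition: encode/decode to get the reconstruction, embedding, and
        # reconstruction error (plus the pre-activation embedding), then save the embeddings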
        for partition in partitions:
            if d['use_batchnorm']:
                #                recon[partition], embed[partition], error[partition] = sdae_apply_functions.encode_and_decode(dataset[partition], W, Be, Bd, activation_function['np'], d['current_apply_activation_to_embedding'], d['apply_activation_to_output'], return_embedding=True, return_reconstruction_error=True, bn_encode_variables=batchnorm_encode_variables, bn_decode_variables=batchnorm_decode_variables)
                #                embed_preactivation[partition] = sdae_apply_functions.encode(dataset[partition], W, Be, activation_function['np'], apply_activation_to_embedding=False, bn_variables=batchnorm_encode_variables)
                recon[partition], embed[partition], error[
                    partition] = sdae_apply_functions.encode_and_decode(
                        dataset[partition],
                        W,
                        Be,
                        Bd,
                        activation_function['np'],
                        d['current_apply_activation_to_embedding'],
                        use_softmax,
                        d['apply_activation_to_output'],
                        return_embedding=True,
                        return_reconstruction_error=True,
                        bn_encode_variables=batchnorm_encode_variables,
                        bn_decode_variables=batchnorm_decode_variables)
                embed_preactivation[partition] = sdae_apply_functions.encode(
                    dataset[partition],
                    W,
                    Be,
                    activation_function['np'],
                    apply_activation_to_embedding=False,
                    use_softmax=use_softmax,
                    bn_variables=batchnorm_encode_variables)
            else:
                #                recon[partition], embed[partition], error[partition] = sdae_apply_functions.encode_and_decode(dataset[partition], W, Be, Bd, activation_function['np'], d['current_apply_activation_to_embedding'], d['apply_activation_to_output'], return_embedding=True, return_reconstruction_error=True)
                #                embed_preactivation[partition] = sdae_apply_functions.encode(dataset[partition], W, Be, activation_function['np'], apply_activation_to_embedding=False)
                recon[partition], embed[partition], error[
                    partition] = sdae_apply_functions.encode_and_decode(
                        dataset[partition],
                        W,
                        Be,
                        Bd,
                        activation_function['np'],
                        d['current_apply_activation_to_embedding'],
                        use_softmax,
                        d['apply_activation_to_output'],
                        return_embedding=True,
                        return_reconstruction_error=True)
                embed_preactivation[partition] = sdae_apply_functions.encode(
                    dataset[partition],
                    W,
                    Be,
                    activation_function['np'],
                    apply_activation_to_embedding=False,
                    use_softmax=use_softmax)

            print('{0} reconstruction error: {1:1.3g}'.format(
                partition, error[partition]),
                  flush=True)

            datasetIO.save_datamatrix(
                '{0}/{1}_intermediate_embedding_layer{2!s}_finetuning{3!s}_step{4!s}.pickle'
                .format(d['output_path'], partition, d['current_hidden_layer'],
                        d['current_finetuning_run'], selected_step),
                embed[partition])
            datasetIO.save_datamatrix(
                '{0}/{1}_intermediate_embedding_layer{2!s}_finetuning{3!s}_step{4!s}.txt.gz'
                .format(d['output_path'], partition, d['current_hidden_layer'],
                        d['current_finetuning_run'], selected_step),
                embed[partition])

            if d['current_apply_activation_to_embedding']:
                datasetIO.save_datamatrix(
                    '{0}/{1}_intermediate_embedding_preactivation_layer{2!s}_finetuning{3!s}_step{4!s}.pickle'
                    .format(d['output_path'], partition,
                            d['current_hidden_layer'],
                            d['current_finetuning_run'], selected_step),
                    embed_preactivation[partition])
                datasetIO.save_datamatrix(
                    '{0}/{1}_intermediate_embedding_preactivation_layer{2!s}_finetuning{3!s}_step{4!s}.txt.gz'
                    .format(d['output_path'], partition,
                            d['current_hidden_layer'],
                            d['current_finetuning_run'], selected_step),
                    embed_preactivation[partition])

        # plot reconstructions
        print('plotting reconstructions...', flush=True)
        num_recons = min([
            d['reconstruction_rows'] * d['reconstruction_cols'],
            dataset['valid'].shape[0]
        ])
        x_valid = dataset['valid'].matrix[:num_recons, :]
        xr_valid = recon['valid'].matrix[:num_recons, :]
        if x_valid.shape[1] > 1000:
            x_valid = x_valid[:, :1000]
            xr_valid = xr_valid[:, :1000]
        lb = np.append(x_valid, xr_valid, 1).min(1)
        ub = np.append(x_valid, xr_valid, 1).max(1)
        fg, axs = plt.subplots(d['reconstruction_rows'],
                               d['reconstruction_cols'],
                               figsize=(6.5, 3.25))
        for i, ax in enumerate(axs.reshape(-1)):
            if i < num_recons:
                ax.plot(x_valid[i, :],
                        xr_valid[i, :],
                        'ok',
                        markersize=0.5,
                        markeredgewidth=0)
                ax.set_ylim(lb[i], ub[i])
                ax.set_xlim(lb[i], ub[i])
                ax.tick_params(axis='both',
                               which='major',
                               left=False,
                               right=False,
                               bottom=False,
                               top=False,
                               labelleft=False,
                               labelright=False,
                               labelbottom=False,
                               labeltop=False,
                               pad=4)
                ax.set_frame_on(False)
                ax.axvline(lb[i], linewidth=1, color='k')
                ax.axvline(ub[i], linewidth=1, color='k')
                ax.axhline(lb[i], linewidth=1, color='k')
                ax.axhline(ub[i], linewidth=1, color='k')
            else:
                fg.delaxes(ax)
        fg.savefig(
            '{0}/intermediate_reconstructions_layer{1!s}_finetuning{2!s}_step{3!s}.png'
            .format(d['output_path'], d['current_hidden_layer'],
                    d['current_finetuning_run'], selected_step),
            transparent=True,
            pad_inches=0,
            dpi=1200)
        plt.close()

        # plot 2d embedding
        if d['current_dimensions'][-1] == 2:
            print('plotting 2d embedding...', flush=True)
            fg, ax = plt.subplots(1, 1, figsize=(6.5, 6.5))
            ax.set_position([0.15 / 6.5, 0.15 / 6.5, 6.2 / 6.5, 6.2 / 6.5])
            ax.plot(embed['train'].matrix[:, 0],
                    embed['train'].matrix[:, 1],
                    'ok',
                    markersize=2,
                    markeredgewidth=0,
                    alpha=0.5,
                    zorder=0)
            ax.plot(embed['valid'].matrix[:, 0],
                    embed['valid'].matrix[:, 1],
                    'or',
                    markersize=2,
                    markeredgewidth=0,
                    alpha=1.0,
                    zorder=1)
            ax.tick_params(axis='both',
                           which='major',
                           bottom=False,
                           top=False,
                           labelbottom=False,
                           labeltop=False,
                           left=False,
                           right=False,
                           labelleft=False,
                           labelright=False,
                           pad=4)
            ax.set_frame_on(False)
            fg.savefig(
                '{0}/intermediate_embedding_layer{1!s}_finetuning{2!s}_step{3!s}.png'
                .format(d['output_path'], d['current_hidden_layer'],
                        d['current_finetuning_run'], selected_step),
                transparent=True,
                pad_inches=0,
                dpi=600)
            plt.close()

            if d['current_apply_activation_to_embedding']:
                fg, ax = plt.subplots(1, 1, figsize=(6.5, 6.5))
                ax.set_position([0.15 / 6.5, 0.15 / 6.5, 6.2 / 6.5, 6.2 / 6.5])
                ax.plot(embed_preactivation['train'].matrix[:, 0],
                        embed_preactivation['train'].matrix[:, 1],
                        'ok',
                        markersize=2,
                        markeredgewidth=0,
                        alpha=0.5,
                        zorder=0)
                ax.plot(embed_preactivation['valid'].matrix[:, 0],
                        embed_preactivation['valid'].matrix[:, 1],
                        'or',
                        markersize=2,
                        markeredgewidth=0,
                        alpha=1.0,
                        zorder=1)
                ax.tick_params(axis='both',
                               which='major',
                               bottom=False,
                               top=False,
                               labelbottom=False,
                               labeltop=False,
                               left=False,
                               right=False,
                               labelleft=False,
                               labelright=False,
                               pad=4)
                ax.set_frame_on(False)
                fg.savefig(
                    '{0}/intermediate_embedding_preactivation_layer{1!s}_finetuning{2!s}_step{3!s}.png'
                    .format(d['output_path'], d['current_hidden_layer'],
                            d['current_finetuning_run'], selected_step),
                    transparent=True,
                    pad_inches=0,
                    dpi=600)
                plt.close()
        # plot heatmap
        else:
            print('plotting embedding heatmap...', flush=True)
            for partition in partitions:
                if 'all' not in embed:
                    embed['all'] = copy.deepcopy(embed[partition])
                else:
                    embed['all'].append(embed[partition], 0)
            embed['all'].cluster('all', 'cosine', 'average')
            embed['all'].heatmap(
                rowmetalabels=[],
                columnmetalabels=[],
                normalize=False,
                standardize=False,
                normalizebeforestandardize=True,
                cmap_name='bwr',
                ub=None,
                lb=None,
                savefilename=
                '{0}/intermediate_embedding_heatmap_layer{1!s}_finetuning{2!s}_step{3!s}.png'
                .format(d['output_path'], d['current_hidden_layer'],
                        d['current_finetuning_run'], selected_step),
                closefigure=True,
                dpi=300)
            if d['current_apply_activation_to_embedding']:
                for partition in partitions:
                    if 'all' not in embed_preactivation:
                        embed_preactivation['all'] = copy.deepcopy(
                            embed_preactivation[partition])
                    else:
                        embed_preactivation['all'].append(
                            embed_preactivation[partition], 0)
                embed_preactivation['all'].cluster('all', 'cosine', 'average')
                embed_preactivation['all'].heatmap(
                    rowmetalabels=[],
                    columnmetalabels=[],
                    normalize=False,
                    standardize=False,
                    normalizebeforestandardize=True,
                    cmap_name='bwr',
                    ub=None,
                    lb=None,
                    savefilename=
                    '{0}/intermediate_embedding_preactivation_heatmap_layer{1!s}_finetuning{2!s}_step{3!s}.png'
                    .format(d['output_path'], d['current_hidden_layer'],
                            d['current_finetuning_run'], selected_step),
                    closefigure=True,
                    dpi=300)

    print('done get_sdae_features.', flush=True)
def main(d):
    # d is a dictionary containing the auto-encoder design specifications and training phase specifications

    # RESET DEFAULT GRAPH
    print('resetting default graph...', flush=True)
    tf.reset_default_graph()

    # FINISH CONFIGURATION
    print('finishing configuration...', flush=True)

    # specify noise distribution
    if d['noise_distribution'] == 'truncnorm':
        noise_distribution = tf.truncated_normal
    elif d['noise_distribution'] == 'uniform':
        noise_distribution = tf.random_uniform

    # specify distribution of initial weights
    if d['initialization_distribution'] == 'truncnorm':
        initialization_distribution = tf.truncated_normal

    # specify activation function
    if d['activation_function'] == 'tanh':
        activation_function = {'tf': tf.tanh, 'np': sdae_apply_functions.tanh}
    elif d['activation_function'] == 'relu':
        activation_function = {
            'tf': tf.nn.relu,
            'np': sdae_apply_functions.relu
        }
    elif d['activation_function'] == 'elu':
        activation_function = {'tf': tf.nn.elu, 'np': sdae_apply_functions.elu}
    elif d['activation_function'] == 'sigmoid':
        activation_function = {
            'tf': tf.sigmoid,
            'np': sdae_apply_functions.sigmoid
        }

    # load data
    partitions = ['train', 'valid', 'test']
    dataset = {}
    for partition in partitions:
        dataset[partition] = datasetIO.load_datamatrix('{0}/{1}.pickle'.format(
            d['input_path'], partition))
        d['{0}_examples'.format(partition)] = dataset[partition].shape[0]

    # create output directory
    if not os.path.exists(d['output_path']):
        os.makedirs(d['output_path'])

    # initialize model architecture (number of layers and dimension of each layer)
    # dimensions of model up to current depth
    d['current_dimensions'] = d['all_dimensions'][:d['current_hidden_layer'] + 1]

    # specify embedding function for current training phase
    # we want the option of skipping the embedding activation function to apply only to the full model
    if not d['apply_activation_to_embedding'] and d['current_dimensions'] == d[
            'all_dimensions']:
        d['current_apply_activation_to_embedding'] = False
    else:
        d['current_apply_activation_to_embedding'] = True

    # initialize assignments of training examples to mini-batches and number of training steps for stochastic gradient descent
    d['batch_size'] = d['batch_fraction'] * d['train_examples']
    batch_ids = create_batch_ids(d['train_examples'], d['batch_size'])
    d['batches'] = np.unique(batch_ids).size
    d['steps'] = d['current_epochs'] * d['batches']

    # specify path to weights from previous training run
    d['previous_variables_path'] = '{0}/variables_layer{1!s}_finetuning{2!s}.pickle'.format(
        d['output_path'], d['previous_hidden_layer'],
        d['previous_finetuning_run'])
    # fix for pretraining, init for finetuning
    d['fix_or_init'] = 'fix' if d['current_finetuning_run'] == 0 else 'init'

    # specify rows and columns of figure showing data reconstructions
    d['reconstruction_rows'] = int(
        np.round(np.sqrt(np.min([100, d['valid_examples']]) / 2)))
    d['reconstruction_cols'] = 2 * d['reconstruction_rows']

    # print some design information
    print('input path: {0}'.format(d['input_path']), flush=True)
    print('output path: {0}'.format(d['output_path']), flush=True)
    print('previous variables path: {0}'.format(d['previous_variables_path']),
          flush=True)
    print('previous variables fix or init: {0}'.format(d['fix_or_init']),
          flush=True)

    # SAVE CURRENT DESIGN
    print('saving current design...', flush=True)
    with open('{0}/design_layer{1!s}_finetuning{2!s}.json'.format(
            d['output_path'], d['current_hidden_layer'],
            d['current_finetuning_run']),
              mode='wt',
              encoding='utf-8',
              errors='surrogateescape') as fw:
        json.dump(d, fw, indent=2)

    # DEFINE REPORTING VARIABLES
    print('defining reporting variables...', flush=True)
    reporting_steps = sdae_design_functions.create_reporting_steps(
        d['steps'], d['firstcheckpoint'], d['maxstepspercheckpoint'])
    valid_losses = np.zeros(reporting_steps.size, dtype='float32')
    train_losses = np.zeros(reporting_steps.size, dtype='float32')
    valid_noisy_losses = np.zeros(reporting_steps.size, dtype='float32')
    train_noisy_losses = np.zeros(reporting_steps.size, dtype='float32')
    print('reporting steps:', reporting_steps, flush=True)

    # DEFINE COMPUTATIONAL GRAPH
    # define placeholders for input data, use None to allow feeding different numbers of examples
    print('defining placeholders...', flush=True)
    noise_stdv = tf.placeholder(tf.float32, [])
    noise_prob = tf.placeholder(tf.float32, [])
    training_and_validation_data_initializer = tf.placeholder(
        tf.float32, [
            dataset['train'].shape[0] + dataset['valid'].shape[0],
            dataset['train'].shape[1]
        ])
    selection_mask = tf.placeholder(
        tf.bool, [dataset['train'].shape[0] + dataset['valid'].shape[0]])

    # define variables
    # W contains the weights, bencode contains the biases for encoding, and bdecode contains the biases for decoding
    print('defining variables...', flush=True)
    training_and_validation_data = tf.Variable(
        training_and_validation_data_initializer,
        trainable=False,
        collections=[])
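    # the concatenated train+valid matrix is loaded once into this non-trainable variable;
    # selection_mask later picks out the rows for the current mini-batch or a full partition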
    if os.path.exists(d['previous_variables_path']):
        # update variables (if continuing from a previous training run)
        print('loading previous variables...', flush=True)
        global_step, W, bencode, bdecode = update_variables(
            d['current_dimensions'], initialization_distribution,
            d['initialization_sigma'], d['previous_variables_path'],
            d['fix_or_init'], d['include_global_step'])
    elif d['current_hidden_layer'] == 1 and d['current_finetuning_run'] == 0:
        # create variables
        global_step, W, bencode, bdecode = create_variables(
            d['current_dimensions'], initialization_distribution,
            d['initialization_sigma'])
    else:
        raise ValueError('could not find previous variables')

    # define model
    # h contains the activations from input layer to bottleneck layer
    # hhat contains the activations from bottleneck layer to output layer
    # xhat is a reference to the output layer (i.e. the reconstruction)
    print('defining model...', flush=True)
    x = tf.boolean_mask(training_and_validation_data, selection_mask)
    if d['noise_distribution'] == 'truncnorm':
        noise = noise_distribution(tf.shape(x), stddev=noise_stdv)
    else:
        noise = noise_distribution(tf.shape(x),
                                   minval=-noise_stdv,
                                   maxval=noise_stdv)
    noise_mask = tf.to_float(tf.random_uniform(tf.shape(x)) <= noise_prob)
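    # each entry of noise_mask is 1 with probability noise_prob, so noise is applied to
    # (on average) that fraction of the input entries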
    xnoisy = apply_noise(x, noise, noise_mask, d['noise_operation'])
    h, hhat, xhat = create_autoencoder(
        xnoisy, activation_function['tf'], d['apply_activation_to_output'],
        d['current_apply_activation_to_embedding'], W, bencode, bdecode)

    # define loss
    print('defining loss...', flush=True)
    loss = tf.reduce_mean(tf.squared_difference(x, xhat))  # squared error loss

    # define optimizer and training function
    print('defining optimizer and training function...', flush=True)
    optimizer = tf.train.AdamOptimizer(learning_rate=d['learning_rate'],
                                       epsilon=d['epsilon'],
                                       beta1=d['beta1'],
                                       beta2=d['beta2'])
    train_fn = optimizer.minimize(loss, global_step=global_step)

    # define bottleneck layer preactivation
    #    bottleneck_preactivation = tf.matmul(h[-2], W[-1]) + bencode[-1]

    # INITIALIZE TENSORFLOW SESSION
    print('initializing tensorflow session...', flush=True)
    init = tf.global_variables_initializer()
    session_config = configure_session(d['processor'],
                                       d['gpu_memory_fraction'])
    with tf.Session(config=session_config) as sess:
        sess.run(init)

        # TRAINING
        print('training...', flush=True)
        sess.run(training_and_validation_data.initializer,
                 feed_dict={
                     training_and_validation_data_initializer:
                     np.append(dataset['train'].matrix,
                               dataset['valid'].matrix, 0)
                 })
        validation_id = -1
        batch_and_validation_ids = np.full(dataset['train'].shape[0] +
                                           dataset['valid'].shape[0],
                                           validation_id,
                                           dtype=batch_ids.dtype)
        is_train = np.append(np.ones(dataset['train'].shape[0], dtype='bool'),
                             np.zeros(dataset['valid'].shape[0], dtype='bool'))
        is_valid = ~is_train
        training_step = 0
        i = 0
        overfitting_score = 0
        stopearly = False
        starttime = time.time()

        with open('{0}/log_layer{1!s}_finetuning{2!s}.txt'.format(
                d['output_path'], d['current_hidden_layer'],
                d['current_finetuning_run']),
                  mode='wt',
                  buffering=1) as fl:
            fl.write('\t'.join([
                'step', 'train_loss', 'valid_loss', 'train_noisy_loss',
                'valid_noisy_loss', 'time'
            ]) + '\n')

            for epoch in range(d['current_epochs']):
                if stopearly:
                    break

                # randomize assignment of training examples to batches
                np.random.shuffle(batch_ids)
                batch_and_validation_ids[is_train] = batch_ids

                for batch in range(d['batches']):
                    training_step += 1

                    # select mini-batch
                    selected = batch_and_validation_ids == batch

                    # update weights
                    sess.run(train_fn,
                             feed_dict={
                                 selection_mask: selected,
                                 noise_prob: d['noise_probability'],
                                 noise_stdv: d['noise_sigma']
                             })

                    # record training and validation errors
                    if training_step == reporting_steps[i]:
                        train_losses[i] = sess.run(loss,
                                                   feed_dict={
                                                       selection_mask:
                                                       is_train,
                                                       noise_prob: 0,
                                                       noise_stdv: 0
                                                   })
                        train_noisy_losses[i] = sess.run(
                            loss,
                            feed_dict={
                                selection_mask: is_train,
                                noise_prob: d['noise_probability'],
                                noise_stdv: d['noise_sigma']
                            })
                        valid_losses[i] = sess.run(loss,
                                                   feed_dict={
                                                       selection_mask:
                                                       is_valid,
                                                       noise_prob: 0,
                                                       noise_stdv: 0
                                                   })
                        valid_noisy_losses[i] = sess.run(
                            loss,
                            feed_dict={
                                selection_mask: is_valid,
                                noise_prob: d['noise_probability'],
                                noise_stdv: d['noise_sigma']
                            })
                        print(
                            'step:{0:1.6g}, train loss:{1:1.3g}, valid loss:{2:1.3g}, train noisy loss:{3:1.3g}, valid noisy loss:{4:1.3g}, time:{5:1.6g}'
                            .format(reporting_steps[i], train_losses[i],
                                    valid_losses[i], train_noisy_losses[i],
                                    valid_noisy_losses[i],
                                    time.time() - starttime),
                            flush=True)
                        fl.write('\t'.join([
                            '{0:1.6g}'.format(x) for x in [
                                reporting_steps[i], train_losses[i],
                                valid_losses[i], train_noisy_losses[i],
                                valid_noisy_losses[i],
                                time.time() - starttime
                            ]
                        ]) + '\n')

                        # save current weights, reconstructions, and projections
                        if (training_step >= d['startsavingstep']
                                or training_step == reporting_steps[-1]):
                            with open(
                                    '{0}/intermediate_variables_layer{1!s}_finetuning{2!s}_step{3!s}.pickle'
                                    .format(d['output_path'],
                                            d['current_hidden_layer'],
                                            d['current_finetuning_run'],
                                            training_step), 'wb') as fw:
                                pickle.dump(
                                    (sess.run(global_step), sess.run(W),
                                     sess.run(bencode), sess.run(bdecode)), fw)

                            # stop early if overfitting
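                            # overfitting_score counts consecutive checkpoints whose validation loss is more than 1% above the best validation loss seen so far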
                            if valid_losses[i] >= 1.01 * (np.insert(
                                    valid_losses[:i], 0, np.inf).min()):
                                overfitting_score += 1
                            else:
                                overfitting_score = 0
                            if overfitting_score == d['overfitting_score_max']:
                                stopearly = True
                                print('stopping early!', flush=True)
                                break
                        i += 1

        # end tensorflow session
        print('closing tensorflow session...', flush=True)

    # ROLL BACK IF OVERFITTING
    if stopearly:
        print('rolling back...', flush=True)
        reporting_steps = reporting_steps[:i + 1]
        train_losses = train_losses[:i + 1]
        valid_losses = valid_losses[:i + 1]
        train_noisy_losses = train_noisy_losses[:i + 1]
        valid_noisy_losses = valid_noisy_losses[:i + 1]
#        selected_step = max([reporting_steps[i-d['overfitting_score_max']], d['startsavingstep']])
    else:
        print('completed all training steps...', flush=True)
#        selected_step = reporting_steps[-1]
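    # choose the checkpoint with the lowest validation loss, clamped between startsavingstep and the final reporting step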
    selected_step = min([
        max([reporting_steps[np.argmin(valid_losses)], d['startsavingstep']]),
        reporting_steps[-1]
    ])
    print('selected step:{0}...'.format(selected_step), flush=True)

    # SAVE RESULTS
    print('saving results...', flush=True)
    with open(
            '{0}/optimization_path_layer{1!s}_finetuning{2!s}.pickle'.format(
                d['output_path'], d['current_hidden_layer'],
                d['current_finetuning_run']), 'wb') as fw:
        pickle.dump(
            {
                'reporting_steps': reporting_steps,
                'valid_losses': valid_losses,
                'train_losses': train_losses,
                'valid_noisy_losses': valid_noisy_losses,
                'train_noisy_losses': train_noisy_losses
            }, fw)
    if d['current_dimensions'] == d['all_dimensions'] and (
            not d['use_finetuning'] or d['current_finetuning_run'] > 0):
        shutil.copyfile(
            '{0}/intermediate_variables_layer{1!s}_finetuning{2!s}_step{3!s}.pickle'
            .format(d['output_path'], d['current_hidden_layer'],
                    d['current_finetuning_run'], selected_step),
            '{0}/variables_layer{1!s}_finetuning{2!s}.pickle'.format(
                d['output_path'], d['current_hidden_layer'],
                d['current_finetuning_run']))
    else:
        shutil.move(
            '{0}/intermediate_variables_layer{1!s}_finetuning{2!s}_step{3!s}.pickle'
            .format(d['output_path'], d['current_hidden_layer'],
                    d['current_finetuning_run'], selected_step),
            '{0}/variables_layer{1!s}_finetuning{2!s}.pickle'.format(
                d['output_path'], d['current_hidden_layer'],
                d['current_finetuning_run']))
    with open(
            '{0}/variables_layer{1!s}_finetuning{2!s}.pickle'.format(
                d['output_path'], d['current_hidden_layer'],
                d['current_finetuning_run']), 'rb') as fr:
        W, Be, Bd = pickle.load(fr)[1:]  # global_step, W, bencode, bdecode
    recon = {}
    embed = {}
    error = {}
    embed_preactivation = {}
    for partition in partitions:
        recon[partition], embed[partition], error[
            partition] = sdae_apply_functions.encode_and_decode(
                dataset[partition],
                W,
                Be,
                Bd,
                activation_function['np'],
                d['current_apply_activation_to_embedding'],
                d['apply_activation_to_output'],
                return_embedding=True,
                return_reconstruction_error=True)
        embed_preactivation[partition] = sdae_apply_functions.encode(
            dataset[partition],
            W,
            Be,
            activation_function['np'],
            apply_activation_to_embedding=False)
        print('{0} reconstruction error: {1:1.3g}'.format(
            partition, error[partition]),
              flush=True)
        if d['current_dimensions'] == d['all_dimensions'] and (
                not d['use_finetuning'] or d['current_finetuning_run'] > 0):
            datasetIO.save_datamatrix(
                '{0}/{1}_embedding_layer{2!s}_finetuning{3!s}.pickle'.format(
                    d['output_path'], partition, d['current_hidden_layer'],
                    d['current_finetuning_run']), embed[partition])
            datasetIO.save_datamatrix(
                '{0}/{1}_embedding_layer{2!s}_finetuning{3!s}.txt.gz'.format(
                    d['output_path'], partition, d['current_hidden_layer'],
                    d['current_finetuning_run']), embed[partition])
            if d['current_apply_activation_to_embedding']:
                datasetIO.save_datamatrix(
                    '{0}/{1}_embedding_preactivation_layer{2!s}_finetuning{3!s}.pickle'
                    .format(d['output_path'], partition,
                            d['current_hidden_layer'],
                            d['current_finetuning_run']),
                    embed_preactivation[partition])
                datasetIO.save_datamatrix(
                    '{0}/{1}_embedding_preactivation_layer{2!s}_finetuning{3!s}.txt.gz'
                    .format(d['output_path'], partition,
                            d['current_hidden_layer'],
                            d['current_finetuning_run']),
                    embed_preactivation[partition])

    # PLOT LOSS
    print('plotting loss...', flush=True)
    fg, ax = plt.subplots(1, 1, figsize=(3.25, 2.25))
    ax.set_position([0.55 / 3.25, 0.45 / 2.25, 2.6 / 3.25, 1.7 / 2.25])
    ax.semilogx(reporting_steps,
                train_losses,
                ':r',
                linewidth=1,
                label='train')
    ax.semilogx(reporting_steps,
                valid_losses,
                '-g',
                linewidth=1,
                label='valid')
    ax.semilogx(reporting_steps,
                train_noisy_losses,
                '--b',
                linewidth=1,
                label='train,noisy')
    ax.semilogx(reporting_steps,
                valid_noisy_losses,
                '-.k',
                linewidth=1,
                label='valid,noisy')
    ax.legend(loc='best', fontsize=8)
    ax.set_ylabel('loss', fontsize=8)
    ax.set_xlabel('steps (selected step:{0!s})'.format(selected_step),
                  fontsize=8)
    ax.set_xlim(reporting_steps[0] - 1, reporting_steps[-1] + 1)
    # ax.set_ylim(0, 1)
    ax.tick_params(axis='both',
                   which='major',
                   left='on',
                   right='on',
                   bottom='on',
                   top='off',
                   labelleft='on',
                   labelright='off',
                   labelbottom='on',
                   labeltop='off',
                   labelsize=8)
    fg.savefig('{0}/optimization_path_layer{1!s}_finetuning{2!s}.png'.format(
        d['output_path'], d['current_hidden_layer'],
        d['current_finetuning_run']),
               transparent=True,
               pad_inches=0,
               dpi=600)
    plt.close()

    # PLOT RECONSTRUCTIONS
    print('plotting reconstructions...', flush=True)
    x_valid = dataset['valid'].matrix[:d['reconstruction_rows'] *
                                      d['reconstruction_cols'], :]
    xr_valid = recon['valid'].matrix[:d['reconstruction_rows'] *
                                     d['reconstruction_cols'], :]
    if x_valid.shape[1] > 1000:
        x_valid = x_valid[:, :1000]
        xr_valid = xr_valid[:, :1000]
    lb = np.append(x_valid, xr_valid, 1).min(1)
    ub = np.append(x_valid, xr_valid, 1).max(1)
    fg, axs = plt.subplots(d['reconstruction_rows'],
                           d['reconstruction_cols'],
                           figsize=(6.5, 3.25))
    for i, ax in enumerate(axs.reshape(-1)):
        ax.plot(x_valid[i, :],
                xr_valid[i, :],
                'ok',
                markersize=0.5,
                markeredgewidth=0)
        ax.set_ylim(lb[i], ub[i])
        ax.set_xlim(lb[i], ub[i])
        ax.tick_params(axis='both',
                       which='major',
                       left='off',
                       right='off',
                       bottom='off',
                       top='off',
                       labelleft='off',
                       labelright='off',
                       labelbottom='off',
                       labeltop='off',
                       pad=4)
        ax.set_frame_on(False)
        ax.axvline(lb[i], linewidth=1, color='k')
        ax.axvline(ub[i], linewidth=1, color='k')
        ax.axhline(lb[i], linewidth=1, color='k')
        ax.axhline(ub[i], linewidth=1, color='k')
    fg.savefig('{0}/reconstructions_layer{1!s}_finetuning{2!s}.png'.format(
        d['output_path'], d['current_hidden_layer'],
        d['current_finetuning_run']),
               transparent=True,
               pad_inches=0,
               dpi=1200)
    plt.close()

    # PLOT 2D EMBEDDING
    if d['current_dimensions'][-1] == 2 and (not d['use_finetuning'] or
                                             d['current_finetuning_run'] > 0):
        print('plotting 2d embedding...', flush=True)
        fg, ax = plt.subplots(1, 1, figsize=(6.5, 6.5))
        ax.set_position([0.15 / 6.5, 0.15 / 6.5, 6.2 / 6.5, 6.2 / 6.5])
        ax.plot(embed['train'].matrix[:, 0],
                embed['train'].matrix[:, 1],
                'ok',
                markersize=2,
                markeredgewidth=0,
                alpha=0.5,
                zorder=0)
        ax.plot(embed['valid'].matrix[:, 0],
                embed['valid'].matrix[:, 1],
                'or',
                markersize=2,
                markeredgewidth=0,
                alpha=1.0,
                zorder=1)
        ax.tick_params(axis='both',
                       which='major',
                       bottom='off',
                       top='off',
                       labelbottom='off',
                       labeltop='off',
                       left='off',
                       right='off',
                       labelleft='off',
                       labelright='off',
                       pad=4)
        ax.set_frame_on(False)
        fg.savefig('{0}/embedding_layer{1!s}_finetuning{2!s}.png'.format(
            d['output_path'], d['current_hidden_layer'],
            d['current_finetuning_run']),
                   transparent=True,
                   pad_inches=0,
                   dpi=600)
        plt.close()

        if d['current_apply_activation_to_embedding']:
            fg, ax = plt.subplots(1, 1, figsize=(6.5, 6.5))
            ax.set_position([0.15 / 6.5, 0.15 / 6.5, 6.2 / 6.5, 6.2 / 6.5])
            ax.plot(embed_preactivation['train'].matrix[:, 0],
                    embed_preactivation['train'].matrix[:, 1],
                    'ok',
                    markersize=2,
                    markeredgewidth=0,
                    alpha=0.5,
                    zorder=0)
            ax.plot(embed_preactivation['valid'].matrix[:, 0],
                    embed_preactivation['valid'].matrix[:, 1],
                    'or',
                    markersize=2,
                    markeredgewidth=0,
                    alpha=1.0,
                    zorder=1)
            ax.tick_params(axis='both',
                           which='major',
                           bottom='off',
                           top='off',
                           labelbottom='off',
                           labeltop='off',
                           left='off',
                           right='off',
                           labelleft='off',
                           labelright='off',
                           pad=4)
            ax.set_frame_on(False)
            fg.savefig(
                '{0}/embedding_preactivation_layer{1!s}_finetuning{2!s}.png'.
                format(d['output_path'], d['current_hidden_layer'],
                       d['current_finetuning_run']),
                transparent=True,
                pad_inches=0,
                dpi=600)
            plt.close()

    print('done training phase.', flush=True)

    return d['current_hidden_layer'], d['current_finetuning_run'], d[
        'current_epochs']
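# The helpers sdae_apply_functions.encode_and_decode and sdae_apply_functions.encode are
# not shown in this snippet. Below is a minimal NumPy sketch of what such a forward pass
# might look like, assuming tied weights (the same W list is reused, transposed, for
# decoding) and that W, bencode, and bdecode are lists of per-layer weight matrices and
# bias vectors; the name, signature, and bias ordering are illustrative, not the actual
# sdae_apply_functions API.
import numpy as np

def sketch_encode_and_decode(X, W, bencode, bdecode, activation=np.tanh,
                             apply_activation_to_embedding=True,
                             apply_activation_to_output=False):
    # encode: input -> bottleneck
    h = X
    for i, (Wi, bi) in enumerate(zip(W, bencode)):
        h = h.dot(Wi) + bi
        if i < len(W) - 1 or apply_activation_to_embedding:
            h = activation(h)
    embedding = h
    # decode: bottleneck -> reconstruction, reusing the encoder weights transposed, in reverse order
    hhat = embedding
    for i, (Wi, bi) in enumerate(zip(W[::-1], bdecode)):
        hhat = hhat.dot(Wi.T) + bi
        if i < len(W) - 1 or apply_activation_to_output:
            hhat = activation(hhat)
    reconstruction = hhat
    reconstruction_error = np.mean((X - reconstruction)**2)
    return reconstruction, embedding, reconstruction_error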
def main():

    # load class examples
    print('loading class examples...', flush=True)
    class_examples_folder = 'targets/pharmaprojects'
    class_examples = {
        'positive':
        datasetIO.load_examples(
            '{0}/positive.txt'.format(class_examples_folder)),
        'negative':
        datasetIO.load_examples(
            '{0}/negative.txt'.format(class_examples_folder)),
        'unknown':
        datasetIO.load_examples(
            '{0}/unknown.txt'.format(class_examples_folder))
    }

    # load dataset info
    print('loading dataset info...', flush=True)
    dataset_info_path = 'datasets/harmonizome/dataset_info.txt'
    dataset_infos = datasetIO.load_datasetinfo(dataset_info_path)

    # specify results folder
    print('specifying results folder...', flush=True)
    results_folder = 'datasets/candidate_features'
    if not os.path.exists(results_folder):
        os.mkdir(results_folder)

    # iterate over datasets
    print('iterating over datasets...', flush=True)
    for dataset_info in dataset_infos:

        #        # just work with hpatissuesmrna for testing/debugging the pipeline
        #        if dataset_info['abbreviation'] != 'hpatissuesmrna_cleaned':
        #            print('skipping {0}. not in testing set...'.format(dataset_info['abbreviation']), flush=True)
        #            continue

        # check if another python instance is already working on this dataset
        if os.path.exists('{0}/{1}_in_progress.txt'.format(
                results_folder, dataset_info['abbreviation'])):
            print('skipping {0}. already in progress...'.format(
                dataset_info['abbreviation']),
                  flush=True)
            continue

        # log start of processing
        with open('{0}/{1}_in_progress.txt'.format(
                results_folder, dataset_info['abbreviation']),
                  mode='wt',
                  encoding='utf-8',
                  errors='surrogateescape') as fw:
            print('working on {0}...'.format(dataset_info['abbreviation']),
                  flush=True)
            fw.write('working on {0}...'.format(dataset_info['abbreviation']))

        # load dataset
        print('loading dataset...', flush=True)
        gene_atb = datasetIO.load_datamatrix(datasetpath=dataset_info['path'])
        dataset_info['original_genes'] = gene_atb.shape[0]
        dataset_info['original_features'] = gene_atb.shape[1]

        # decide feature normalization
        print('deciding feature normalization...', flush=True)
        if ('standardized' in dataset_info['abbreviation']
                or 'cleaned' in dataset_info['abbreviation']
            ) and (gene_atb.matrix == 0).sum() / gene_atb.size <= 0.5:
            # dataset is many-valued and filled-in
            print('    dataset is many-valued and filled-in...', flush=True)
            print('    z-scoring features...', flush=True)
            dataset_info['feature_normalization'] = 'z-score'
            mnv = np.nanmean(gene_atb.matrix, axis=0, keepdims=True)
            sdv = np.nanstd(gene_atb.matrix, axis=0, keepdims=True)
            gene_atb.matrix = (gene_atb.matrix - mnv) / sdv
            gene_atb.columnmeta['mean'] = mnv.reshape(-1)
            gene_atb.columnmeta['stdv'] = sdv.reshape(-1)
        else:
            # dataset is binary or tertiary or sparse
            print('    dataset is binary, tertiary, or sparse...', flush=True)
            print('    no feature normalization...', flush=True)
            dataset_info['feature_normalization'] = 'none'

        # assign class labels to genes
        print('assigning class labels to genes...', flush=True)
        gene_atb.rowmeta['class'] = np.full(gene_atb.shape[0],
                                            'unknown',
                                            dtype='object')
        gene_atb.rowmeta['class'][np.in1d(
            gene_atb.rowlabels, list(class_examples['positive']))] = 'positive'
        gene_atb.rowmeta['class'][np.in1d(
            gene_atb.rowlabels, list(class_examples['negative']))] = 'negative'

        # add dataset mean and stdv as features
        print('adding dataset mean and stdv as features...', flush=True)
        gene_stat = dataclasses.datamatrix(
            rowname=gene_atb.rowname,
            rowlabels=gene_atb.rowlabels.copy(),
            rowmeta=copy.deepcopy(gene_atb.rowmeta),
            columnname=gene_atb.columnname,
            columnlabels=np.array(['mean', 'stdv'], dtype='object'),
            columnmeta={},
            matrixname=gene_atb.matrixname,
            matrix=np.append(gene_atb.matrix.mean(1, keepdims=True),
                             gene_atb.matrix.std(1, keepdims=True), 1))
        gene_atb.append(gene_stat, 1)
        gene_atb.columnmeta['isrowstat'] = np.in1d(gene_atb.columnlabels,
                                                   gene_stat.columnlabels)
        del gene_stat

        # identify features with little information about labelled examples
        print(
            'identifying features with little information about labelled examples...',
            flush=True)
        isunknown = gene_atb.rowmeta['class'] == 'unknown'
        tobediscarded = np.logical_or.reduce(
            ((gene_atb.matrix[~isunknown, :] != 0).sum(axis=0) < 3,
             (gene_atb.matrix[~isunknown, :] != 1).sum(axis=0) < 3,
             np.isnan(gene_atb.matrix[~isunknown, :]).any(axis=0)))
        if tobediscarded.any():
            # discard features
            print('    discarding {0!s} features. {1!s} features remaining...'.
                  format(tobediscarded.sum(), (~tobediscarded).sum()),
                  flush=True)
            gene_atb.discard(tobediscarded, axis=1)
        else:
            # keep all features
            print('    no features to discard. {0!s} features remaining...'.
                  format(gene_atb.shape[1]),
                  flush=True)

        # save if dataset has content
        print('saving if dataset has content...', flush=True)
        if gene_atb.shape[0] == 0 or gene_atb.shape[1] == 0:
            # no content
            print('    nothing to save...', flush=True)
        else:
            # save candidate features
            print('    saving {0!s} candidate features...'.format(
                gene_atb.shape[1]),
                  flush=True)
            dataset_info['path'] = '{0}/{1}.txt.gz'.format(
                results_folder, dataset_info['abbreviation'])
            dataset_info['candidate_genes'] = gene_atb.shape[0]
            dataset_info['candidate_features'] = gene_atb.shape[1]
            dataset_info['positive_examples'] = (
                gene_atb.rowmeta['class'] == 'positive').sum()
            dataset_info['negative_examples'] = (
                gene_atb.rowmeta['class'] == 'negative').sum()
            dataset_info['unknown_examples'] = (
                gene_atb.rowmeta['class'] == 'unknown').sum()
            datasetIO.save_datamatrix(dataset_info['path'], gene_atb)
            datasetIO.append_datasetinfo(
                '{0}/dataset_info.txt'.format(results_folder), dataset_info)

    print('done.', flush=True)
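# A minimal sketch of reading the saved candidate-feature outputs back in a later step;
# indexing the first entry is purely illustrative:
saved_dataset_infos = datasetIO.load_datasetinfo('datasets/candidate_features/dataset_info.txt')
first_gene_atb = datasetIO.load_datamatrix(datasetpath=saved_dataset_infos[0]['path'])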
Example #6
0
            columnidx = columnlabel_idx[columnlabel]
            snp_genome.columnmeta[metalabel][columnidx] = value
    uvals, counts = np.unique(snp_genome.columnmeta[metalabel],
                              return_counts=True)
    max_num_uvals = 25
    if uvals.size > max_num_uvals:
        si = np.argsort(counts)[::-1]
        low_freq_uvals = uvals[si[max_num_uvals:]]
        snp_genome.columnmeta[metalabel][np.in1d(
            snp_genome.columnmeta[metalabel], low_freq_uvals)] = 'NA'

# save the data
print('saving prepared data...', flush=True)
snp_genome.matrixname += '_prepared'
datasetIO.save_datamatrix(
    '../../original_data/1000genomes/snp_genome_1000genomes-phased-MHC_prepared.pickle',
    snp_genome)
datasetIO.save_datamatrix(
    '../../original_data/1000genomes/snp_genome_1000genomes-phased-MHC_prepared.txt.gz',
    snp_genome)
savefolder = '../../input_data/1000genomes_genomes'
if not os.path.exists(savefolder):
    os.makedirs(savefolder)
datasetIO.save_splitdata(savefolder, snp_genome)
shutil.copyfile(
    '../../original_data/1000genomes/snp_genome_1000genomes-phased-MHC_prepared.pickle',
    '{0}/datamatrix.pickle'.format(savefolder))
shutil.copyfile(
    '../../original_data/1000genomes/snp_genome_1000genomes-phased-MHC_prepared.txt.gz',
    '{0}/datamatrix.txt.gz'.format(savefolder))
# (reconstructed call; the opening of this snippet was truncated in the source)
gene_atb = datasetIO.load_datamatrix(
    '../../original_data/phenodigm/geneid_meshid_datamatrix_trimmed.csv.gz',
    delimiter=',',
    getmetadata=False)
gene_atb.rowname = 'entrez_id'
gene_atb.columnname = 'mesh_id'
gene_atb.matrixname = 'gene_disease_associations_from_phenodigm-qtq'

# THRESHOLD the data
# what do the values mean?
# values have a strange distribution. 50% are less than 0.2, 97% are less than 0.5. min value is 0.08. max value is 1.15.
print('thresholding data...', flush=True)
gene_atb.matrix = np.float64(gene_atb.matrix > 0)
gene_atb.matrixname += '_thresholded'
print('saving thresholded data...', flush=True)
datasetIO.save_datamatrix(
    '../../original_data/phenodigm/gene_disease_phenodigm-qtq_trimmed_thresholded.pickle',
    gene_atb)
datasetIO.save_datamatrix(
    '../../original_data/phenodigm/gene_disease_phenodigm-qtq_trimmed_thresholded.txt.gz',
    gene_atb)

# shuffle the data
print('shuffling data...', flush=True)
gene_atb.reorder(np.random.permutation(gene_atb.shape[0]), 0)
gene_atb.reorder(np.random.permutation(gene_atb.shape[1]), 1)
print(gene_atb)

# add hgnc metadata
print('adding hgnc metadata...', flush=True)
hgncmetadata = mapper.annotate_genes(
    field='entrez_id',
Example #8
0
def create_and_save_partitions(dataset,
                               study_name,
                               test_fraction=0.1,
                               valid_fraction=0.1,
                               save_text_files=False):

    # determine dataset orientation
    orientation = 'skinny' if dataset.shape[0] > dataset.shape[1] else 'fat'

    # partition the data
    tobepopped = np.random.permutation(dataset.shape[0]) < round(
        max([test_fraction * dataset.shape[0], 2.0]))
    dataset_test = dataset.pop(tobepopped, 0)
    print('    TEST', flush=True)
    print(dataset_test)
    tobepopped = np.random.permutation(dataset.shape[0]) < round(
        max([valid_fraction * dataset.shape[0], 2.0]))
    dataset_valid = dataset.pop(tobepopped, 0)
    print('    VALID', flush=True)
    print(dataset_valid)
    dataset_train = dataset
    print('    TRAIN', flush=True)
    print(dataset_train)

    # save data partitions
    savefolder = '../partitioned_data/{0}/{1}'.format(study_name, orientation)
    print('    SAVING PARTITIONS TO {0}'.format(savefolder), flush=True)
    os.makedirs(savefolder)
    datasetIO.save_datamatrix('{0}/test.pickle'.format(savefolder),
                              dataset_test)
    datasetIO.save_datamatrix('{0}/valid.pickle'.format(savefolder),
                              dataset_valid)
    datasetIO.save_datamatrix('{0}/train.pickle'.format(savefolder),
                              dataset_train)
    if save_text_files:
        datasetIO.save_datamatrix('{0}/test.txt.gz'.format(savefolder),
                                  dataset_test)
        datasetIO.save_datamatrix('{0}/valid.txt.gz'.format(savefolder),
                                  dataset_valid)
        datasetIO.save_datamatrix('{0}/train.txt.gz'.format(savefolder),
                                  dataset_train)
def create_and_save_partitions(dataset, study_name, test_fraction=0.1, valid_fraction=0.1, save_text_files=False):
    
    # determine dataset orientation
    orientation = 'skinny' if dataset.shape[0] > dataset.shape[1] else 'fat'
    
    # partition the data
    tobepopped = np.random.permutation(dataset.shape[0]) < round(max([test_fraction*dataset.shape[0], 2.0]))
    dataset_test = dataset.pop(tobepopped, 0)
    print('    TEST', flush=True)
    print(dataset_test)
    tobepopped = np.random.permutation(dataset.shape[0]) < round(max([valid_fraction*dataset.shape[0], 2.0]))
    dataset_valid = dataset.pop(tobepopped, 0)
    print('    VALID', flush=True)
    print(dataset_valid)
    dataset_train = dataset
    print('    TRAIN', flush=True)
    print(dataset_train)
    
    # save data partitions
    print('    SAVING PARTITIONS TO data/prepared_data/{0}/{1}'.format(study_name, orientation), flush=True)
    if not os.path.exists('data/prepared_data/{0}/{1}'.format(study_name, orientation)):
        os.makedirs('data/prepared_data/{0}/{1}'.format(study_name, orientation))
    if not os.path.exists('results/autoencoder/{0}/{1}'.format(study_name, orientation)):
        os.makedirs('results/autoencoder/{0}/{1}'.format(study_name, orientation)) # anticipate needing directories for model results
    datasetIO.save_datamatrix('data/prepared_data/{0}/{1}/test.pickle'.format(study_name, orientation), dataset_test)
    datasetIO.save_datamatrix('data/prepared_data/{0}/{1}/valid.pickle'.format(study_name, orientation), dataset_valid)
    datasetIO.save_datamatrix('data/prepared_data/{0}/{1}/train.pickle'.format(study_name, orientation), dataset_train)
    if save_text_files:
        datasetIO.save_datamatrix('data/prepared_data/{0}/{1}/test.txt.gz'.format(study_name, orientation), dataset_test)
        datasetIO.save_datamatrix('data/prepared_data/{0}/{1}/valid.txt.gz'.format(study_name, orientation), dataset_valid)
        datasetIO.save_datamatrix('data/prepared_data/{0}/{1}/train.txt.gz'.format(study_name, orientation), dataset_train)
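# A hypothetical usage sketch of the partitioning helper above; the pickle path is
# illustrative and assumes a datasetIO-style datamatrix was already prepared for the study:
dataset = datasetIO.load_datamatrix('data/original_data/mystudy/datamatrix.pickle')
create_and_save_partitions(dataset, study_name='mystudy', test_fraction=0.1,
                           valid_fraction=0.1, save_text_files=True)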
def main(validation_rep=0, validation_fold=0):

    # load target clusters
    print('loading target cluster assignments...', flush=True)
    target_cluster_path = 'targets/clusters/gene_cluster_byfamily.pickle'
    gene_cluster = datasetIO.load_clusterassignments(target_cluster_path)

    # load dataset info
    print('loading dataset info...', flush=True)
    dataset_info_path = 'datasets/nonredundant_features/dataset_info.txt'
    dataset_infos = datasetIO.load_datasetinfo(dataset_info_path)

    # load validation examples
    print('loading validation examples...', flush=True)
    validation_examples_path = 'targets/validation_examples/rep{0!s}_fold{1!s}.txt'.format(
        validation_rep, validation_fold)
    with open(validation_examples_path,
              mode='rt',
              encoding='utf-8',
              errors='surrogateescape') as fr:
        validation_examples = fr.read().split('\n')

    # specify results folder
    print('specifying results folder...', flush=True)
    results_folder = 'datasets/generalizable_features/rep{0!s}_fold{1!s}'.format(
        validation_rep, validation_fold)
    results_folder_parts = results_folder.split('/')
    for i in range(len(results_folder_parts)):
        results_folder_part = '/'.join(results_folder_parts[:i + 1])
        if not os.path.exists(results_folder_part):
            os.mkdir(results_folder_part)

    # iterate over datasets
    print('iterating over datasets...', flush=True)
    for dataset_info in dataset_infos:

        #        # just work with hpatissuesmrna for testing/debugging the pipeline
        #        if dataset_info['abbreviation'] != 'hpatissuesmrna_cleaned':
        #            print('skipping {0}. not in testing set...'.format(dataset_info['abbreviation']), flush=True)
        #            continue

        # check if another python instance is already working on this dataset
        if os.path.exists('{0}/{1}_in_progress.txt'.format(
                results_folder, dataset_info['abbreviation'])):
            print('skipping {0}. already in progress...'.format(
                dataset_info['abbreviation']),
                  flush=True)
            continue

        # log start of processing
        with open('{0}/{1}_in_progress.txt'.format(
                results_folder, dataset_info['abbreviation']),
                  mode='wt',
                  encoding='utf-8',
                  errors='surrogateescape') as fw:
            print('working on {0}...'.format(dataset_info['abbreviation']),
                  flush=True)
            fw.write('working on {0}...'.format(dataset_info['abbreviation']))

        # load dataset
        print('loading dataset...', flush=True)
        gene_atb = datasetIO.load_datamatrix(datasetpath=dataset_info['path'])

        # specify feature generalizability test parameters
        print('specifying feature generalizability test parameters...',
              flush=True)
        dataset_info[
            'feature_generalizability_test_function'] = featureselection.univariate_grouppreserved_permtest
        dataset_info[
            'feature_generalizability_test_permutations'] = 10000  # 100000
        dataset_info[
            'feature_generalizability_test_targetclusterpath'] = target_cluster_path
        dataset_info[
            'multiple_hypothesis_testing_correction_function'] = featureselection.multiple_hypothesis_testing_correction
        dataset_info[
            'multiple_hypothesis_testing_correction_method'] = 'fdr_by'
        dataset_info['multiple_hypothesis_testing_correction_threshold'] = 0.05
        print('   feature_generalizability_test_function: {0}'.format(
            dataset_info['feature_generalizability_test_function']),
              flush=True)
        print('   feature_generalizability_test_permutations: {0!s}'.format(
            dataset_info['feature_generalizability_test_permutations']),
              flush=True)
        print('   feature_generalizability_test_targetclusterpath: {0}'.format(
            dataset_info['feature_generalizability_test_targetclusterpath']),
              flush=True)
        print('   multiple_hypothesis_testing_correction_function: {0}'.format(
            dataset_info['multiple_hypothesis_testing_correction_function']),
              flush=True)
        print('   multiple_hypothesis_testing_correction_method: {0}'.format(
            dataset_info['multiple_hypothesis_testing_correction_method']),
              flush=True)
        print('   multiple_hypothesis_testing_correction_threshold: {0!s}'.
              format(dataset_info[
                  'multiple_hypothesis_testing_correction_threshold']),
              flush=True)

        # exclude validation and unlabeled examples from significance calculation
        print(
            'excluding validation and unlabeled examples from significance calculation...',
            flush=True)
        isvalidation = np.in1d(gene_atb.rowlabels, validation_examples)
        isunknown = gene_atb.rowmeta['class'] == 'unknown'
        istraintest = ~np.logical_or(isvalidation, isunknown)

        # compute feature generalizability with multiple hypothesis testing correction
        print(
            'computing feature generalizability with multiple hypothesis testing correction...',
            flush=True)
        gene_atb.rowmeta['cluster'] = np.array(
            [gene_cluster[g] if g in gene_cluster else -1 for g in gene_atb.rowlabels],
            dtype='int64')
        gene_atb.columnmeta[
            'generalizability_test_statistic_values'], gene_atb.columnmeta[
                'generalizability_pvalues'] = dataset_info[
                    'feature_generalizability_test_function'](
                        X=gene_atb.matrix[istraintest, :],
                        Y=(gene_atb.rowmeta['class'][istraintest] == 'positive'
                           ),
                        G=gene_atb.rowmeta['cluster'][istraintest],
                        numperm=dataset_info[
                            'feature_generalizability_test_permutations'])
        gene_atb.columnmeta['is_generalizable'], gene_atb.columnmeta[
            'generalizability_pvalues_corrected'] = dataset_info[
                'multiple_hypothesis_testing_correction_function'](
                    gene_atb.columnmeta['generalizability_pvalues'],
                    alpha=dataset_info[
                        'multiple_hypothesis_testing_correction_threshold'],
                    method=dataset_info[
                        'multiple_hypothesis_testing_correction_method'])
        gene_atb.columnmeta['generalizability_correlation_sign'] = np.sign(
            gene_atb.columnmeta['generalizability_test_statistic_values'])
        if (gene_atb.columnmeta['generalizability_pvalues'] <
                1 / dataset_info['feature_generalizability_test_permutations']
            ).any():
            print(
                '    warning: not enough permutations to establish all pvalues...',
                flush=True)
        tobediscarded = np.logical_or(
            np.isnan(gene_atb.columnmeta['generalizability_pvalues']),
            np.isnan(
                gene_atb.columnmeta['generalizability_pvalues_corrected']))
        if tobediscarded.any():
            gene_atb.discard(tobediscarded, axis=1)

        # prioritize features
        print('prioritizing features...', flush=True)
        sortedindices = np.argsort(
            gene_atb.columnmeta['generalizability_pvalues_corrected'])
        gene_atb.reorder(sortedindices, axis=1)

        # save feature generalizability info
        print('saving feature generalizability info...', flush=True)
        with open('{0}/{1}_feature_generalizability_info.txt'.format(
                results_folder, dataset_info['abbreviation']),
                  mode='wt',
                  encoding='utf-8',
                  errors='surrogateescape') as fw:
            writelist = [
                'dataset', 'abbreviation', 'feature',
                'generalizability_test_statistic', 'generalizability_pvalue',
                'generalizability_pvalue_corrected', 'is_generalizable',
                'generalizability_correlation_sign', 'preferred_rowstat',
                'similar_features'
            ]
            fw.write('\t'.join(writelist) + '\n')
            for j, feature in enumerate(gene_atb.columnlabels):
                writelist = [
                    dataset_info['name'], dataset_info['abbreviation'],
                    feature, '{0:1.5g}'.format(gene_atb.columnmeta[
                        'generalizability_test_statistic_values'][j]),
                    '{0:1.5g}'.format(
                        gene_atb.columnmeta['generalizability_pvalues'][j]),
                    '{0:1.5g}'.format(
                        gene_atb.
                        columnmeta['generalizability_pvalues_corrected'][j]),
                    '{0:1.5g}'.format(
                        gene_atb.columnmeta['is_generalizable'][j]),
                    '{0:1.5g}'.format(
                        gene_atb.
                        columnmeta['generalizability_correlation_sign'][j]),
                    gene_atb.columnmeta['preferred_rowstat'][j],
                    gene_atb.columnmeta['similar_features'][j]
                ]
                fw.write('\t'.join(writelist) + '\n')

        # discard features that are not generalizable
        print('discarding features that are not generalizable...', flush=True)
        tobediscarded = ~gene_atb.columnmeta['is_generalizable']
        if tobediscarded.any():
            # discard features
            print('    discarding {0!s} features. {1!s} features remaining...'.
                  format(tobediscarded.sum(), (~tobediscarded).sum()),
                  flush=True)
            gene_atb.discard(tobediscarded, axis=1)
        else:
            # keep all features
            print('    no features to discard. {0!s} features remaining...'.
                  format(gene_atb.shape[1]),
                  flush=True)

        # save if dataset has content
        print('saving if dataset has content...', flush=True)
        if gene_atb.shape[0] == 0 or gene_atb.shape[1] == 0:
            # no content
            print('    nothing to save...', flush=True)
        else:
            # save generalizable features
            print('    saving {0!s} generalizable features...'.format(
                gene_atb.shape[1]),
                  flush=True)
            dataset_info['path'] = '{0}/{1}.txt.gz'.format(
                results_folder, dataset_info['abbreviation'])
            dataset_info['generalizable_genes'] = gene_atb.shape[0]
            dataset_info['generalizable_features'] = gene_atb.shape[1]
            dataset_info[
                'feature_generalizability_test_function'] = 'featureselection.univariate_grouppreserved_permtest'
            dataset_info[
                'multiple_hypothesis_testing_correction_function'] = 'featureselection.multiple_hypothesis_testing_correction'
            datasetIO.save_datamatrix(dataset_info['path'], gene_atb)
            datasetIO.append_datasetinfo(
                '{0}/dataset_info.txt'.format(results_folder), dataset_info)

    print('done.', flush=True)
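# The correction above uses method 'fdr_by' (Benjamini-Yekutieli). A minimal sketch of an
# equivalent correction via statsmodels is shown below; this is an assumption about what
# featureselection.multiple_hypothesis_testing_correction wraps, not its actual code.
from statsmodels.stats.multitest import multipletests

def sketch_multiple_hypothesis_testing_correction(pvalues, alpha=0.05, method='fdr_by'):
    # returns a boolean array marking significant features and the corrected p-values,
    # matching the (is_generalizable, generalizability_pvalues_corrected) usage above
    is_significant, pvalues_corrected, _, _ = multipletests(pvalues, alpha=alpha, method=method)
    return is_significant, pvalues_corrected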
os.makedirs(target_path)
os.makedirs(target_path.replace('data/prepared_data', 'results/autoencoder'))

train = datasetIO.load_datamatrix('{0}/{1}.pickle'.format(
    source_path, 'train'))
tobediscarded = train.rowmeta['general_tissue'] == '-666'
train.discard(tobediscarded, 0)
Y = train.matrix.copy()
l = train.rowmeta['general_tissue'].copy()
L = np.unique(l)
X = np.float64(l.reshape(-1, 1) == L.reshape(1, -1))
X = np.append(X, np.ones((X.shape[0], 1), dtype='float64'), 1)
B, _, rank, singular_values = np.linalg.lstsq(X, Y, rcond=None)
Ypred = X.dot(B)
train.matrix = Y - Ypred
datasetIO.save_datamatrix('{0}/{1}.pickle'.format(target_path, 'train'), train)

valid = datasetIO.load_datamatrix('{0}/{1}.pickle'.format(
    source_path, 'valid'))
tobediscarded = valid.rowmeta['general_tissue'] == '-666'
valid.discard(tobediscarded, 0)
Y = valid.matrix.copy()
l = valid.rowmeta['general_tissue'].copy()
X = np.float64(l.reshape(-1, 1) == L.reshape(1, -1))
X = np.append(X, np.ones((X.shape[0], 1), dtype='float64'), 1)
Ypred = X.dot(B)
valid.matrix = Y - Ypred
datasetIO.save_datamatrix('{0}/{1}.pickle'.format(target_path, 'valid'), valid)

test = datasetIO.load_datamatrix('{0}/{1}.pickle'.format(source_path, 'test'))
tobediscarded = test.rowmeta['general_tissue'] == '-666'
    matrixname='clinical_variables_for_tumor_samples',
    matrix=np.concatenate(
        tuple(dataset.rowmeta[cv].reshape(-1, 1) for cv in clinical_variables),
        1).astype('float64'))
print(clinical_dataset, flush=True)

# append clinical variables
print('appending clinical variables...', flush=True)
dataset.append(clinical_dataset, 1)
dataset.matrixname += '_and_clinical_variables'
print(dataset, flush=True)

# save the data
print('saving data with clinical variables...', flush=True)
datasetIO.save_datamatrix(
    '../../original_data/pratfelip_symlnk/patient_gene_pratfelip_nanostring_prepared_plus_clinical.pickle',
    dataset)
datasetIO.save_datamatrix(
    '../../original_data/pratfelip_symlnk/patient_gene_pratfelip_nanostring_prepared_plus_clinical.txt.gz',
    dataset)
savefolder = '../../input_data/pratfelip_transposed_plus_clinical'
if not os.path.exists(savefolder):
    os.makedirs(savefolder)
datasetIO.save_splitdata(savefolder, dataset)
shutil.copyfile(
    '../../original_data/pratfelip_symlnk/patient_gene_pratfelip_nanostring_prepared_plus_clinical.pickle',
    '{0}/datamatrix.pickle'.format(savefolder))
shutil.copyfile(
    '../../original_data/pratfelip_symlnk/patient_gene_pratfelip_nanostring_prepared_plus_clinical.txt.gz',
    '{0}/datamatrix.txt.gz'.format(savefolder))
def main(dictionaries, year, datestamp, min_score, universe, n_prior,
         min_count, association_statistic, reference_datamatrix_path,
         save_predictions):

    print('begin benchmark_term-term_stats_from_termite.py')

    print('dictionaries: {0}, {1}'.format(dictionaries[0], dictionaries[1]))
    print('year: {0}'.format(year))
    print('datestamp: {0}'.format(datestamp))
    print('min_score: {0!s}'.format(min_score))
    print('universe: {0}'.format(universe))
    print('n_prior: {0!s}'.format(n_prior))
    print('min_count: {0!s}'.format(min_count))
    print('association_statistic: {0}'.format(association_statistic))
    print('reference_datamatrix_path: {0}'.format(reference_datamatrix_path))
    print('save_predictions: {0!s}'.format(save_predictions))

    # create figures folder
    print('creating figures folder...')
    figures_folder = 'benchmark_figures'
    if not os.path.exists(figures_folder):
        os.mkdir(figures_folder)

    # load counts datamatrix
    # this file is generated by count_term-term_pmids_from_termite.py
    print('loading counts datamatrix...')
    # dictionary options: 'HUCELL', 'ANAT', 'INDICATION', 'HUCELLANAT', 'HUCELLANATINDICATION'
    row_dictionary = dictionaries[0]
    column_dictionary = dictionaries[1]
    counts_datamatrix_path = '{0}_{1}_datamatrix_pmidcounts_year_{2}_datestamp_{3}_minscore_{4!s}.pickle'.format(
        row_dictionary, column_dictionary, year, datestamp, min_score)
    term_term_counts_all = datasetIO.load_datamatrix(counts_datamatrix_path)
    print('counts_datamatrix_path: {0}'.format(counts_datamatrix_path))
    print(term_term_counts_all)

    # load association statistic datamatrix
    # this file is generated by calc_term-term_stats_from_termite.py
    print('loading association statistic datamatrix...')
    stats_datamatrix_path = '{0}_{1}_datamatrix_{2}_yr_{3}_ds_{4}_ms_{5!s}_uv_{6}_np_{7!s}_mc_{8!s}.pickle'.format(
        row_dictionary, column_dictionary, association_statistic, year,
        datestamp, min_score, universe, n_prior, min_count)
    term_term_stats_all = datasetIO.load_datamatrix(stats_datamatrix_path)
    print('stats_datamatrix_path: {0}'.format(stats_datamatrix_path))
    print(term_term_stats_all)

    # load reference datamatrix of positive and negative examples
    print('loading reference datamatrix of positive and negative examples...')
    term_term_ref = datasetIO.load_datamatrix(reference_datamatrix_path)
    print('reference_datamatrix_path: {0}'.format(reference_datamatrix_path))
    print(term_term_ref)

    # align datamatrices to reference
    print('aligning datamatrices to reference...')
    term_term_counts = term_term_counts_all.tolabels(
        rowlabels=term_term_ref.rowlabels.copy(),
        columnlabels=term_term_ref.columnlabels.copy())
    term_term_stats = term_term_stats_all.tolabels(
        rowlabels=term_term_ref.rowlabels.copy(),
        columnlabels=term_term_ref.columnlabels.copy())

    # find term-term pairs with sufficient counts
    print('finding term-term pairs with sufficient counts...')
    I, J = (term_term_counts.matrix >= min_count).nonzero()
    num_sufficient = I.size
    print('term-term pairs with at least {0!s} counts: {1!s}'.format(
        min_count, num_sufficient))

    # find row_term_dicts and column_term_dicts
    print('finding row_term_dicts and column_term_dicts')
    row_term_dicts = np.unique(term_term_stats.rowmeta['term_dict'])
    column_term_dicts = np.unique(term_term_stats.columnmeta['term_dict'])

    # calculate performance on reference examples and write to dataframe
    print(
        'calculating performance on reference examples and writing to dataframe...'
    )
    dataframe_path = 'benchmark_term-term_stats_dataframe.txt'
    metaheaders = [
        'row_dictionary', 'column_dictionary', 'year', 'datestamp',
        'min_score', 'universe', 'n_prior', 'min_count',
        'association_statistic', 'reference_datamatrix_path', 'row_term_dict',
        'column_term_dict'
    ]
    statheaders = [
        'tp', 'fn', 'tn', 'fp', 'ap', 'an', 'pp', 'pn', 'n', 'auroc', 'auprc',
        'tpr', 'fnr', 'tnr', 'fpr', 'ppv', 'fdr', 'npv', 'fomr', 'acc', 'mcr',
        'prev', 'plr', 'nlr', 'dor', 'drr', 'darr', 'mrr', 'marr', 'f1', 'mcc',
        'cos', 'fnlp', 'lrr', 'lrr_se', 'lrr_lb95', 'lrr_ub95', 'drr_lb95',
        'drr_ub95', 'lor', 'lor_se', 'lor_lb95', 'lor_ub95', 'dor_lb95',
        'dor_ub95', 'mi', 'nmi', 'iqr', 'min_value_association_statistic'
    ]
    with open(dataframe_path,
              mode='at',
              encoding='utf-8',
              errors='surrogateescape') as fw:
        writelist = metaheaders + statheaders
        fw.write('\t'.join(writelist) + '\n')
        for row_term_dict in row_term_dicts:
            row_hidxs = (term_term_stats.rowmeta['term_dict'] == row_term_dict
                         ).nonzero()[0]
            for column_term_dict in column_term_dicts:
                print('working on {0}-{1} associations...'.format(
                    row_term_dict, column_term_dict))

                # get scores and labels
                print('getting scores and labels...')
                column_hidxs = (term_term_stats.columnmeta['term_dict'] ==
                                column_term_dict).nonzero()[0]
                hit = np.logical_and(np.in1d(I, row_hidxs),
                                     np.in1d(J, column_hidxs))
                Y = term_term_ref.matrix[I[hit], J[hit]]
                X = (term_term_stats.matrix[I[hit], J[hit]]).reshape(-1, 1)
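                # map the association statistic into (0, 1) (rescaling mcc from [-1, 1]),
                # nudge exact 0s and 1s off the boundary so the logit below stays finite,
                # then take the log10 odds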
                X_prime = X.copy()
                if association_statistic == 'mcc':
                    X_prime = (X_prime + 1) / 2
                xpmin = (X_prime[X_prime > 0]).min() / 2
                xpmax = 1 - (1 - (X_prime[X_prime < 1]).max()) / 2
                X_prime[X_prime == 0] = xpmin
                X_prime[X_prime == 1] = xpmax
                logitX = np.log10(X_prime / (1 - X_prime))

                # save score histograms
                print('saving score histograms...')
                values = X.reshape(-1)
                title = 'uv_{0}_rd{1}_cd{2}, pos:{3:1.3g}, neg:{4:1.3g}'.format(
                    universe[:5], row_term_dict[:5], column_term_dict[:5],
                    np.median(values[Y]), np.median(values[~Y]))
                save_path = '{0}/{1}_{2}_hist_{3}_yr_{4}_ds_{5}_ms_{6!s}_uv_{7}_np_{8!s}_mc_{9!s}_rd_{10}_cd_{11}.png'.format(
                    figures_folder, row_dictionary, column_dictionary,
                    association_statistic, year, datestamp, min_score,
                    universe, n_prior, min_count, row_term_dict,
                    column_term_dict)
                densities, edges = plot_step_density(
                    {
                        'positive': ('-r', values[Y]),
                        'negative': (':b', values[~Y])
                    }, association_statistic, title, save_path, 'auto',
                    (values.min(), values.max()), False)
                save_path = '{0}/{1}_{2}_zoomhist_{3}_yr_{4}_ds_{5}_ms_{6!s}_uv_{7}_np_{8!s}_mc_{9!s}_rd_{10}_cd_{11}.png'.format(
                    figures_folder, row_dictionary, column_dictionary,
                    association_statistic, year, datestamp, min_score,
                    universe, n_prior, min_count, row_term_dict,
                    column_term_dict)
                densities, edges = plot_step_density(
                    {
                        'positive': ('-r', values[Y]),
                        'negative': (':b', values[~Y])
                    }, association_statistic, title, save_path, 'auto',
                    (values.min(), values.max()), False,
                    (np.percentile(values, 2.5), np.percentile(values, 97.5)))

                values = logitX.reshape(-1)
                title = 'uv_{0}_rd{1}_cd{2}, pos:{3:1.3g}, neg:{4:1.3g}'.format(
                    universe[:5], row_term_dict[:5], column_term_dict[:5],
                    np.median(values[Y]), np.median(values[~Y]))
                save_path = '{0}/{1}_{2}_hist_LOGIT{3}_yr_{4}_ds_{5}_ms_{6!s}_uv_{7}_np_{8!s}_mc_{9!s}_rd_{10}_cd_{11}.png'.format(
                    figures_folder, row_dictionary, column_dictionary,
                    association_statistic, year, datestamp, min_score,
                    universe, n_prior, min_count, row_term_dict,
                    column_term_dict)
                densities, edges = plot_step_density(
                    {
                        'positive': ('-r', values[Y]),
                        'negative': (':b', values[~Y])
                    }, 'logit ' + association_statistic, title, save_path,
                    'auto', (values.min(), values.max()), False)
                save_path = '{0}/{1}_{2}_zoomhist_LOGIT{3}_yr_{4}_ds_{5}_ms_{6!s}_uv_{7}_np_{8!s}_mc_{9!s}_rd_{10}_cd_{11}.png'.format(
                    figures_folder, row_dictionary, column_dictionary,
                    association_statistic, year, datestamp, min_score,
                    universe, n_prior, min_count, row_term_dict,
                    column_term_dict)
                densities, edges = plot_step_density(
                    {
                        'positive': ('-r', values[Y]),
                        'negative': (':b', values[~Y])
                    }, 'logit ' + association_statistic, title, save_path,
                    'auto', (values.min(), values.max()), False,
                    (np.percentile(values, 2.5), np.percentile(values, 97.5)))

                # fit logistic regression classifier
                print('fitting logistic regression classifier...')
                robust_scaler = RobustScaler().fit(logitX)
                Z = robust_scaler.transform(logitX)
                logistic_regression_model = LogisticRegression(
                    penalty='l2',
                    C=1e3,
                    intercept_scaling=1.0,
                    class_weight='balanced').fit(Z, Y)

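                # orient the decision function so that larger values always favor the positive
                # class, regardless of the order in which sklearn stored the classes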
                if logistic_regression_model.classes_[1] == 1:
                    decision_function = logistic_regression_model.decision_function(
                        Z)
                else:
                    decision_function = -logistic_regression_model.decision_function(
                        Z)
                Y_pred = decision_function > 0
                min_value_association_statistic = (X.reshape(-1)[Y_pred]).min()
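                # smallest raw association statistic among the pairs predicted positive;
                # if the fitted coefficient is positive (i.e. the statistic ranks
                # positives higher), this is effectively the decision threshold
                # expressed on the original statistic scale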

                # save decision function and predicted probability histograms
                print(
                    'saving decision function and predicted probability histograms...'
                )
                values = decision_function.reshape(-1)
                title = 'uv_{0}_rd{1}_cd{2}, pos:{3:1.3g}, neg:{4:1.3g}'.format(
                    universe[:5], row_term_dict[:5], column_term_dict[:5],
                    np.median(values[Y]), np.median(values[~Y]))
                save_path = '{0}/{1}_{2}_hist_DF{3}_yr_{4}_ds_{5}_ms_{6!s}_uv_{7}_np_{8!s}_mc_{9!s}_rd_{10}_cd_{11}.png'.format(
                    figures_folder, row_dictionary, column_dictionary,
                    association_statistic, year, datestamp, min_score,
                    universe, n_prior, min_count, row_term_dict,
                    column_term_dict)
                densities, edges = plot_step_density(
                    {
                        'positive': ('-r', values[Y]),
                        'negative': (':b', values[~Y])
                    }, 'decision fun ' + association_statistic, title,
                    save_path, 'auto', (values.min(), values.max()), False)
                save_path = '{0}/{1}_{2}_zoomhist_DF{3}_yr_{4}_ds_{5}_ms_{6!s}_uv_{7}_np_{8!s}_mc_{9!s}_rd_{10}_cd_{11}.png'.format(
                    figures_folder, row_dictionary, column_dictionary,
                    association_statistic, year, datestamp, min_score,
                    universe, n_prior, min_count, row_term_dict,
                    column_term_dict)
                densities, edges = plot_step_density(
                    {
                        'positive': ('-r', values[Y]),
                        'negative': (':b', values[~Y])
                    }, 'decision fun ' + association_statistic, title,
                    save_path, 'auto', (values.min(), values.max()), False,
                    (np.percentile(values, 2.5), np.percentile(values, 97.5)))

                values = (1 / (1 + np.exp(-decision_function))).reshape(-1)
                title = 'uv_{0}_rd{1}_cd{2}, pos:{3:1.3g}, neg:{4:1.3g}'.format(
                    universe[:5], row_term_dict[:5], column_term_dict[:5],
                    np.median(values[Y]), np.median(values[~Y]))
                save_path = '{0}/{1}_{2}_hist_PP{3}_yr_{4}_ds_{5}_ms_{6!s}_uv_{7}_np_{8!s}_mc_{9!s}_rd_{10}_cd_{11}.png'.format(
                    figures_folder, row_dictionary, column_dictionary,
                    association_statistic, year, datestamp, min_score,
                    universe, n_prior, min_count, row_term_dict,
                    column_term_dict)
                densities, edges = plot_step_density(
                    {
                        'positive': ('-r', values[Y]),
                        'negative': (':b', values[~Y])
                    }, 'pred prob ' + association_statistic, title, save_path,
                    'auto', (0, 1), False)
                save_path = '{0}/{1}_{2}_zoomhist_PP{3}_yr_{4}_ds_{5}_ms_{6!s}_uv_{7}_np_{8!s}_mc_{9!s}_rd_{10}_cd_{11}.png'.format(
                    figures_folder, row_dictionary, column_dictionary,
                    association_statistic, year, datestamp, min_score,
                    universe, n_prior, min_count, row_term_dict,
                    column_term_dict)
                densities, edges = plot_step_density(
                    {
                        'positive': ('-r', values[Y]),
                        'negative': (':b', values[~Y])
                    }, 'pred prob ' + association_statistic, title, save_path,
                    'auto', (0, 1), False,
                    (np.percentile(values, 2.5), np.percentile(values, 97.5)))

                # compute roc and pr curves
                print('computing roc and pr curves...')
                fpr, tpr, thresholds = roc_curve(Y, decision_function)
                precision, recall, thresholds = precision_recall_curve(
                    Y, decision_function)

                auroc = roc_auc_score(Y, decision_function)
                auprc = average_precision_score(Y, decision_function)

                # save roc and pr curves
                print('saving roc and pr curves...')
                title = 'uv_{0}_as_{1}_rd{2}_cd{3}, auc:{4:1.3g}'.format(
                    universe[:5], association_statistic, row_term_dict[:5],
                    column_term_dict[:5], auprc)
                save_path = '{0}/{1}_{2}_prc_{3}_yr_{4}_ds_{5}_ms_{6!s}_uv_{7}_np_{8!s}_mc_{9!s}_rd_{10}_cd_{11}.png'.format(
                    figures_folder, row_dictionary, column_dictionary,
                    association_statistic, year, datestamp, min_score,
                    universe, n_prior, min_count, row_term_dict,
                    column_term_dict)
                fg, ax = plt.subplots(1, 1, figsize=(3, 2))
                ax.plot(recall, precision, '-k', linewidth=1)
                ax.set_position([0.55 / 3, 0.35 / 2, 2.1 / 3,
                                 1.3 / 2])  # left, bottom, width, height
                ax.set_title(title, fontsize=8)
                ax.set_ylabel('Precision', fontsize=8, labelpad=4)
                ax.set_xlabel('Recall', fontsize=8, labelpad=2)
                ax.set_ylim((0, 1))
                ax.set_xlim((0, 1))
                ax.tick_params(axis='both',
                               which='major',
                               bottom=True,
                               top=False,
                               left=True,
                               right=False,
                               labelbottom=True,
                               labeltop=False,
                               labelleft=True,
                               labelright=False,
                               labelsize=8)
                ax.ticklabel_format(axis='both',
                                    style='sci',
                                    scilimits=(-3, 3),
                                    fontsize=8)
                ax.yaxis.offsetText.set_fontsize(8)
                ax.xaxis.offsetText.set_fontsize(8)
                fg.savefig(save_path, transparent=True, pad_inches=0, dpi=300)
                plt.close()

                title = 'uv_{0}_as_{1}_rd{2}_cd{3}, auc:{4:1.3g}'.format(
                    universe[:5], association_statistic, row_term_dict[:5],
                    column_term_dict[:5], auroc)
                save_path = '{0}/{1}_{2}_roc_{3}_yr_{4}_ds_{5}_ms_{6!s}_uv_{7}_np_{8!s}_mc_{9!s}_rd_{10}_cd_{11}.png'.format(
                    figures_folder, row_dictionary, column_dictionary,
                    association_statistic, year, datestamp, min_score,
                    universe, n_prior, min_count, row_term_dict,
                    column_term_dict)
                fg, ax = plt.subplots(1, 1, figsize=(3, 2))
                ax.plot(fpr, tpr, '-k', linewidth=1)
                ax.set_position([0.55 / 3, 0.35 / 2, 2.1 / 3,
                                 1.3 / 2])  # left, bottom, width, height
                ax.set_title(title, fontsize=8)
                ax.set_ylabel('True positive rate', fontsize=8, labelpad=4)
                ax.set_xlabel('False positive rate', fontsize=8, labelpad=2)
                ax.set_ylim((0, 1))
                ax.set_xlim((0, 1))
                ax.tick_params(axis='both',
                               which='major',
                               bottom=True,
                               top=False,
                               left=True,
                               right=False,
                               labelbottom=True,
                               labeltop=False,
                               labelleft=True,
                               labelright=False,
                               labelsize=8)
                ax.ticklabel_format(axis='both',
                                    style='sci',
                                    scilimits=(-3, 3),
                                    fontsize=8)
                ax.yaxis.offsetText.set_fontsize(8)
                ax.xaxis.offsetText.set_fontsize(8)
                fg.savefig(save_path, transparent=True, pad_inches=0, dpi=300)
                plt.close()

                # save predictions for all term-term pairs
                if save_predictions:
                    print('saving predictions for all term-term pairs...')
                    predictions = {}
                    X_all = term_term_stats_all.matrix.reshape(-1, 1)
                    if association_statistic == 'mcc':
                        X_all = (X_all + 1) / 2
                    xamin = (X_all[X_all > 0]).min() / 2
                    xamax = 1 - (1 - (X_all[X_all < 1]).max()) / 2
                    X_all[X_all == 0] = xamin
                    X_all[X_all == 1] = xamax
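                    # nudge exact 0s and 1s into the interior of (0, 1) so the logit
                    # transform below stays finite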
                    logitX_all = np.log10(X_all / (1 - X_all))
                    Z_all = robust_scaler.transform(logitX_all)
                    if logistic_regression_model.classes_[1] == 1:
                        predictions[
                            'decision_function'] = logistic_regression_model.decision_function(
                                Z_all)
                    else:
                        predictions[
                            'decision_function'] = -logistic_regression_model.decision_function(
                                Z_all)
                    predictions['probability_positive'] = 1 / (
                        1 + np.exp(-predictions['decision_function']))
                    if not np.all(np.diff(thresholds) > 0):
                        raise ValueError('thresholds not increasing')
                    predictions['precision'] = np.interp(
                        predictions['decision_function'], thresholds,
                        precision[:-1])
                    predictions['recall'] = np.interp(
                        predictions['decision_function'], thresholds,
                        recall[:-1])
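                    # precision_recall_curve returns len(thresholds) == len(precision) - 1,
                    # so precision[:-1] and recall[:-1] align with thresholds; np.interp
                    # also requires thresholds to be increasing, which is checked above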
                    I0, J0 = (term_term_counts_all.matrix <
                              min_count).nonzero()
                    IA, JA = (term_term_counts_all.matrix >=
                              min_count).nonzero()
                    new_stats = [
                        '{0}_dictidname'.format(row_dictionary),
                        '{0}_dictidname'.format(column_dictionary)
                    ]
                    new_stat_mat = np.concatenate(
                        (term_term_counts_all.rowlabels[IA].reshape(-1, 1),
                         term_term_counts_all.columnlabels[JA].reshape(-1, 1)),
                        1)
                    for stat, values in predictions.items():
                        term_term_stats_all.matrix = values.reshape(
                            term_term_stats_all.shape[0],
                            term_term_stats_all.shape[1])
                        term_term_stats_all.matrix[I0, J0] = 0
                        datasetIO.save_datamatrix(
                            '{0}_{1}_datamatrix_{2}_yr_{3}_ds_{4}_ms_{5!s}_uv_{6}_np_{7!s}_mc_{8!s}_as_{9}_rd_{10}_cd_{11}.txt.gz'
                            .format(row_dictionary, column_dictionary, stat,
                                    year, datestamp, min_score, universe,
                                    n_prior, min_count, association_statistic,
                                    row_term_dict, column_term_dict),
                            term_term_stats_all)
                        datasetIO.save_datamatrix(
                            '{0}_{1}_datamatrix_{2}_yr_{3}_ds_{4}_ms_{5!s}_uv_{6}_np_{7!s}_mc_{8!s}_as_{9}_rd_{10}_cd_{11}.pickle'
                            .format(row_dictionary, column_dictionary, stat,
                                    year, datestamp, min_score, universe,
                                    n_prior, min_count, association_statistic,
                                    row_term_dict, column_term_dict),
                            term_term_stats_all)
                        new_stats.append(stat)
                        new_stat_mat = np.append(
                            new_stat_mat,
                            (term_term_stats_all.matrix[IA,
                                                        JA]).reshape(-1, 1), 1)
                    new_df = pd.DataFrame(data=new_stat_mat, columns=new_stats)
                    dataframe_path = '{0}_{1}_dataframe_yr_{2}_ds_{3}_ms_{4!s}_uv_{5}_np_{6!s}_mc_{7!s}.txt.gz'.format(
                        row_dictionary, column_dictionary, year, datestamp,
                        min_score, universe, n_prior, min_count)
                    joined_dataframe_path = '{0}_{1}_dataframe_yr_{2}_ds_{3}_ms_{4!s}_uv_{5}_np_{6!s}_mc_{7!s}_as_{8}_rd_{9}_cd_{10}.txt.gz'.format(
                        row_dictionary, column_dictionary, year, datestamp,
                        min_score, universe, n_prior, min_count,
                        association_statistic, row_term_dict, column_term_dict)
                    df = pd.read_table(dataframe_path,
                                       compression='gzip',
                                       index_col=False)
                    joined_df = df.set_index(new_stats[:2]).join(
                        new_df.set_index(new_stats[:2]))
                    joined_df.sort_values(by=association_statistic,
                                          ascending=False,
                                          inplace=True)
                    joined_df.to_csv(joined_dataframe_path,
                                     sep='\t',
                                     compression='gzip')

                # compute classifier performance statistics
                # note, these are in-sample statistics
                # we are not worried about overfitting
                # because we only have one feature
                # and we are not trying to build a rigorous ML model
                # we are simply trying to answer the question,
                # given a reference set of positive and negative examples,
                # which association statistic ranks term-term pairs the best?
                print('computing classifier performance statistics...')
                tn, fp, fn, tp = confusion_matrix(Y, Y_pred).ravel()

                # incorporate a random prior with effective sample size = n_prior
                prevalence = (tp + fn) / (tn + fp + fn + tp)
                tp += n_prior * prevalence / 2
                fn += n_prior * prevalence / 2
                tn += n_prior * (1 - prevalence) / 2
                fp += n_prior * (1 - prevalence) / 2
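                # the four pseudo-counts sum to n_prior and are split so the class
                # prevalence is preserved, e.g. n_prior=4 with prevalence=0.5 adds
                # exactly 1 to each of tp, fn, tn and fp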

                ap = tp + fn
                an = fp + tn
                pp = tp + fp
                pn = tn + fn
                n = tn + fp + fn + tp

                tpr = tp / ap  # sensitivity, recall
                fnr = fn / ap  # 1-tpr, 1-sensitivity, 1-recall
                tnr = tn / an  # specificity
                fpr = fp / an  # 1-tnr, 1-specificity

                ppv = tp / pp  # precision
                fdr = fp / pp  # 1-ppv, 1-precision
                npv = tn / pn
                fomr = fn / pn  # 1-npv

                acc = (tp + tn) / n
                mcr = (fp + fn) / n  # 1-acc
                prev = ap / n

                plr = (tp / fp) / (
                    ap / an
                )  # tpr/fpr, sensitivity/(1-specificity), ratio of positives to negatives in positive predictions relative to ratio in whole sample, higher is better
                nlr = (fn / tn) / (
                    ap / an
                )  # fnr/tnr, (1-sensitivity)/specificity, ratio of positives to negatives in negative predictions relative to ratio in whole sample, lower is better
                dor = (tp / fp) / (
                    fn / tn
                )  # plr/nlr, ratio of positives to negatives in positive predictions, divided by ratio of positives to negatives in negative predictions
                drr = (tp / pp) / (
                    fn / pn
                )  # ppv/fomr, relative risk or risk ratio, fraction of positives in positive predictions divided by fraction of positives in negative predictions
                darr = (tp / pp) - (
                    fn / pn
                )  # ppv - fomr, absolute risk reduction, fraction of positives in positive predictions minus fraction of positives in negative predictions
                mrr = (tp / pp) / (
                    ap / n
                )  # ppv/prev, modified (by me) relative risk or risk ratio, fraction of positives in positive predictions divided by fraction of positives in whole sample
                marr = (tp / pp) - (
                    ap / n
                )  # ppv - prev, modified (by me) absolute risk reduction, fraction of positives in positive predictions minus fraction of positives in whole sample

                f1 = (1 + (1**2)) * ppv * tpr / ((1**2) * ppv + tpr)  # F-beta with beta=1, i.e. harmonic mean of precision and recall
                mcc = (tp * tn - fp * fn) / np.sqrt(
                    (tp + fp) * (tp + fn) * (tn + fp) * (tn + fn))
                cos = tp / np.sqrt((tp + fp) * (tp + fn))  # ochiai
                fnlp = -hypergeom.logsf(tp, n, ap, pp, loc=1) / np.log(10)  # -log10 of hypergeometric right-tail p-value P(X >= tp), one-sided Fisher's exact test

                lrr = np.log10(tp) - np.log10(tp + fp) - np.log10(
                    fn) + np.log10(fn + tn)  # log10 of relative risk
                lrr_se = np.sqrt(
                    fp / tp / (tp + fp) + tn / fn / (fn + tn)) / np.log(
                        10)  # standard error of log10 of relative risk
                lrr_lb95 = lrr - 1.96 * lrr_se
                lrr_ub95 = lrr + 1.96 * lrr_se
                drr_lb95 = 10**lrr_lb95
                drr_ub95 = 10**lrr_ub95
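                # lrr_se is the delta-method standard error of log10(relative risk),
                # equivalent to sqrt(1/tp - 1/(tp+fp) + 1/fn - 1/(fn+tn)) / ln(10)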

                lor = np.log10(tp) - np.log10(fp) - np.log10(fn) + np.log10(
                    tn)  # log10 of odds ratio
                lor_se = np.sqrt(1 / tp + 1 / fp + 1 / fn + 1 / tn) / np.log(
                    10)  # standard error of log10 of odds ratio
                lor_lb95 = lor - 1.96 * lor_se
                lor_ub95 = lor + 1.96 * lor_se
                dor_lb95 = 10**lor_lb95
                dor_ub95 = 10**lor_ub95
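                # lor_se is the usual (Woolf) standard error of the log odds ratio,
                # sqrt(1/tp + 1/fp + 1/fn + 1/tn), converted from natural log to log10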

                mi, nmi, iqr = mutualinformation(
                    tp, fp, fn, tn
                )  # mutual information, normalized mutual information, information quality ratio

                # write to dataframe
                print('writing to dataframe...')
                count_stats = [tp, fn, tn, fp, ap, an, pp, pn, n]
                other_stats = [
                    auroc, auprc, tpr, fnr, tnr, fpr, ppv, fdr, npv, fomr, acc,
                    mcr, prev, plr, nlr, dor, drr, darr, mrr, marr, f1, mcc,
                    cos, fnlp, lrr, lrr_se, lrr_lb95, lrr_ub95, drr_lb95,
                    drr_ub95, lor, lor_se, lor_lb95, lor_ub95, dor_lb95,
                    dor_ub95, mi, nmi, iqr, min_value_association_statistic
                ]

                writelist = [
                    row_dictionary, column_dictionary, year, datestamp,
                    str(min_score), universe,
                    str(n_prior),
                    str(min_count), association_statistic,
                    reference_datamatrix_path, row_term_dict, column_term_dict
                ]
                writelist += [str(s) for s in count_stats]
                writelist += ['{0:1.5g}'.format(s) for s in other_stats]
                fw.write('\t'.join(writelist) + '\n')

    print('done benchmark_term-term_stats_from_termite.py')
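# The example above benchmarks how well a single association statistic separates a
# reference set of positive and negative term-term pairs. A minimal, self-contained
# sketch of that core classification step, using synthetic data and illustrative
# names that are not taken from the original snippet, might look like this:
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import average_precision_score, roc_auc_score
from sklearn.preprocessing import RobustScaler

rng = np.random.RandomState(0)
y = rng.rand(1000) < 0.2  # reference labels: positive vs negative pairs
x = np.clip(rng.beta(2, 5, 1000) + 0.2 * y, 1e-6, 1 - 1e-6)  # statistic in (0, 1)
logit_x = np.log10(x / (1 - x)).reshape(-1, 1)  # logit transform (base 10, as above)
z = RobustScaler().fit_transform(logit_x)  # robust scaling, as above
clf = LogisticRegression(penalty='l2', C=1e3, class_weight='balanced').fit(z, y)
df = clf.decision_function(z) if clf.classes_[1] == 1 else -clf.decision_function(z)
print('AUROC: {0:.3f}, AUPRC: {1:.3f}'.format(roc_auc_score(y, df),
                                              average_precision_score(y, df)))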
Example #14
0
dm2.columnmeta['feature'] = dm2.columnlabels.copy()
dm2.columnname = 'feature|dataset'
dm2.columnlabels = dm2.columnmeta['feature'] + '|' + dm2.columnmeta['dataset']

# merge datasets
print('merging datasets...', flush=True)
dm = dm1.concatenate(dm2, 'self', 1)
dm.rowmeta['in_dm1'] = in_dm1.copy()
dm.rowmeta['in_' + dm1_name] = in_dm1.copy()
dm.rowmeta['in_dm2'] = in_dm2.copy()
dm.rowmeta['in_' + dm2_name] = in_dm2.copy()
dm.columnmeta['in_dm1'] = dm.columnmeta['dataset'] == dm1_name
dm.columnmeta['in_' + dm1_name] = dm.columnmeta['dataset'] == dm1_name
dm.columnmeta['in_dm2'] = dm.columnmeta['dataset'] == dm2_name
dm.columnmeta['in_' + dm2_name] = dm.columnmeta['dataset'] == dm2_name
dm.matrixname = dm1_name + '_' + dm2_name + '_merged'
print(dm, flush=True)
print(dm.rowmeta.keys(), flush=True)
print(dm.columnmeta.keys(), flush=True)

# save the data
print('saving merged data...', flush=True)
savefolder = '../../input_data/{0}_{1}'.format(dm1_name, dm2_name)
if not os.path.exists(savefolder):
    os.makedirs(savefolder)
datasetIO.save_splitdata(savefolder, dm)
datasetIO.save_datamatrix('{0}/datamatrix.pickle'.format(savefolder), dm)
datasetIO.save_datamatrix('{0}/datamatrix.txt.gz'.format(savefolder), dm)

print('done.', flush=True)
def main(validation_rep=0, validation_fold=0):

    # load dataset info
    print('loading dataset info...', flush=True)
    dataset_info_path = 'datasets/merged_features/rep{0!s}_fold{1!s}/dataset_info.txt'.format(
        validation_rep, validation_fold)
    dataset_info = datasetIO.load_datasetinfo(dataset_info_path)[0]

    # load validation examples
    print('loading validation examples...', flush=True)
    validation_examples_path = 'targets/validation_examples/rep{0!s}_fold{1!s}.txt'.format(
        validation_rep, validation_fold)
    with open(validation_examples_path,
              mode='rt',
              encoding='utf-8',
              errors='surrogateescape') as fr:
        validation_examples = fr.read().split('\n')

    # specify results folder
    print('specifying results folder...', flush=True)
    results_folder = 'datasets/useful_features/rep{0!s}_fold{1!s}'.format(
        validation_rep, validation_fold)
    results_folder_parts = results_folder.split('/')
    for i in range(len(results_folder_parts)):
        results_folder_part = '/'.join(results_folder_parts[:i + 1])
        if not os.path.exists(results_folder_part):
            os.mkdir(results_folder_part)

    # load dataset
    print('loading dataset {0}...'.format(dataset_info['abbreviation']),
          flush=True)
    gene_atb = datasetIO.load_datamatrix(datasetpath=dataset_info['path'])

    # specify cross-validation parameters
    print('specifying cross-validation parameters...', flush=True)
    reps = 20
    folds = 5
    rf_trees = 1000
    include_logistic_regression = True
    skf = StratifiedKFold(n_splits=folds, shuffle=True)
    print('    reps: {0!s}'.format(reps))
    print('    folds: {0!s}'.format(folds))

    # initialize models
    print('initializing models...', flush=True)
    rfmodel = RandomForestClassifier(n_estimators=rf_trees,
                                     oob_score=False,
                                     n_jobs=-1,
                                     class_weight='balanced')
    print(rfmodel)
    lrmodel = LogisticRegression(penalty='l2',
                                 dual=False,
                                 tol=0.0001,
                                 C=1e3,
                                 fit_intercept=True,
                                 intercept_scaling=1e3,
                                 class_weight='balanced',
                                 random_state=None,
                                 solver='liblinear',
                                 max_iter=100,
                                 multi_class='ovr',
                                 verbose=0,
                                 warm_start=False,
                                 n_jobs=1)
    print(lrmodel)

    # initialize data matrices for collecting model feature importances and cross-validation performance stats
    print(
        'initializing data matrices for collecting model feature importances and cross-validation performance stats...',
        flush=True)
    classifier_stats = np.array([
        'p', 'n', 'ap', 'an', 'pp', 'pn', 'tp', 'fp', 'tn', 'fn', 'tpr', 'fpr',
        'auroc', 'fnr', 'tnr', 'mcr', 'acc', 'fdr', 'ppv', 'auprc', 'fomr',
        'npv', 'plr', 'nlr', 'dor', 'drr', 'darr', 'mrr', 'marr', 'f1s', 'mcc',
        'fnlp'
    ],
                                dtype='object')
    sm = dataclasses.datamatrix(
        rowname='classifier_performance_stat',
        rowlabels=classifier_stats.copy(),
        rowmeta={},
        columnname='model',
        columnlabels=np.array(['M' + str(x) for x in range(gene_atb.shape[1])],
                              dtype='object'),
        columnmeta={
            'num_features': np.zeros(gene_atb.shape[1], dtype='int64'),
            'features': np.full(gene_atb.shape[1], '', dtype='object'),
            'oob_score': np.zeros(gene_atb.shape[1], dtype='float64')
        },
        matrixname='crossvalidation_classifier_performance_stats_vs_models',
        matrix=np.zeros((classifier_stats.size, gene_atb.shape[1]),
                        dtype='float64'))
    stat_model_rf_mean = copy.deepcopy(sm)
    stat_model_rf_stdv = copy.deepcopy(sm)
    stat_model_lr_mean = copy.deepcopy(sm)
    stat_model_lr_stdv = copy.deepcopy(sm)
    del sm
    fm = dataclasses.datamatrix(
        rowname=gene_atb.columnname,
        rowlabels=gene_atb.columnlabels.copy(),
        rowmeta=copy.deepcopy(gene_atb.columnmeta),
        columnname='model',
        columnlabels=np.array(['M' + str(x) for x in range(gene_atb.shape[1])],
                              dtype='object'),
        columnmeta={
            'num_features': np.zeros(gene_atb.shape[1], dtype='int64'),
            'features': np.full(gene_atb.shape[1], '', dtype='object'),
            'oob_score': np.zeros(gene_atb.shape[1], dtype='float64')
        },
        matrixname='model_feature_importances',
        matrix=np.zeros((gene_atb.shape[1], gene_atb.shape[1]),
                        dtype='float64'))
    feature_model_rf = copy.deepcopy(fm)
    feature_model_lr = copy.deepcopy(fm)
    del fm

    # exclude validation and unlabeled examples from cross-validation loop
    print(
        'excluding validation and unlabeled examples from cross-validation loop...',
        flush=True)
    isvalidation = np.in1d(gene_atb.rowlabels, validation_examples)
    isunknown = gene_atb.rowmeta['class'] == 'unknown'
    istraintest = ~np.logical_or(isvalidation, isunknown)
    Y = (gene_atb.rowmeta['class'][istraintest] == 'positive')
    #X = gene_atb.matrix[istraintest,:]

    # perform incremental feature elimination with cross-validation
    print(
        'performing incremental feature elimination with cross-validation...',
        flush=True)
    for i in range(gene_atb.shape[1]):
        print('    features: {0!s}...'.format(gene_atb.shape[1] - i),
              flush=True)
        if i == 0:
            hit_rf = np.ones(gene_atb.shape[1], dtype='bool')
            hit_lr = np.ones(gene_atb.shape[1], dtype='bool')
        else:
            # keep every feature whose importance exceeds the smallest nonzero
            # importance from the previous model, i.e. drop the least important feature
            previous_importances = feature_model_rf.matrix[:, i - 1]
            hit_rf = previous_importances > previous_importances[
                previous_importances > 0].min()
            #hit_lr = feature_model_lr.matrix[:,i-1] > feature_model_lr.matrix[feature_model_lr.matrix[:,i-1] > 0,i-1].min()
            hit_lr = hit_rf
        X_rf = gene_atb.matrix[istraintest, :][:, hit_rf]
        X_lr = gene_atb.matrix[istraintest, :][:, hit_lr]
        stat_rep_rf = np.zeros((classifier_stats.size, reps), dtype='float64')
        stat_rep_lr = np.zeros((classifier_stats.size, reps), dtype='float64')
        fi_rep_rf = np.zeros((X_rf.shape[1], reps), dtype='float64')
        fi_rep_lr = np.zeros((X_lr.shape[1], reps), dtype='float64')
        for rep in range(reps):
            print('        rep {0!s} of {1!s}...'.format(rep + 1, reps),
                  flush=True)
            Ptest_rf = np.zeros(Y.size, dtype='float64')
            Ptest_lr = np.zeros(Y.size, dtype='float64')
            fi_fold_rf = np.zeros((X_rf.shape[1], folds), dtype='float64')
            fi_fold_lr = np.zeros((X_lr.shape[1], folds), dtype='float64')
            for fold, (train_indices,
                       test_indices) in enumerate(skf.split(X_rf, Y)):
                print('            fold {0!s} of {1!s}...'.format(
                    fold + 1, folds),
                      flush=True)
                Y_train = Y[train_indices]
                X_rf_train = X_rf[train_indices]
                X_lr_train = X_lr[train_indices]
                #Y_test = Y[test_indices]
                X_rf_test = X_rf[test_indices]
                X_lr_test = X_lr[test_indices]
                rfmodel.fit(X_rf_train, Y_train)
                Ptest_rf[test_indices] = rfmodel.predict_proba(
                    X_rf_test)[:, rfmodel.classes_ == 1].reshape(-1)
                fi_fold_rf[:, fold] = rfmodel.feature_importances_
                lrmodel.fit(X_lr_train, Y_train)
                Ptest_lr[test_indices] = lrmodel.predict_proba(
                    X_lr_test)[:, lrmodel.classes_ == 1].reshape(-1)
                fi_fold_lr[:, fold] = np.abs(lrmodel.coef_.reshape(-1))
            fi_rep_rf[:, rep] = fi_fold_rf.mean(1)
            stat_cut = modelevaluation.get_classifier_performance_stats(
                Y=Y,
                P=Ptest_rf,
                classifier_stats=classifier_stats,
                plot_curves=False,
                get_priority_cutoffs=True)
            stat_rep_rf[:, rep] = stat_cut.matrix[:, stat_cut.columnmeta[
                'p50_cutoff']].reshape(-1)
            fi_rep_lr[:, rep] = fi_fold_lr.mean(1)
            stat_cut = modelevaluation.get_classifier_performance_stats(
                Y=Y,
                P=Ptest_lr,
                classifier_stats=classifier_stats,
                plot_curves=False,
                get_priority_cutoffs=True)
            stat_rep_lr[:, rep] = stat_cut.matrix[:, stat_cut.columnmeta[
                'p50_cutoff']].reshape(-1)
        feature_model_rf.matrix[hit_rf, i] = fi_rep_rf.mean(1)
        feature_model_rf.columnmeta['num_features'][i] = gene_atb.shape[1] - i
        feature_model_rf.columnmeta['features'][i] = '|'.join(
            gene_atb.columnlabels[hit_rf].tolist())
        stat_model_rf_mean.matrix[:, i] = stat_rep_rf.mean(1)
        stat_model_rf_mean.columnmeta['num_features'][
            i] = gene_atb.shape[1] - i
        stat_model_rf_mean.columnmeta['features'][i] = '|'.join(
            gene_atb.columnlabels[hit_rf].tolist())
        stat_model_rf_stdv.matrix[:, i] = stat_rep_rf.std(1)
        stat_model_rf_stdv.columnmeta['num_features'][
            i] = gene_atb.shape[1] - i
        stat_model_rf_stdv.columnmeta['features'][i] = '|'.join(
            gene_atb.columnlabels[hit_rf].tolist())
        feature_model_lr.matrix[hit_lr, i] = fi_rep_lr.mean(1)
        feature_model_lr.columnmeta['num_features'][i] = gene_atb.shape[1] - i
        feature_model_lr.columnmeta['features'][i] = '|'.join(
            gene_atb.columnlabels[hit_lr].tolist())
        stat_model_lr_mean.matrix[:, i] = stat_rep_lr.mean(1)
        stat_model_lr_mean.columnmeta['num_features'][
            i] = gene_atb.shape[1] - i
        stat_model_lr_mean.columnmeta['features'][i] = '|'.join(
            gene_atb.columnlabels[hit_lr].tolist())
        stat_model_lr_stdv.matrix[:, i] = stat_rep_lr.std(1)
        stat_model_lr_stdv.columnmeta['num_features'][
            i] = gene_atb.shape[1] - i
        stat_model_lr_stdv.columnmeta['features'][i] = '|'.join(
            gene_atb.columnlabels[hit_lr].tolist())

    # concatenate data matrices with model feature importances
    print('concatenating data matrices with model feature importances...',
          flush=True)
    feature_model_rf.columnlabels += '_rf'
    feature_model_rf.columnmeta['model_type'] = np.full(
        feature_model_rf.shape[1], 'random_forest', dtype='object')
    feature_model_lr.columnlabels += '_lr'
    feature_model_lr.columnmeta['model_type'] = np.full(
        feature_model_lr.shape[1], 'logistic_regression', dtype='object')
    feature_model_rf.append(feature_model_lr, 1)
    feature_model = feature_model_rf
    del feature_model_rf, feature_model_lr

    # concatenate data matrices with model cross-validation performance stats
    print(
        'concatenating data matrices with model cross-validation performance stats...',
        flush=True)
    stat_model_rf_mean.rowlabels += '_mean'
    stat_model_rf_stdv.rowlabels += '_stdv'
    stat_model_rf_mean.append(stat_model_rf_stdv, 0)
    stat_model_rf_mean.columnlabels += '_rf'
    stat_model_rf_mean.columnmeta['model_type'] = np.full(
        stat_model_rf_mean.shape[1], 'random_forest', dtype='object')
    stat_model_lr_mean.rowlabels += '_mean'
    stat_model_lr_stdv.rowlabels += '_stdv'
    stat_model_lr_mean.append(stat_model_lr_stdv, 0)
    stat_model_lr_mean.columnlabels += '_lr'
    stat_model_lr_mean.columnmeta['model_type'] = np.full(
        stat_model_lr_mean.shape[1], 'logistic_regression', dtype='object')
    stat_model_rf_mean.append(stat_model_lr_mean, 1)
    stat_model = stat_model_rf_mean
    del stat_model_rf_mean

    # select simplest model (fewest features) with auroc and auprc within 95% of max
    print(
        'selecting simplest model (fewest features) with auroc and auprc within 95% of max...',
        flush=True)
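    # within each model type, columns are ordered by decreasing feature count
    # (column i has gene_atb.shape[1] - i features), so taking the last index that
    # meets the score threshold selects a qualifying model with the fewest features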
    model_scores = 0.5 * (stat_model.select('auroc_mean', []) +
                          stat_model.select('auprc_mean', []))
    if include_logistic_regression:
        selected_model_index = np.where(
            model_scores >= 0.95 * model_scores.max())[0][-1]
    else:
        selected_model_index = np.where(
            np.logical_and(
                model_scores >=
                0.95 * model_scores[stat_model.columnmeta['model_type'] ==
                                    'random_forest'].max(),
                stat_model.columnmeta['model_type'] == 'random_forest'))[0][-1]
    selected_model_name = stat_model.columnlabels[selected_model_index]
    selected_model_features = feature_model.rowlabels[
        feature_model.matrix[:, selected_model_index] != 0]
    selected_model_type = stat_model.columnmeta['model_type'][
        selected_model_index]
    selected_model = rfmodel if selected_model_type == 'random_forest' else lrmodel
    gene_atb = gene_atb.tolabels(columnlabels=selected_model_features)
    feature_model_selected = feature_model.tolabels(
        columnlabels=selected_model_name)
    stat_model_selected = stat_model.tolabels(columnlabels=selected_model_name)
    print('    selected_model_name: {0}'.format(selected_model_name),
          flush=True)
    print('    selected_model_features: {0}'.format(
        '|'.join(selected_model_features)),
          flush=True)

    # iterate over selected features to rebuild design matrix
    print('iterating over selected features to rebuild design matrix...',
          flush=True)
    for i, (selected_feature, dataset_abbreviation) in enumerate(
            zip(gene_atb.columnlabels,
                gene_atb.columnmeta['dataset_abbreviation'])):

        # load dataset
        print('    loading dataset {0}...'.format(dataset_abbreviation),
              flush=True)
        dataset_path = 'datasets/generalizable_features/rep{0!s}_fold{1!s}/{2}.txt.gz'.format(
            validation_rep, validation_fold, dataset_abbreviation)
        gene_atb_i = datasetIO.load_datamatrix(dataset_path)
        gene_atb_i.columnmeta[
            'generalizability_pvalues_corrected'] = gene_atb_i.columnmeta[
                'generalizability_pvalues_corrected'].astype('float64')
        gene_atb_i.columnmeta['dataset_abbreviation'] = np.full(
            gene_atb_i.shape[1], dataset_abbreviation, dtype='object')
        gene_atb_i.columnmeta[
            'dataset_feature'] = gene_atb_i.columnlabels.copy()
        gene_atb_i.columnlabels += '_' + dataset_abbreviation
        gene_atb_i.rowname = 'GeneSym'
        gene_atb_i.columnname = 'Feature'
        if dataset_abbreviation == 'gtextissue_cleaned':
            gene_atb_i.discard(gene_atb_i.rowlabels == 'C12ORF55',
                               0)  # pesky duplicate row
        print(gene_atb_i)

        # select feature
        print('    selecting feature {0}...'.format(selected_feature),
              flush=True)
        gene_atb_i.discard(gene_atb_i.columnlabels != selected_feature, 1)

        # merge dataset
        print('    merging dataset...', flush=True)
        if i == 0:
            gene_atb_selected = copy.deepcopy(gene_atb_i)
            gene_atb_selected.matrixname = 'merged_target_features'
            print('        first dataset, no merge...', flush=True)
        else:
            common_genes = np.intersect1d(gene_atb_selected.rowlabels,
                                          gene_atb_i.rowlabels)
            gene_atb_selected = gene_atb_selected.tolabels(
                rowlabels=common_genes)
            gene_atb_i = gene_atb_i.tolabels(rowlabels=common_genes)
            gene_atb_selected.append(gene_atb_i, 1)
            print('        common_genes: {0!s}...'.format(common_genes.size),
                  flush=True)

    # normalize features
    print('normalizing features...', flush=True)
    gene_atb_selected.columnmeta['min'] = gene_atb_selected.matrix.min(0)
    gene_atb_selected.columnmeta['max'] = gene_atb_selected.matrix.max(0)
    gene_atb_selected.matrix = (
        gene_atb_selected.matrix - gene_atb_selected.columnmeta['min'].reshape(
            1, -1)) / (gene_atb_selected.columnmeta['max'].reshape(1, -1) -
                       gene_atb_selected.columnmeta['min'].reshape(1, -1))
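    # note: this rescales each selected feature to [0, 1] and assumes no selected
    # feature is constant (max > min); a constant column would divide by zero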

    # update metadata
    print('updating metadata...', flush=True)
    assert (gene_atb.columnlabels == gene_atb_selected.columnlabels).all()
    for field, values in gene_atb.columnmeta.items():
        if field not in gene_atb_selected.columnmeta:
            gene_atb_selected.columnmeta[field] = values
    print('old_num_genes:{0!s}\tnew_num_genes:{1!s}'.format(
        gene_atb.shape[0], gene_atb_selected.shape[0]),
          flush=True)
    del gene_atb

    # refit selected model
    print('refitting selected model...', flush=True)
    isvalidation = np.in1d(gene_atb_selected.rowlabels, validation_examples)
    isunknown = gene_atb_selected.rowmeta['class'] == 'unknown'
    istraintest = ~np.logical_or(isvalidation, isunknown)
    selected_model.fit(
        gene_atb_selected.matrix[istraintest, :],
        gene_atb_selected.rowmeta['class'][istraintest] == 'positive')

    # get predictions for validation and unlabelled examples
    print('getting predictions for validation and unlabelled examples...',
          flush=True)
    gene_model_selected = dataclasses.datamatrix(
        rowname=gene_atb_selected.rowname,
        rowlabels=gene_atb_selected.rowlabels.copy(),
        rowmeta=copy.deepcopy(gene_atb_selected.rowmeta),
        columnname=stat_model_selected.columnname,
        columnlabels=stat_model_selected.columnlabels.copy(),
        columnmeta=copy.deepcopy(stat_model_selected.columnmeta),
        matrixname=
        'success_probabilities_for_validation_and_unlabelled_examples',
        matrix=selected_model.predict_proba(
            gene_atb_selected.matrix)[:, selected_model.classes_ == 1])
    gene_model_selected.discard(istraintest, 0)

    # save results
    print('saving {0!s} useful features and model results...'.format(
        gene_atb_selected.shape[1]),
          flush=True)
    dataset_info['path'] = '{0}/{1}.txt.gz'.format(
        results_folder, dataset_info['abbreviation'])
    dataset_info['selected_model_name'] = selected_model_name
    dataset_info['selected_model_features'] = '|'.join(selected_model_features)
    dataset_info['selected_model_type'] = selected_model_type
    dataset_info['crossvalidation_reps'] = reps
    dataset_info['crossvalidation_folds'] = folds
    dataset_info['rf_trees'] = rf_trees
    dataset_info['include_logistic_regression'] = include_logistic_regression
    for stat_name, stat_values in zip(stat_model_selected.rowlabels,
                                      stat_model_selected.matrix):
        dataset_info[stat_name] = stat_values.item()
    datasetIO.save_datamatrix(dataset_info['path'], gene_atb_selected)
    datasetIO.save_datamatrix('{0}/stat_model.txt.gz'.format(results_folder),
                              stat_model)
    datasetIO.save_datamatrix(
        '{0}/feature_model.txt.gz'.format(results_folder), feature_model)
    datasetIO.save_datamatrix(
        '{0}/stat_model_selected.txt.gz'.format(results_folder),
        stat_model_selected)
    datasetIO.save_datamatrix(
        '{0}/feature_model_selected.txt.gz'.format(results_folder),
        feature_model_selected)
    datasetIO.save_datamatrix(
        '{0}/gene_model_selected.txt.gz'.format(results_folder),
        gene_model_selected)
    datasetIO.append_datasetinfo('{0}/dataset_info.txt'.format(results_folder),
                                 dataset_info)

    print('done.', flush=True)
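# The cross-validation loop above boils down to collecting out-of-fold predicted
# probabilities over repeated stratified folds and summarizing them with performance
# statistics. A condensed sketch under those assumptions, with synthetic data and
# illustrative names only:
import numpy as np
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold

X, Y = make_classification(n_samples=500, n_features=10, random_state=0)
reps, folds = 3, 5
aurocs = np.zeros(reps)
for rep in range(reps):
    p_oof = np.zeros(Y.size)  # out-of-fold predicted probabilities
    skf = StratifiedKFold(n_splits=folds, shuffle=True, random_state=rep)
    for train_idx, test_idx in skf.split(X, Y):
        rf = RandomForestClassifier(n_estimators=100, class_weight='balanced',
                                    n_jobs=-1).fit(X[train_idx], Y[train_idx])
        p_oof[test_idx] = rf.predict_proba(
            X[test_idx])[:, rf.classes_ == 1].reshape(-1)
    aurocs[rep] = roc_auc_score(Y, p_oof)
print('mean AUROC: {0:.3f} +/- {1:.3f}'.format(aurocs.mean(), aurocs.std()))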
def main(adjustments_path):

    # read adjustments
    print('reading adjustments...', flush=True)
    designpath_selectedstep = {}
    with open(adjustments_path,
              mode='rt',
              encoding='utf-8',
              errors='surrogateescape') as fr:
        for line in fr:
            design_path, selected_step = [x.strip() for x in line.split('\t')]
            designpath_selectedstep[design_path] = int(selected_step)
    print('found {0!s} adjustments...'.format(len(designpath_selectedstep)),
          flush=True)

    # make adjustments
    print('making adjustments...', flush=True)
    for didx, (design_path,
               selected_step) in enumerate(designpath_selectedstep.items()):
        print('working on {0}...'.format(design_path), flush=True)
        print('selected step:{0!s}...'.format(selected_step), flush=True)

        # load design
        print('loading design...', flush=True)
        with open(design_path,
                  mode='rt',
                  encoding='utf-8',
                  errors='surrogateescape') as fr:
            d = json.load(fr)
        if 'apply_activation_to_embedding' not in d:  # for legacy code
            d['apply_activation_to_embedding'] = True
        if 'use_batchnorm' not in d:  # for legacy code
            d['use_batchnorm'] = False
        if 'skip_layerwise_training' not in d:  # for legacy code
            d['skip_layerwise_training'] = False
        phase = d['training_schedule'][-1]
        d['current_hidden_layer'] = phase['hidden_layer']
        d['current_finetuning_run'] = phase['finetuning_run']
        d['current_epochs'] = phase['epochs']

        # load data
        if didx == 0:
            print('loading data...', flush=True)
            partitions = ['train', 'valid', 'test']
            dataset = {}
            for partition in partitions:
                dataset[partition] = datasetIO.load_datamatrix(
                    '{0}/{1}.pickle'.format(d['input_path'], partition))
                if 'all' not in dataset:
                    dataset['all'] = copy.deepcopy(dataset[partition])
                else:
                    dataset['all'].append(dataset[partition], 0)

            # get parameters for marginal distributions
            # will sample from marginal distributions to impute missing values
            # for binary features, model as bernoulli (columnmeta['likelihood'] == 'bernoulli')
            # for other features, model as gaussian
            marginalprobabilities = (
                1 + np.nansum(dataset['train'].matrix, 0, keepdims=True)) / (
                    2 + np.sum(
                        ~np.isnan(dataset['train'].matrix), 0, keepdims=True)
                )  # posterior mean of beta-bernoulli with prior a=b=1
            marginalstdvs = np.nanstd(dataset['train'].matrix,
                                      0,
                                      keepdims=True)
            isbernoullimarginal = (dataset['train'].columnmeta['likelihood'] ==
                                   'bernoulli').astype('float64').reshape(
                                       1, -1)
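            # these marginal parameters drive the random imputation below:
            # bernoulli-likelihood columns get draws with probability
            # marginalprobabilities, all other columns get zero-mean gaussian noise
            # scaled by marginalstdvs, selected per column by isbernoullimarginal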

        # finish configuration
        print('finishing configuration...', flush=True)

        # specify activation function
        if d['activation_function'] == 'tanh':
            activation_function = {'np': tsdae_apply_functions.tanh}
        elif d['activation_function'] == 'relu':
            activation_function = {'np': tsdae_apply_functions.relu}
        elif d['activation_function'] == 'elu':
            activation_function = {'np': tsdae_apply_functions.elu}
        elif d['activation_function'] == 'sigmoid':
            activation_function = {'np': tsdae_apply_functions.sigmoid}

        # initialize model architecture (number of layers and dimension of each layer)
        d['current_dimensions'] = d[
            'all_dimensions'][:d['current_hidden_layer'] +
                              1]  # dimensions of model up to current depth

        # specify embedding function for current training phase
        # we want the option of skipping the embedding activation function to apply only to the full model
        if not d['apply_activation_to_embedding'] and d[
                'current_dimensions'] == d['all_dimensions']:
            d['current_apply_activation_to_embedding'] = False
        else:
            d['current_apply_activation_to_embedding'] = True
        print('current_apply_activation_to_embedding: {0!s}'.format(
            d['current_apply_activation_to_embedding']),
              flush=True)

        # specify rows and columns of figure showing data reconstructions
        d['reconstruction_rows'] = int(
            np.round(np.sqrt(np.min([100, dataset['valid'].shape[0]]) / 2)))
        d['reconstruction_cols'] = 2 * d['reconstruction_rows']

        # move files
        print('moving files...', flush=True)
        if os.path.exists(
                '{0}/intermediate_variables_layer{1!s}_finetuning{2!s}_step{3!s}.pickle'
                .format(d['output_path'], d['current_hidden_layer'],
                        d['current_finetuning_run'], selected_step)):
            if os.path.exists(
                    '{0}/variables_layer{1!s}_finetuning{2!s}.pickle'.format(
                        d['output_path'], d['current_hidden_layer'],
                        d['current_finetuning_run'])):
                shutil.move(
                    '{0}/variables_layer{1!s}_finetuning{2!s}.pickle'.format(
                        d['output_path'], d['current_hidden_layer'],
                        d['current_finetuning_run']),
                    '{0}/variables_layer{1!s}_finetuning{2!s}_old.pickle'.
                    format(d['output_path'], d['current_hidden_layer'],
                           d['current_finetuning_run']))
            shutil.copyfile(
                '{0}/intermediate_variables_layer{1!s}_finetuning{2!s}_step{3!s}.pickle'
                .format(d['output_path'], d['current_hidden_layer'],
                        d['current_finetuning_run'], selected_step),
                '{0}/variables_layer{1!s}_finetuning{2!s}.pickle'.format(
                    d['output_path'], d['current_hidden_layer'],
                    d['current_finetuning_run']))
        else:
            print('variables do not exist for selected step! skipping...',
                  flush=True)
            continue
        if d['use_batchnorm']:
            if os.path.exists(
                    '{0}/intermediate_batchnorm_variables_layer{1!s}_finetuning{2!s}_step{3!s}.pickle'
                    .format(d['output_path'], d['current_hidden_layer'],
                            d['current_finetuning_run'], selected_step)):
                if os.path.exists(
                        '{0}/batchnorm_variables_layer{1!s}_finetuning{2!s}.pickle'
                        .format(d['output_path'], d['current_hidden_layer'],
                                d['current_finetuning_run'])):
                    shutil.move(
                        '{0}/batchnorm_variables_layer{1!s}_finetuning{2!s}.pickle'
                        .format(d['output_path'], d['current_hidden_layer'],
                                d['current_finetuning_run']),
                        '{0}/batchnorm_variables_layer{1!s}_finetuning{2!s}_old.pickle'
                        .format(d['output_path'], d['current_hidden_layer'],
                                d['current_finetuning_run']))
                shutil.copyfile(
                    '{0}/intermediate_batchnorm_variables_layer{1!s}_finetuning{2!s}_step{3!s}.pickle'
                    .format(d['output_path'], d['current_hidden_layer'],
                            d['current_finetuning_run'], selected_step),
                    '{0}/batchnorm_variables_layer{1!s}_finetuning{2!s}.pickle'
                    .format(d['output_path'], d['current_hidden_layer'],
                            d['current_finetuning_run']))
            else:
                print(
                    'batchnorm variables do not exist for selected step! skipping...',
                    flush=True)
                continue

        # load model variables
        print('loading model variables...', flush=True)
        with open(
                '{0}/variables_layer{1!s}_finetuning{2!s}.pickle'.format(
                    d['output_path'], d['current_hidden_layer'],
                    d['current_finetuning_run']), 'rb') as fr:
            W, Be, Bd = pickle.load(fr)[1:]  # global_step, W, bencode, bdecode
        if d['use_batchnorm']:
            with open(
                    '{0}/batchnorm_variables_layer{1!s}_finetuning{2!s}.pickle'
                    .format(d['output_path'], d['current_hidden_layer'],
                            d['current_finetuning_run']), 'rb') as fr:
                batchnorm_variables = pickle.load(
                    fr)  # gammas, betas, moving_means, moving_variances
            batchnorm_encode_variables, batchnorm_decode_variables = tsdae_apply_functions.align_batchnorm_variables(
                batchnorm_variables,
                d['current_apply_activation_to_embedding'],
                d['apply_activation_to_output'])

        # load reporting variables
        print('loading reporting variables...', flush=True)
        if os.path.exists(
                '{0}/optimization_path_layer{1!s}_finetuning{2!s}.pickle'.
                format(d['output_path'], d['current_hidden_layer'],
                       d['current_finetuning_run'])):
            with open(
                    '{0}/optimization_path_layer{1!s}_finetuning{2!s}.pickle'.
                    format(d['output_path'], d['current_hidden_layer'],
                           d['current_finetuning_run']), 'rb') as fr:
                optimization_path = pickle.load(fr)
            reporting_steps = optimization_path['reporting_steps']
            valid_losses = optimization_path['valid_losses']
            train_losses = optimization_path['train_losses']
            valid_noisy_losses = optimization_path['valid_noisy_losses']
            train_noisy_losses = optimization_path['train_noisy_losses']
        else:
            reporting_steps = np.zeros(0, dtype='int32')
            valid_losses = np.zeros(0, dtype='float32')
            train_losses = np.zeros(0, dtype='float32')
            valid_noisy_losses = np.zeros(0, dtype='float32')
            train_noisy_losses = np.zeros(0, dtype='float32')
            with open(
                    '{0}/log_layer{1!s}_finetuning{2!s}.txt'.format(
                        d['output_path'], d['current_hidden_layer'],
                        d['current_finetuning_run']), 'rt') as fr:
                fr.readline()
                for line in fr:
                    step, train_loss, valid_loss, train_noisy_loss, valid_noisy_loss, time = [
                        float(x.strip()) for x in line.split('\t')
                    ]
                    reporting_steps = np.insert(reporting_steps,
                                                reporting_steps.size, step)
                    valid_losses = np.insert(valid_losses, valid_losses.size,
                                             valid_loss)
                    train_losses = np.insert(train_losses, train_losses.size,
                                             train_loss)
                    valid_noisy_losses = np.insert(valid_noisy_losses,
                                                   valid_noisy_losses.size,
                                                   valid_noisy_loss)
                    train_noisy_losses = np.insert(train_noisy_losses,
                                                   train_noisy_losses.size,
                                                   train_noisy_loss)
            with open(
                    '{0}/optimization_path_layer{1!s}_finetuning{2!s}.pickle'.
                    format(d['output_path'], d['current_hidden_layer'],
                           d['current_finetuning_run']), 'wb') as fw:
                pickle.dump(
                    {
                        'reporting_steps': reporting_steps,
                        'valid_losses': valid_losses,
                        'train_losses': train_losses,
                        'valid_noisy_losses': valid_noisy_losses,
                        'train_noisy_losses': train_noisy_losses
                    }, fw)

        # compute embedding and reconstruction
        print('computing embedding and reconstruction...', flush=True)
        recon = {}
        embed = {}
        error = {}
        embed_preactivation = {}
        for partition in ['all']:
            if np.isnan(dataset[partition].matrix).any():
                print('datamatrix has missing values. random imputation...',
                      flush=True)
                dp = copy.deepcopy(dataset[partition])
                is_missing = np.isnan(dp.matrix)
                for i in range(5):
                    print('impute iteration {0!s}'.format(i), flush=True)
                    normal_noise = np.random.randn(dp.shape[0],
                                                   dp.shape[1]) * marginalstdvs
                    bernoulli_noise = (np.random.rand(dp.shape[0],
                                                      dp.shape[1]) <=
                                       marginalprobabilities).astype('float64')
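                    # mix the two noise sources per column: bernoulli draws where the
                    # column likelihood is 'bernoulli', gaussian draws elsewhere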
                    noise = bernoulli_noise * isbernoullimarginal + normal_noise * (
                        1 - isbernoullimarginal)
                    dp.matrix[is_missing] = noise[is_missing]
                    if i == 0:
                        if d['use_batchnorm']:
                            recon[partition], embed[partition], error[
                                partition] = tsdae_apply_functions.encode_and_decode(
                                    dp,
                                    W,
                                    Be,
                                    Bd,
                                    activation_function['np'],
                                    d['current_apply_activation_to_embedding'],
                                    d['apply_activation_to_output'],
                                    dataset['train'].columnmeta['likelihood']
                                    == 'bernoulli',
                                    return_embedding=True,
                                    return_reconstruction_error=True,
                                    bn_encode_variables=
                                    batchnorm_encode_variables,
                                    bn_decode_variables=
                                    batchnorm_decode_variables)
                            if d['current_apply_activation_to_embedding']:
                                embed_preactivation[
                                    partition] = tsdae_apply_functions.encode(
                                        dp,
                                        W,
                                        Be,
                                        activation_function['np'],
                                        apply_activation_to_embedding=False,
                                        bn_variables=batchnorm_encode_variables
                                    )
                        else:
                            recon[partition], embed[partition], error[
                                partition] = tsdae_apply_functions.encode_and_decode(
                                    dp,
                                    W,
                                    Be,
                                    Bd,
                                    activation_function['np'],
                                    d['current_apply_activation_to_embedding'],
                                    d['apply_activation_to_output'],
                                    dataset['train'].columnmeta['likelihood']
                                    == 'bernoulli',
                                    return_embedding=True,
                                    return_reconstruction_error=True)
                            if d['current_apply_activation_to_embedding']:
                                embed_preactivation[
                                    partition] = tsdae_apply_functions.encode(
                                        dp,
                                        W,
                                        Be,
                                        activation_function['np'],
                                        apply_activation_to_embedding=False)
                    else:
                        if d['use_batchnorm']:
                            reconi, embedi, errori = tsdae_apply_functions.encode_and_decode(
                                dp,
                                W,
                                Be,
                                Bd,
                                activation_function['np'],
                                d['current_apply_activation_to_embedding'],
                                d['apply_activation_to_output'],
                                dataset['train'].columnmeta['likelihood'] ==
                                'bernoulli',
                                return_embedding=True,
                                return_reconstruction_error=True,
                                bn_encode_variables=batchnorm_encode_variables,
                                bn_decode_variables=batchnorm_decode_variables)
                            if d['current_apply_activation_to_embedding']:
                                embed_preactivationi = tsdae_apply_functions.encode(
                                    dp,
                                    W,
                                    Be,
                                    activation_function['np'],
                                    apply_activation_to_embedding=False,
                                    bn_variables=batchnorm_encode_variables)
                        else:
                            reconi, embedi, errori = tsdae_apply_functions.encode_and_decode(
                                dp,
                                W,
                                Be,
                                Bd,
                                activation_function['np'],
                                d['current_apply_activation_to_embedding'],
                                d['apply_activation_to_output'],
                                dataset['train'].columnmeta['likelihood'] ==
                                'bernoulli',
                                return_embedding=True,
                                return_reconstruction_error=True)
                            if d['current_apply_activation_to_embedding']:
                                embed_preactivationi = tsdae_apply_functions.encode(
                                    dp,
                                    W,
                                    Be,
                                    activation_function['np'],
                                    apply_activation_to_embedding=False)
                        recon[partition].matrix += reconi.matrix
                        embed[partition].matrix += embedi.matrix
                        error[partition] += errori
                        if d['current_apply_activation_to_embedding']:
                            embed_preactivation[
                                partition].matrix += embed_preactivationi.matrix
                recon[partition].matrix /= 5
                embed[partition].matrix /= 5
                error[partition] /= 5
                if d['current_apply_activation_to_embedding']:
                    embed_preactivation[partition].matrix /= 5
            else:
                if d['use_batchnorm']:
                    recon[partition], embed[partition], error[
                        partition] = tsdae_apply_functions.encode_and_decode(
                            dataset[partition],
                            W,
                            Be,
                            Bd,
                            activation_function['np'],
                            d['current_apply_activation_to_embedding'],
                            d['apply_activation_to_output'],
                            dataset['train'].columnmeta['likelihood'] ==
                            'bernoulli',
                            return_embedding=True,
                            return_reconstruction_error=True,
                            bn_encode_variables=batchnorm_encode_variables,
                            bn_decode_variables=batchnorm_decode_variables)
                    if d['current_apply_activation_to_embedding']:
                        embed_preactivation[
                            partition] = tsdae_apply_functions.encode(
                                dataset[partition],
                                W,
                                Be,
                                activation_function['np'],
                                apply_activation_to_embedding=False,
                                bn_variables=batchnorm_encode_variables)
                else:
                    recon[partition], embed[partition], error[
                        partition] = tsdae_apply_functions.encode_and_decode(
                            dataset[partition],
                            W,
                            Be,
                            Bd,
                            activation_function['np'],
                            d['current_apply_activation_to_embedding'],
                            d['apply_activation_to_output'],
                            dataset['train'].columnmeta['likelihood'] ==
                            'bernoulli',
                            return_embedding=True,
                            return_reconstruction_error=True)
                    if d['current_apply_activation_to_embedding']:
                        embed_preactivation[
                            partition] = tsdae_apply_functions.encode(
                                dataset[partition],
                                W,
                                Be,
                                activation_function['np'],
                                apply_activation_to_embedding=False)
            print('{0} reconstruction error: {1:1.3g}'.format(
                partition, error[partition]),
                  flush=True)

        for partition in partitions:
            recon[partition] = recon['all'].tolabels(
                rowlabels=dataset[partition].rowlabels.copy())
            embed[partition] = embed['all'].tolabels(
                rowlabels=dataset[partition].rowlabels.copy())
            if d['current_apply_activation_to_embedding']:
                embed_preactivation[partition] = embed_preactivation[
                    'all'].tolabels(
                        rowlabels=dataset[partition].rowlabels.copy())
            datasetIO.save_datamatrix(
                '{0}/{1}_embedding_layer{2!s}_finetuning{3!s}.pickle'.format(
                    d['output_path'], partition, d['current_hidden_layer'],
                    d['current_finetuning_run']), embed[partition])
            datasetIO.save_datamatrix(
                '{0}/{1}_embedding_layer{2!s}_finetuning{3!s}.txt.gz'.format(
                    d['output_path'], partition, d['current_hidden_layer'],
                    d['current_finetuning_run']), embed[partition])
            if d['current_apply_activation_to_embedding']:
                datasetIO.save_datamatrix(
                    '{0}/{1}_embedding_preactivation_layer{2!s}_finetuning{3!s}.pickle'
                    .format(d['output_path'], partition,
                            d['current_hidden_layer'],
                            d['current_finetuning_run']),
                    embed_preactivation[partition])
                datasetIO.save_datamatrix(
                    '{0}/{1}_embedding_preactivation_layer{2!s}_finetuning{3!s}.txt.gz'
                    .format(d['output_path'], partition,
                            d['current_hidden_layer'],
                            d['current_finetuning_run']),
                    embed_preactivation[partition])

        # plot loss
        print('plotting loss...', flush=True)
        fg, ax = plt.subplots(1, 1, figsize=(3.25, 2.25))
        ax.set_position([0.55 / 3.25, 0.45 / 2.25, 2.6 / 3.25, 1.7 / 2.25])
        ax.semilogx(reporting_steps,
                    train_losses,
                    ':r',
                    linewidth=1,
                    label='train')
        ax.semilogx(reporting_steps,
                    valid_losses,
                    '-g',
                    linewidth=1,
                    label='valid')
        ax.semilogx(reporting_steps,
                    train_noisy_losses,
                    '--b',
                    linewidth=1,
                    label='train,noisy')
        ax.semilogx(reporting_steps,
                    valid_noisy_losses,
                    '-.k',
                    linewidth=1,
                    label='valid,noisy')
        ax.legend(loc='best', fontsize=8)
        ax.set_ylabel('loss', fontsize=8)
        ax.set_xlabel('steps (selected step:{0!s})'.format(selected_step),
                      fontsize=8)
        ax.set_xlim(reporting_steps[0] - 1, reporting_steps[-1] + 1)
        ax.set_ylim(0, 10)
        ax.tick_params(axis='both',
                       which='major',
                       left=True,
                       right=True,
                       bottom=True,
                       top=False,
                       labelleft=True,
                       labelright=False,
                       labelbottom=True,
                       labeltop=False,
                       labelsize=8)
        fg.savefig(
            '{0}/optimization_path_layer{1!s}_finetuning{2!s}.png'.format(
                d['output_path'], d['current_hidden_layer'],
                d['current_finetuning_run']),
            transparent=True,
            pad_inches=0,
            dpi=600)
        plt.close()

        # plot reconstructions
        print('plotting reconstructions...', flush=True)
        num_recons = min([
            d['reconstruction_rows'] * d['reconstruction_cols'],
            dataset['valid'].shape[0]
        ])
        x_valid = dataset[
            'valid'].matrix[:num_recons, dataset['train'].
                            columnmeta['likelihood'] != 'bernoulli']
        xr_valid = recon[
            'valid'].matrix[:num_recons, dataset['train'].
                            columnmeta['likelihood'] != 'bernoulli']
        if x_valid.shape[1] > 1000:
            x_valid = x_valid[:, :1000]
            xr_valid = xr_valid[:, :1000]
        lb = np.nanmin(np.append(x_valid, xr_valid, 1), 1)
        ub = np.nanmax(np.append(x_valid, xr_valid, 1), 1)
        fg, axs = plt.subplots(2 * d['reconstruction_rows'],
                               d['reconstruction_cols'],
                               figsize=(6.5, 6.5))
        for i, ax in enumerate(
                axs.reshape(-1)[:d['reconstruction_rows'] *
                                d['reconstruction_cols']]):
            # x_valid/xr_valid only have num_recons rows, so guard the indexing
            hit = (np.logical_and(np.isfinite(x_valid[i, :]),
                                  np.isfinite(xr_valid[i, :]))
                   if i < num_recons else np.array([False]))
            if i < num_recons and hit.any():
                ax.plot(x_valid[i, hit],
                        xr_valid[i, hit],
                        'ok',
                        markersize=0.5,
                        markeredgewidth=0,
                        alpha=0.1)
                ax.set_ylim(lb[i], ub[i])
                ax.set_xlim(lb[i], ub[i])
                ax.tick_params(axis='both',
                               which='major',
                               left=False,
                               right=False,
                               bottom=False,
                               top=False,
                               labelleft=False,
                               labelright=False,
                               labelbottom=False,
                               labeltop=False,
                               pad=4)
                ax.set_frame_on(False)
                ax.axvline(lb[i], linewidth=1, color='k')
                ax.axvline(ub[i], linewidth=1, color='k')
                ax.axhline(lb[i], linewidth=1, color='k')
                ax.axhline(ub[i], linewidth=1, color='k')
            else:
                fg.delaxes(ax)
        x_valid = dataset['valid'].matrix[:num_recons, dataset['train'].
                                          columnmeta['likelihood'] ==
                                          'bernoulli']
        xr_valid = recon['valid'].matrix[:num_recons, dataset['train'].
                                         columnmeta['likelihood'] ==
                                         'bernoulli']
        if x_valid.shape[1] > 1000:
            x_valid = x_valid[:, :1000]
            xr_valid = xr_valid[:, :1000]
        lb = -0.1
        ub = 1.1
        for i, ax in enumerate(
                axs.reshape(-1)[d['reconstruction_rows'] *
                                d['reconstruction_cols']:]):
            # x_valid/xr_valid only have num_recons rows, so guard the indexing
            hit = (np.logical_and(np.isfinite(x_valid[i, :]),
                                  np.isfinite(xr_valid[i, :]))
                   if i < num_recons else np.array([False]))
            if i < num_recons and hit.any():
                ax.boxplot([
                    xr_valid[i, x_valid[i, :] == 0],
                    xr_valid[i, x_valid[i, :] == 1]
                ],
                           positions=[0.2, 0.8],
                           flierprops={
                               'markersize': 0.5,
                               'markeredgewidth': 0,
                               'alpha': 0.1
                           },
                           boxprops={'linewidth': 0.5},
                           whiskerprops={'linewidth': 0.5},
                           medianprops={'linewidth': 0.5})
                ax.set_ylim(lb, ub)
                ax.set_xlim(lb, ub)
                ax.tick_params(axis='both',
                               which='major',
                               left=False,
                               right=False,
                               bottom=False,
                               top=False,
                               labelleft=False,
                               labelright=False,
                               labelbottom=False,
                               labeltop=False,
                               pad=4)
                ax.set_frame_on(False)
                ax.axvline(lb, linewidth=1, color='k')
                ax.axvline(ub, linewidth=1, color='k')
                ax.axhline(lb, linewidth=1, color='k')
                ax.axhline(ub, linewidth=1, color='k')
            else:
                fg.delaxes(ax)
        fg.savefig('{0}/reconstructions_layer{1!s}_finetuning{2!s}.png'.format(
            d['output_path'], d['current_hidden_layer'],
            d['current_finetuning_run']),
                   transparent=True,
                   pad_inches=0,
                   dpi=1200)
        plt.close()

        # plot 2d embedding
        if d['current_dimensions'][-1] == 2 and (
                not d['use_finetuning'] or d['current_finetuning_run'] > 0):
            print('plotting 2d embedding...', flush=True)
            fg, ax = plt.subplots(1, 1, figsize=(6.5, 6.5))
            ax.set_position([0.15 / 6.5, 0.15 / 6.5, 6.2 / 6.5, 6.2 / 6.5])
            ax.plot(embed['train'].matrix[:, 0],
                    embed['train'].matrix[:, 1],
                    'ok',
                    markersize=2,
                    markeredgewidth=0,
                    alpha=0.5,
                    zorder=0)
            ax.plot(embed['valid'].matrix[:, 0],
                    embed['valid'].matrix[:, 1],
                    'or',
                    markersize=2,
                    markeredgewidth=0,
                    alpha=1.0,
                    zorder=1)
            ax.tick_params(axis='both',
                           which='major',
                           bottom=False,
                           top=False,
                           labelbottom=False,
                           labeltop=False,
                           left=False,
                           right=False,
                           labelleft=False,
                           labelright=False,
                           pad=4)
            ax.set_frame_on(False)
            fg.savefig('{0}/embedding_layer{1!s}_finetuning{2!s}.png'.format(
                d['output_path'], d['current_hidden_layer'],
                d['current_finetuning_run']),
                       transparent=True,
                       pad_inches=0,
                       dpi=600)
            plt.close()

            if d['current_apply_activation_to_embedding']:
                fg, ax = plt.subplots(1, 1, figsize=(6.5, 6.5))
                ax.set_position([0.15 / 6.5, 0.15 / 6.5, 6.2 / 6.5, 6.2 / 6.5])
                ax.plot(embed_preactivation['train'].matrix[:, 0],
                        embed_preactivation['train'].matrix[:, 1],
                        'ok',
                        markersize=2,
                        markeredgewidth=0,
                        alpha=0.5,
                        zorder=0)
                ax.plot(embed_preactivation['valid'].matrix[:, 0],
                        embed_preactivation['valid'].matrix[:, 1],
                        'or',
                        markersize=2,
                        markeredgewidth=0,
                        alpha=1.0,
                        zorder=1)
                ax.tick_params(axis='both',
                               which='major',
                               bottom=False,
                               top=False,
                               labelbottom=False,
                               labeltop=False,
                               left=False,
                               right=False,
                               labelleft=False,
                               labelright=False,
                               pad=4)
                ax.set_frame_on(False)
                fg.savefig(
                    '{0}/embedding_preactivation_layer{1!s}_finetuning{2!s}.png'
                    .format(d['output_path'], d['current_hidden_layer'],
                            d['current_finetuning_run']),
                    transparent=True,
                    pad_inches=0,
                    dpi=600)
                plt.close()
        # plot heatmap
        else:
            print('plotting embedding heatmap...', flush=True)
            embed['valid'].cluster('all', 'cosine', 'average')
            embed['valid'].heatmap(
                rowmetalabels=[],
                columnmetalabels=[],
                normalize=False,
                standardize=False,
                normalizebeforestandardize=True,
                cmap_name='bwr',
                ub=None,
                lb=None,
                savefilename=
                '{0}/embedding_heatmap_layer{1!s}_finetuning{2!s}.png'.format(
                    d['output_path'], d['current_hidden_layer'],
                    d['current_finetuning_run']),
                closefigure=True,
                dpi=300)
            if d['current_apply_activation_to_embedding']:
                embed_preactivation['valid'].cluster('all', 'cosine',
                                                     'average')
                embed_preactivation['valid'].heatmap(
                    rowmetalabels=[],
                    columnmetalabels=[],
                    normalize=False,
                    standardize=False,
                    normalizebeforestandardize=True,
                    cmap_name='bwr',
                    ub=None,
                    lb=None,
                    savefilename=
                    '{0}/embedding_preactivation_heatmap_layer{1!s}_finetuning{2!s}.png'
                    .format(d['output_path'], d['current_hidden_layer'],
                            d['current_finetuning_run']),
                    closefigure=True,
                    dpi=300)

        # log selected step
        with open('{0}/log.txt'.format(d['output_path']),
                  mode='at',
                  buffering=1) as fl:
            fl.write('\nadjusted selected step:{0}\n'.format(selected_step))

    print('done adjust_early_stopping.', flush=True)
    # (assumed reconstruction of a truncated statement: flag gene columns whose
    #  symbol is absent from the RGEP gene-to-cell-type map)
    tobediscarded = ~np.in1d(atb_gene.columnmeta['symbol'],
                             list(gene_cell.keys()))
    atb_gene.discard(tobediscarded, 1)
    atb_gene.matrixname += '_filtered_by_{0}_rgep'.format(rgep_name)
    print('rgep_genes: {0!s}'.format(len(gene_cell)), flush=True)
    print(atb_gene)

    # add cell type metadata
    print('adding cell type metadata...', flush=True)
    atb_gene.columnmeta['rgep_cell_type'] = np.array(
        [gene_cell[gene_sym] for gene_sym in atb_gene.columnmeta['symbol']],
        dtype='object')

    # save the data
    print('saving filtered data...', flush=True)
    datasetIO.save_datamatrix(
        '../../original_data/hugolo_symlnk/patient_gene_hugolo_rnaseq_prepared_filtered_by_{0}_rgep.pickle'
        .format(rgep_name), atb_gene)
    datasetIO.save_datamatrix(
        '../../original_data/hugolo_symlnk/patient_gene_hugolo_rnaseq_prepared_filtered_by_{0}_rgep.txt.gz'
        .format(rgep_name), atb_gene)
    savefolder = '../../input_data/hugolo_transposed_filtered_by_{0}_rgep'.format(
        rgep_name)
    if not os.path.exists(savefolder):
        os.makedirs(savefolder)
    datasetIO.save_splitdata(savefolder, atb_gene)
    shutil.copyfile(
        '../../original_data/hugolo_symlnk/patient_gene_hugolo_rnaseq_prepared_filtered_by_{0}_rgep.pickle'
        .format(rgep_name), '{0}/datamatrix.pickle'.format(savefolder))
    shutil.copyfile(
        '../../original_data/hugolo_symlnk/patient_gene_hugolo_rnaseq_prepared_filtered_by_{0}_rgep.txt.gz'
        .format(rgep_name), '{0}/datamatrix.txt.gz'.format(savefolder))
def main(adjustments_path):
    
    # read adjustments
    print('reading adjustments...', flush=True)
    designpath_selectedstep = {}
    with open(adjustments_path, mode='rt', encoding='utf-8', errors='surrogateescape') as fr:
        for line in fr:
            design_path, selected_step = [x.strip() for x in line.split('\t')]
            designpath_selectedstep[design_path] = int(selected_step)
    print('found {0!s} adjustments...'.format(len(designpath_selectedstep)), flush=True)
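    # each adjustments line is expected to be "<design .json path>\t<selected integer step>"
    # (e.g. a hypothetical "results/example_design/design.json\t20000")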
    
    # make adjustments
    print('making adjustments...', flush=True)
    for didx, (design_path, selected_step) in enumerate(designpath_selectedstep.items()):
        print('working on {0}...'.format(design_path), flush=True)
        print('selected step:{0!s}...'.format(selected_step), flush=True)
        
        
        # load design
        print('loading design...', flush=True)
        with open(design_path, mode='rt', encoding='utf-8', errors='surrogateescape') as fr:
            d = json.load(fr)
        if 'apply_activation_to_embedding' not in d: # for legacy code
            d['apply_activation_to_embedding'] = True
        if 'use_batchnorm' not in d: # for legacy code
            d['use_batchnorm'] = False
        if 'skip_layerwise_training' not in d: # for legacy code
            d['skip_layerwise_training'] = False
        phase = d['training_schedule'][-1]
        d['current_hidden_layer'] = phase['hidden_layer']
        d['current_finetuning_run'] = phase['finetuning_run']
        d['current_epochs'] = phase['epochs']
        
        
        # load data
        if didx == 0:
            print('loading data...', flush=True)
            partitions = ['train', 'valid', 'test']
            dataset = {}
            for partition in partitions:
                if partition == 'train':
                    dataset[partition] = datasetIO.load_datamatrix('{0}/{1}.pickle'.format(d['input_path'], 'valid'))
                    dataset[partition].append(datasetIO.load_datamatrix('{0}/{1}.pickle'.format(d['input_path'], 'test')), 0)
                elif partition == 'valid':
                    dataset[partition] = datasetIO.load_datamatrix('{0}/{1}.pickle'.format(d['input_path'], 'train'))
                else:
                    dataset[partition] = datasetIO.load_datamatrix('{0}/{1}.pickle'.format(d['input_path'], partition))

        
        # finish configuration
        print('finishing configuration...', flush=True)
        
        # specify activation function
        if d['activation_function'] == 'tanh':
            activation_function = {'np':sdae_apply_functions.tanh}
        elif d['activation_function'] == 'relu':
            activation_function = {'np':sdae_apply_functions.relu}
        elif d['activation_function'] == 'elu':
            activation_function = {'np':sdae_apply_functions.elu}
        elif d['activation_function'] == 'sigmoid':
            activation_function = {'np':sdae_apply_functions.sigmoid}
    
        # initialize model architecture (number of layers and dimension of each layer)
        d['current_dimensions'] = d['all_dimensions'][:d['current_hidden_layer']+1] # dimensions of model up to current depth
        
        # specify embedding function for current training phase
        # we want the option of skipping the embedding activation function to apply only to the full model
        if not d['apply_activation_to_embedding'] and d['current_dimensions'] == d['all_dimensions']:
            d['current_apply_activation_to_embedding'] = False
        else:
            d['current_apply_activation_to_embedding'] = True

        # specify rows and columns of figure showing data reconstructions
        d['reconstruction_rows'] = int(np.round(np.sqrt(np.min([100, dataset['valid'].shape[0]])/2)))
        d['reconstruction_cols'] = 2*d['reconstruction_rows']
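        # rows*cols is roughly min(100, number of validation samples), so the
        # reconstruction figure shows at most about 100 samples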
        
        
        # move files
        print('moving files...', flush=True)
        if os.path.exists('{0}/intermediate_variables_layer{1!s}_finetuning{2!s}_step{3!s}.pickle'.format(d['output_path'], d['current_hidden_layer'], d['current_finetuning_run'], selected_step)):
            if os.path.exists('{0}/variables_layer{1!s}_finetuning{2!s}.pickle'.format(d['output_path'], d['current_hidden_layer'], d['current_finetuning_run'])):
                shutil.move('{0}/variables_layer{1!s}_finetuning{2!s}.pickle'.format(d['output_path'], d['current_hidden_layer'], d['current_finetuning_run']),
                            '{0}/variables_layer{1!s}_finetuning{2!s}_old.pickle'.format(d['output_path'], d['current_hidden_layer'], d['current_finetuning_run']))
            shutil.copyfile('{0}/intermediate_variables_layer{1!s}_finetuning{2!s}_step{3!s}.pickle'.format(d['output_path'], d['current_hidden_layer'], d['current_finetuning_run'], selected_step),
                            '{0}/variables_layer{1!s}_finetuning{2!s}.pickle'.format(d['output_path'], d['current_hidden_layer'], d['current_finetuning_run']))
        else:
            print('variables do not exist for selected step! skipping...', flush=True)
            continue
        if d['use_batchnorm']:
            if os.path.exists('{0}/intermediate_batchnorm_variables_layer{1!s}_finetuning{2!s}_step{3!s}.pickle'.format(d['output_path'], d['current_hidden_layer'], d['current_finetuning_run'], selected_step)):
                if os.path.exists('{0}/batchnorm_variables_layer{1!s}_finetuning{2!s}.pickle'.format(d['output_path'], d['current_hidden_layer'], d['current_finetuning_run'])):
                    shutil.move('{0}/batchnorm_variables_layer{1!s}_finetuning{2!s}.pickle'.format(d['output_path'], d['current_hidden_layer'], d['current_finetuning_run']),
                                '{0}/batchnorm_variables_layer{1!s}_finetuning{2!s}_old.pickle'.format(d['output_path'], d['current_hidden_layer'], d['current_finetuning_run']))
                shutil.copyfile('{0}/intermediate_batchnorm_variables_layer{1!s}_finetuning{2!s}_step{3!s}.pickle'.format(d['output_path'], d['current_hidden_layer'], d['current_finetuning_run'], selected_step),
                                '{0}/batchnorm_variables_layer{1!s}_finetuning{2!s}.pickle'.format(d['output_path'], d['current_hidden_layer'], d['current_finetuning_run']))
            else:
                print('batchnorm variables do not exist for selected step! skipping...', flush=True)
                continue
            
        
        # load model variables
        print('loading model variables...', flush=True)
        with open('{0}/variables_layer{1!s}_finetuning{2!s}.pickle'.format(d['output_path'], d['current_hidden_layer'], d['current_finetuning_run']), 'rb') as fr:
            W, Be, Bd = pickle.load(fr)[1:] # global_step, W, bencode, bdecode
        if d['use_batchnorm']:
            with open('{0}/batchnorm_variables_layer{1!s}_finetuning{2!s}.pickle'.format(d['output_path'], d['current_hidden_layer'], d['current_finetuning_run']), 'rb') as fr:
                batchnorm_variables = pickle.load(fr) # gammas, betas, moving_means, moving_variances
            batchnorm_encode_variables, batchnorm_decode_variables = sdae_apply_functions.align_batchnorm_variables(batchnorm_variables, d['current_apply_activation_to_embedding'], d['apply_activation_to_output'])
        
        
        # load reporting variables
        print('loading reporting variables...', flush=True)
        if os.path.exists('{0}/optimization_path_layer{1!s}_finetuning{2!s}.pickle'.format(d['output_path'], d['current_hidden_layer'], d['current_finetuning_run'])):
            with open('{0}/optimization_path_layer{1!s}_finetuning{2!s}.pickle'.format(d['output_path'], d['current_hidden_layer'], d['current_finetuning_run']), 'rb') as fr:
                optimization_path = pickle.load(fr)
            reporting_steps = optimization_path['reporting_steps']
            valid_losses = optimization_path['valid_losses']
            train_losses = optimization_path['train_losses']
            valid_noisy_losses = optimization_path['valid_noisy_losses']
            train_noisy_losses = optimization_path['train_noisy_losses']
        else:
            reporting_steps = np.zeros(0, dtype='int32')
            valid_losses = np.zeros(0, dtype='float32')
            train_losses = np.zeros(0, dtype='float32')
            valid_noisy_losses = np.zeros(0, dtype='float32')
            train_noisy_losses = np.zeros(0, dtype='float32')
            with open('{0}/log_layer{1!s}_finetuning{2!s}.txt'.format(d['output_path'], d['current_hidden_layer'], d['current_finetuning_run']), 'rt') as fr:
                fr.readline()
                for line in fr:
                    step, train_loss, valid_loss, train_noisy_loss, valid_noisy_loss, time = [float(x.strip()) for x in line.split('\t')]
                    reporting_steps = np.insert(reporting_steps, reporting_steps.size, step)
                    valid_losses = np.insert(valid_losses, valid_losses.size, valid_loss)
                    train_losses = np.insert(train_losses, train_losses.size, train_loss)
                    valid_noisy_losses = np.insert(valid_noisy_losses, valid_noisy_losses.size, valid_noisy_loss)
                    train_noisy_losses = np.insert(train_noisy_losses, train_noisy_losses.size, train_noisy_loss) 
            with open('{0}/optimization_path_layer{1!s}_finetuning{2!s}.pickle'.format(d['output_path'], d['current_hidden_layer'], d['current_finetuning_run']), 'wb') as fw:
                pickle.dump({'reporting_steps':reporting_steps, 'valid_losses':valid_losses, 'train_losses':train_losses, 'valid_noisy_losses':valid_noisy_losses, 'train_noisy_losses':train_noisy_losses}, fw)
        
        
        # compute embedding and reconstruction
        print('computing embedding and reconstruction...', flush=True)        
        recon = {}
        embed = {}
        error = {}
        embed_preactivation = {}
        for partition in partitions:
            if d['use_batchnorm']:
                recon[partition], embed[partition], error[partition] = sdae_apply_functions.encode_and_decode(dataset[partition], W, Be, Bd, activation_function['np'], d['current_apply_activation_to_embedding'], d['apply_activation_to_output'], return_embedding=True, return_reconstruction_error=True, bn_encode_variables=batchnorm_encode_variables, bn_decode_variables=batchnorm_decode_variables)
                embed_preactivation[partition] = sdae_apply_functions.encode(dataset[partition], W, Be, activation_function['np'], apply_activation_to_embedding=False, bn_variables=batchnorm_encode_variables)
            else:
                recon[partition], embed[partition], error[partition] = sdae_apply_functions.encode_and_decode(dataset[partition], W, Be, Bd, activation_function['np'], d['current_apply_activation_to_embedding'], d['apply_activation_to_output'], return_embedding=True, return_reconstruction_error=True)
                embed_preactivation[partition] = sdae_apply_functions.encode(dataset[partition], W, Be, activation_function['np'], apply_activation_to_embedding=False)
            
            print('{0} reconstruction error: {1:1.3g}'.format(partition, error[partition]), flush=True)
            
            datasetIO.save_datamatrix('{0}/{1}_embedding_layer{2!s}_finetuning{3!s}.pickle'.format(d['output_path'], partition, d['current_hidden_layer'], d['current_finetuning_run']), embed[partition])
            datasetIO.save_datamatrix('{0}/{1}_embedding_layer{2!s}_finetuning{3!s}.txt.gz'.format(d['output_path'], partition, d['current_hidden_layer'], d['current_finetuning_run']), embed[partition])
            
            if d['current_apply_activation_to_embedding']:
                datasetIO.save_datamatrix('{0}/{1}_embedding_preactivation_layer{2!s}_finetuning{3!s}.pickle'.format(d['output_path'], partition, d['current_hidden_layer'], d['current_finetuning_run']), embed_preactivation[partition])
                datasetIO.save_datamatrix('{0}/{1}_embedding_preactivation_layer{2!s}_finetuning{3!s}.txt.gz'.format(d['output_path'], partition, d['current_hidden_layer'], d['current_finetuning_run']), embed_preactivation[partition])
                

        # plot loss
        print('plotting loss...', flush=True)
        fg, ax = plt.subplots(1, 1, figsize=(3.25,2.25))
        ax.set_position([0.55/3.25, 0.45/2.25, 2.6/3.25, 1.7/2.25])
        ax.semilogx(reporting_steps, train_losses, ':r', linewidth=1, label='train')
        ax.semilogx(reporting_steps, valid_losses, '-g', linewidth=1, label='valid')
        ax.semilogx(reporting_steps, train_noisy_losses, '--b', linewidth=1, label='train,noisy')
        ax.semilogx(reporting_steps, valid_noisy_losses, '-.k', linewidth=1, label='valid,noisy')
        ax.legend(loc='best', fontsize=8)
        ax.set_ylabel('loss', fontsize=8)
        ax.set_xlabel('steps (selected step:{0!s})'.format(selected_step), fontsize=8)
        ax.set_xlim(reporting_steps[0]-1, reporting_steps[-1]+1)
        # ax.set_ylim(0, 1)
        ax.tick_params(axis='both', which='major', left=True, right=True, bottom=True, top=False, labelleft=True, labelright=False, labelbottom=True, labeltop=False, labelsize=8)
        fg.savefig('{0}/optimization_path_layer{1!s}_finetuning{2!s}.png'.format(d['output_path'], d['current_hidden_layer'], d['current_finetuning_run']), transparent=True, pad_inches=0, dpi=600)
        plt.close()


        # plot reconstructions
        print('plotting reconstructions...', flush=True)
        num_recons = min([d['reconstruction_rows']*d['reconstruction_cols'], dataset['valid'].shape[0]])
        x_valid = dataset['valid'].matrix[:num_recons,:]
        xr_valid = recon['valid'].matrix[:num_recons,:]
        if x_valid.shape[1] > 1000:
            x_valid = x_valid[:,:1000]
            xr_valid = xr_valid[:,:1000]
        lb = np.append(x_valid, xr_valid, 1).min(1)
        ub = np.append(x_valid, xr_valid, 1).max(1)
        fg, axs = plt.subplots(d['reconstruction_rows'], d['reconstruction_cols'], figsize=(6.5,3.25))
        for i, ax in enumerate(axs.reshape(-1)):
            if i < num_recons:
                ax.plot(x_valid[i,:], xr_valid[i,:], 'ok', markersize=0.5, markeredgewidth=0)
                ax.set_ylim(lb[i], ub[i])
                ax.set_xlim(lb[i], ub[i])
                ax.tick_params(axis='both', which='major', left=False, right=False, bottom=False, top=False, labelleft=False, labelright=False, labelbottom=False, labeltop=False, pad=4)
                ax.set_frame_on(False)
                ax.axvline(lb[i], linewidth=1, color='k')
                ax.axvline(ub[i], linewidth=1, color='k')
                ax.axhline(lb[i], linewidth=1, color='k')
                ax.axhline(ub[i], linewidth=1, color='k')
            else:
                fg.delaxes(ax)
        fg.savefig('{0}/reconstructions_layer{1!s}_finetuning{2!s}.png'.format(d['output_path'], d['current_hidden_layer'], d['current_finetuning_run']), transparent=True, pad_inches=0, dpi=1200)
        plt.close()


        # plot 2d embedding
        if d['current_dimensions'][-1] == 2  and (not d['use_finetuning'] or d['current_finetuning_run'] > 0):
            print('plotting 2d embedding...', flush=True)
            fg, ax = plt.subplots(1, 1, figsize=(6.5,6.5))
            ax.set_position([0.15/6.5, 0.15/6.5, 6.2/6.5, 6.2/6.5])
            ax.plot(embed['train'].matrix[:,0], embed['train'].matrix[:,1], 'ok', markersize=2, markeredgewidth=0, alpha=0.5, zorder=0)
            ax.plot(embed['valid'].matrix[:,0], embed['valid'].matrix[:,1], 'or', markersize=2, markeredgewidth=0, alpha=1.0, zorder=1)
            ax.tick_params(axis='both', which='major', bottom=False, top=False, labelbottom=False, labeltop=False, left=False, right=False, labelleft=False, labelright=False, pad=4)
            ax.set_frame_on(False)
            fg.savefig('{0}/embedding_layer{1!s}_finetuning{2!s}.png'.format(d['output_path'], d['current_hidden_layer'], d['current_finetuning_run']), transparent=True, pad_inches=0, dpi=600)
            plt.close()
            
            if d['current_apply_activation_to_embedding']:
                fg, ax = plt.subplots(1, 1, figsize=(6.5,6.5))
                ax.set_position([0.15/6.5, 0.15/6.5, 6.2/6.5, 6.2/6.5])
                ax.plot(embed_preactivation['train'].matrix[:,0], embed_preactivation['train'].matrix[:,1], 'ok', markersize=2, markeredgewidth=0, alpha=0.5, zorder=0)
                ax.plot(embed_preactivation['valid'].matrix[:,0], embed_preactivation['valid'].matrix[:,1], 'or', markersize=2, markeredgewidth=0, alpha=1.0, zorder=1)
                ax.tick_params(axis='both', which='major', bottom=False, top=False, labelbottom=False, labeltop=False, left=False, right=False, labelleft=False, labelright=False, pad=4)
                ax.set_frame_on(False)
                fg.savefig('{0}/embedding_preactivation_layer{1!s}_finetuning{2!s}.png'.format(d['output_path'], d['current_hidden_layer'], d['current_finetuning_run']), transparent=True, pad_inches=0, dpi=600)
                plt.close()
        
        
        # log selected step
        with open('{0}/log.txt'.format(d['output_path']), mode='at', buffering=1) as fl:
            fl.write('\nadjusted selected step:{0}\n'.format(selected_step))


    print('done adjust_early_stopping.', flush=True)
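
# The command-line entry point for this script is not included above; a minimal
# sketch of how main(adjustments_path) might be invoked (the argparse usage and
# argument name are assumptions, not part of the original):
if __name__ == '__main__':
    import argparse
    parser = argparse.ArgumentParser(
        description='replace saved model variables with those from a manually selected early-stopping step')
    parser.add_argument('adjustments_path',
                        help='tab-separated file mapping design .json paths to selected steps')
    args = parser.parse_args()
    main(args.adjustments_path)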
import os
import shutil
import datasetIO
from dataclasses import datamatrix as DataMatrix


# load the data
print('loading dataset...', flush=True)
dataset = datasetIO.load_datamatrix('../../original_data/pratfelip_symlnk/patient_gene_pratfelip_nanostring_prepared_plus_clinical.pickle')
print(dataset, flush=True)

# discard samples
print('discarding samples...', flush=True)
dataset.discard(dataset.rowmeta['irrecist'] == 'stable disease', 0)
print(dataset, flush=True)

# save the data
print('saving data...', flush=True)
datasetIO.save_datamatrix('../../original_data/pratfelip_symlnk/patient_gene_pratfelip_nanostring_prepared_plus_clinical_no_stabledisease.pickle', dataset)
datasetIO.save_datamatrix('../../original_data/pratfelip_symlnk/patient_gene_pratfelip_nanostring_prepared_plus_clinical_no_stabledisease.txt.gz', dataset)
savefolder = '../../input_data/pratfelip_transposed_plus_clinical_no_stabledisease'
if not os.path.exists(savefolder):
    os.makedirs(savefolder)
datasetIO.save_splitdata(savefolder, dataset)
shutil.copyfile('../../original_data/pratfelip_symlnk/patient_gene_pratfelip_nanostring_prepared_plus_clinical_no_stabledisease.pickle', '{0}/datamatrix.pickle'.format(savefolder))
shutil.copyfile('../../original_data/pratfelip_symlnk/patient_gene_pratfelip_nanostring_prepared_plus_clinical_no_stabledisease.txt.gz', '{0}/datamatrix.txt.gz'.format(savefolder))

# load the data
print('loading dataset...', flush=True)
dataset = datasetIO.load_datamatrix('../../original_data/pratfelip_symlnk/patient_ft_pratfelip_only_clinical_and_deconv.pickle')
print(dataset, flush=True)

# discard samples
print('discarding samples...', flush=True)
Beispiel #20
0
import shutil
import numpy as np
import datasetIO
from matplotlib import pyplot as plt

# load the data
gene_atb = datasetIO.load_datamatrix(
    '../../original_data/GTEXv6plus/gene_tissue_recount2gtexv6_chosen_samples_counts.pickle'
)

# scale counts
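# the additive constant equals log(4e7), i.e. counts become counts / auc * 4e7
# (presumably rescaling to a target library size of 40 million reads)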
gene_atb.matrix = np.exp(
    np.log(gene_atb.matrix) -
    np.log(gene_atb.columnmeta['auc'].reshape(1, -1)) +
    (np.log(4) + 7 * np.log(10)))
gene_atb.matrixname += '_scaledcounts'
datasetIO.save_datamatrix(
    '../../original_data/GTEXv6plus/gene_tissue_recount2gtexv6_chosen_samples_scaledcounts.pickle',
    gene_atb)
datasetIO.save_datamatrix(
    '../../original_data/GTEXv6plus/gene_tissue_recount2gtexv6_chosen_samples_scaledcounts.txt.gz',
    gene_atb)

# shuffle the data
gene_atb.reorder(np.random.permutation(gene_atb.shape[0]), 0)
gene_atb.reorder(np.random.permutation(gene_atb.shape[1]), 1)
print(gene_atb)

# strip version from ensembl_gene_ids
gene_atb.rowlabels = np.array(
    [x.rsplit('.', maxsplit=1)[0] for x in gene_atb.rowlabels], dtype='object')

# add hgnc metadata
Beispiel #21
0
import gzip
import datasetIO
import numpy as np
from scipy.stats import hypergeom
# mutualinformation (used below) appears to be a project-local helper returning
# (mi, nmi, iqr); its import is not shown in the original

def main(dictionaries, year, datestamp, min_score, universe, n_prior,
         min_count):

    print('begin calc_term-term_stats_from_termite.py')

    print('dictionaries: {0}, {1}'.format(dictionaries[0], dictionaries[1]))
    print('year: {0}'.format(year))
    print('datestamp: {0}'.format(datestamp))
    print('min_score: {0!s}'.format(min_score))
    print('universe: {0}'.format(universe))
    print('n_prior: {0!s}'.format(n_prior))
    print('min_count: {0!s}'.format(min_count))

    # load counts datamatrix
    # this file is generated by count_term-term_pmids_from_termite.py
    print('loading counts datamatrix...')
    row_dictionary = dictionaries[
        0]  # 'HUCELL', 'ANAT', 'INDICATION', 'HUCELLANAT', 'HUCELLANATINDICATION'
    column_dictionary = dictionaries[
        1]  # 'HUCELL', 'ANAT', 'INDICATION', 'HUCELLANAT', 'HUCELLANATINDICATION'
    counts_datamatrix_path = '{0}_{1}_datamatrix_pmidcounts_year_{2}_datestamp_{3}_minscore_{4!s}.pickle'.format(
        row_dictionary, column_dictionary, year, datestamp, min_score)
    term_term = datasetIO.load_datamatrix(counts_datamatrix_path)
    print('counts_datamatrix_path: {0}'.format(counts_datamatrix_path))
    print(term_term)

    # find term-term pairs with sufficient counts
    print('finding term-term pairs with sufficient counts...')
    I, J = (term_term.matrix >= min_count).nonzero()
    num_sufficient = I.size
    print('term-term pairs with at least {0!s} counts: {1!s}'.format(
        min_count, num_sufficient))

    # convert counts to float
    print('converting counts to float...')
    term_term.matrix = np.float64(term_term.matrix)
    term_term.updatedtypeattribute()
    for field, values in term_term.rowmeta.items():
        if values.dtype == np.int64:
            term_term.rowmeta[field] = np.float64(values)
    for field, values in term_term.columnmeta.items():
        if values.dtype == np.int64:
            term_term.columnmeta[field] = np.float64(values)

    # set universe size
    print('setting universe size...')
    if universe == 'intersectionunion' or universe == 'union':
        universe_size = term_term.rowmeta['all_count_{0}'.format(universe)][0]
    elif universe == 'medline':
        universe_size = 1e8  # 3e7
        term_term.rowmeta['term_count_medline'] = term_term.rowmeta[
            'term_count_union'].copy()
        term_term.columnmeta['term_count_medline'] = term_term.columnmeta[
            'term_count_union'].copy()
    elif universe == 'infinity':
        universe_size = 1e16
        term_term.rowmeta['term_count_infinity'] = term_term.rowmeta[
            'term_count_union'].copy()
        term_term.columnmeta['term_count_infinity'] = term_term.columnmeta[
            'term_count_union'].copy()
    else:
        raise ValueError('invalid universe')

    # create matrices for select association statistics
    print('creating matrices for select association statistics...')
    selstats = ['mcc', 'mmcc', 'cos', 'mi', 'nmi', 'iqr']
    statmats = {}
    for selstat in selstats:
        statmats[selstat] = np.zeros(term_term.shape, dtype='float64')

    # calculate association statistics and write to dataframe
    print('calculating association statistics and writing to dataframe...')
    dataframe_path = '{0}_{1}_dataframe_yr_{2}_ds_{3}_ms_{4!s}_uv_{5}_np_{6!s}_mc_{7!s}.txt.gz'.format(
        row_dictionary, column_dictionary, year, datestamp, min_score,
        universe, n_prior, min_count)
    rowmetalabels = ['term_id', 'term_name']
    rowmetaheaders = [
        '{0}_id'.format(row_dictionary), '{0}_name'.format(row_dictionary)
    ]
    columnmetalabels = ['term_id', 'term_name']
    columnmetaheaders = [
        '{0}_id'.format(column_dictionary),
        '{0}_name'.format(column_dictionary)
    ]
    statheaders = [
        'tp', 'fn', 'tn', 'fp', 'ap', 'an', 'pp', 'pn', 'n', 'tpr', 'fnr',
        'tnr', 'fpr', 'ppv', 'fdr', 'npv', 'fomr', 'acc', 'mcr', 'prev', 'plr',
        'nlr', 'dor', 'drr', 'darr', 'mrr', 'marr', 'f1', 'mcc', 'mmcc', 'cos',
        'fnlp', 'sig', 'lrr', 'lrr_se', 'lrr_lb95', 'lrr_ub95', 'drr_lb95',
        'drr_ub95', 'lor', 'lor_se', 'lor_lb95', 'lor_ub95', 'dor_lb95',
        'dor_ub95', 'mi', 'nmi', 'iqr'
    ]
    with gzip.open(dataframe_path,
                   mode='wt',
                   encoding='utf-8',
                   errors='surrogateescape') as fw:
        writelist = ['{0}_dictidname'.format(row_dictionary)
                     ] + rowmetaheaders + [
                         '{0}_dictidname'.format(column_dictionary)
                     ] + columnmetaheaders + statheaders
        fw.write('\t'.join(writelist) + '\n')
        for k, (i, j) in enumerate(zip(I, J)):
            if np.mod(k, 1000) == 0 or k + 1 == num_sufficient:
                print('working on term-term pair {0!s} of {1!s}...'.format(
                    k + 1, num_sufficient))

            # confusion matrix
            tp = term_term.matrix[i, j]
            fp = term_term.rowmeta['term_count_{0}'.format(universe)][i] - tp
            fn = term_term.columnmeta['term_count_{0}'.format(
                universe)][j] - tp
            tn = universe_size - (tp + fp + fn)

            # incorporate a random prior with effective sample size = n_prior,
            # where prior distribution conforms to empirical marginal distributions
            Rr = (tp + fp) / (fn + tn)  # ratio of rows of confusion matrix
            Rc = (tp + fn) / (fp + tn)  # ratio of columns of confusion matrix
            tp_prior = n_prior * Rc * Rr / (
                Rc * Rr + Rr + Rc + 1
            )  # solve for tp given constraints tp/fn=Rr, fp/tn=Rr, tp/fp=Rc, fn/tn=Rc, tp+fp+fn+tn=n_eff
            fp_prior = tp_prior / Rc
            fn_prior = tp_prior / Rr
            tn_prior = tp_prior / Rc / Rr
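            # tp_prior + fp_prior + fn_prior + tn_prior == n_prior, and the prior
            # cells preserve the empirical marginal ratios Rr and Rc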
            tp += tp_prior
            fp += fp_prior
            fn += fn_prior
            tn += tn_prior

            ap = tp + fn
            an = fp + tn
            pp = tp + fp
            pn = tn + fn
            n = tn + fp + fn + tp

            tpr = tp / ap  # sensitivity, recall
            fnr = fn / ap  # 1-tpr, 1-sensitivity, 1-recall
            tnr = tn / an  # specificity
            fpr = fp / an  # 1-tnr, 1-specificity

            ppv = tp / pp  # precision
            fdr = fp / pp  # 1-ppv, 1-precision
            npv = tn / pn
            fomr = fn / pn  # 1-npv

            acc = (tp + tn) / n
            mcr = (fp + fn) / n  # 1-acc
            prev = ap / n

            plr = (tp / fp) / (
                ap / an
            )  # tpr/fpr, sensitivity/(1-specificity), ratio of positives to negatives in positive predictions relative to ratio in whole sample, higher is better
            nlr = (fn / tn) / (
                ap / an
            )  # fnr/tnr, (1-sensitivity)/specificity, ratio of positives to negatives in negative predictions relative to ratio in whole sample, lower is better
            dor = (tp / fp) / (
                fn / tn
            )  # plr/nlr, ratio of positives to negatives in positive predictions, divided by ratio of positives to negatives in negative predictions
            drr = (tp / pp) / (
                fn / pn
            )  # ppv/fomr, relative risk or risk ratio, fraction of positives in positive predictions divided by fraction of positives in negative predictions
            darr = (tp / pp) - (
                fn / pn
            )  # ppv - fomr, absolute risk reduction, fraction of positives in positive predictions minus fraction of positives in negative predictions
            mrr = (tp / pp) / (
                ap / n
            )  # ppv/prev, modified (by me) relative risk or risk ratio, fraction of positives in positive predictions divided by fraction of positives in whole sample
            marr = (tp / pp) - (
                ap / n
            )  # ppv - prev, modified (by me) absolute risk reduction, fraction of positives in positive predictions minus fraction of positives in whole sample

            f1 = (1 + (1**2)) * ppv * tpr / ((1**2) * ppv + tpr)
            mcc = (tp * tn - fp * fn) / np.sqrt(
                (tp + fp) * (tp + fn) * (tn + fp) * (tn + fn))
            mmcc = 1 - np.sqrt(
                (fp * fn) / ((tp + fp) * (tp + fn))
            )  # modified (by me), equivalent to 1 + mcc with tn forced to 0
            cos = tp / np.sqrt((tp + fp) * (tp + fn))  # ochiai
            fnlp = -hypergeom.logsf(tp, n, ap, pp, loc=1) / np.log(10)
            sig = fnlp > np.log10(term_term.size) - np.log10(0.05)
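            # fnlp is the -log10 upper-tail hypergeometric probability of observing
            # at least tp successes (an enrichment p-value); sig applies a
            # Bonferroni-style cutoff of 0.05 / (number of term-term pairs)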

            lrr = np.log10(tp) - np.log10(tp + fp) - np.log10(fn) + np.log10(
                fn + tn)  # log10 of relative risk
            lrr_se = np.sqrt(
                fp / tp / (tp + fp) + tn / fn / (fn + tn)) / np.log(
                    10)  # standard error of log10 of relative risk
            lrr_lb95 = lrr - 1.96 * lrr_se
            lrr_ub95 = lrr + 1.96 * lrr_se
            drr_lb95 = 10**lrr_lb95
            drr_ub95 = 10**lrr_ub95

            lor = np.log10(tp) - np.log10(fp) - np.log10(fn) + np.log10(
                tn)  # log10 of odds ratio
            lor_se = np.sqrt(1 / tp + 1 / fp + 1 / fn + 1 / tn) / np.log(
                10)  # standard error of log10 of odds ratio
            lor_lb95 = lor - 1.96 * lor_se
            lor_ub95 = lor + 1.96 * lor_se
            dor_lb95 = 10**lor_lb95
            dor_ub95 = 10**lor_ub95

            mi, nmi, iqr = mutualinformation(
                tp, fp, fn, tn
            )  # mutual information, normalized mutual information, information quality ratio

            count_stats = [tp, fn, tn, fp, ap, an, pp, pn, n]
            other_stats = [
                tpr, fnr, tnr, fpr, ppv, fdr, npv, fomr, acc, mcr, prev, plr,
                nlr, dor, drr, darr, mrr, marr, f1, mcc, mmcc, cos, fnlp, sig,
                lrr, lrr_se, lrr_lb95, lrr_ub95, drr_lb95, drr_ub95, lor,
                lor_se, lor_lb95, lor_ub95, dor_lb95, dor_ub95, mi, nmi, iqr
            ]

            rowwritelist = [term_term.rowlabels[i]] + [
                term_term.rowmeta[l][i] if term_term.rowmeta[l].dtype
                == 'object' else str(term_term.rowmeta[l][i])
                for l in rowmetalabels
            ]
            columnwritelist = [term_term.columnlabels[j]] + [
                term_term.columnmeta[l][j] if term_term.columnmeta[l].dtype
                == 'object' else str(term_term.columnmeta[l][j])
                for l in columnmetalabels
            ]
            writelist = rowwritelist + columnwritelist + [
                str(s) for s in count_stats
            ] + ['{0:1.5g}'.format(s) for s in other_stats]
            fw.write('\t'.join(writelist) + '\n')

            statmats['mcc'][i, j] = mcc
            statmats['mmcc'][i, j] = mmcc
            statmats['cos'][i, j] = cos
            statmats['mi'][i, j] = mi
            statmats['nmi'][i, j] = nmi
            statmats['iqr'][i, j] = iqr

    # save matrices for select association statistics
    print('saving matrices for select association statistics...')
    for selstat in selstats:
        term_term.matrix = statmats[selstat]
        datasetIO.save_datamatrix(
            '{0}_{1}_datamatrix_{2}_yr_{3}_ds_{4}_ms_{5!s}_uv_{6}_np_{7!s}_mc_{8!s}.txt.gz'
            .format(row_dictionary, column_dictionary, selstat, year,
                    datestamp, min_score, universe, n_prior, min_count),
            term_term)
        datasetIO.save_datamatrix(
            '{0}_{1}_datamatrix_{2}_yr_{3}_ds_{4}_ms_{5!s}_uv_{6}_np_{7!s}_mc_{8!s}.pickle'
            .format(row_dictionary, column_dictionary, selstat, year,
                    datestamp, min_score, universe, n_prior, min_count),
            term_term)

    print('done calc_term-term_stats_from_termite.py')
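
# A minimal, self-contained sketch (not part of the original script) of the
# marginal-preserving prior used above: given raw confusion-matrix counts and an
# effective prior sample size, it returns the smoothed counts; it mirrors the
# tp_prior/fp_prior/fn_prior/tn_prior computation inside main().
def add_marginal_preserving_prior(tp, fp, fn, tn, n_prior):
    # ratios of the rows and columns of the confusion matrix
    Rr = (tp + fp) / (fn + tn)
    Rc = (tp + fn) / (fp + tn)
    # solve tp/fn = Rr, fp/tn = Rr, tp/fp = Rc, fn/tn = Rc, with the four prior
    # cells summing to n_prior
    tp_prior = n_prior * Rc * Rr / (Rc * Rr + Rr + Rc + 1)
    fp_prior = tp_prior / Rc
    fn_prior = tp_prior / Rr
    tn_prior = tp_prior / Rc / Rr
    return tp + tp_prior, fp + fp_prior, fn + fn_prior, tn + tn_prior

# example: smooth a sparse confusion matrix with an effective prior count of 0.1
# smoothed_tp, smoothed_fp, smoothed_fn, smoothed_tn = add_marginal_preserving_prior(5.0, 20.0, 30.0, 1e6, 0.1)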
Beispiel #22
0
import os
import copy
import pickle
import datasetIO
from matplotlib import pyplot as plt
# relu and sdae_reconstruction (used below) appear to be project-local helpers
# whose imports are not shown in the original

def main(study_name='your_study'):
    
    # load the data
    orientation = 'fat'
    partitions = ['train', 'valid', 'test']
    
    dataset = {}
    for partition in partitions:
        dataset[partition] = datasetIO.load_datamatrix('data/prepared_data/{0}/{1}.pickle'.format(orientation, partition))
        if 'all' not in dataset:
            dataset['all'] = copy.deepcopy(dataset[partition])
        else:
            dataset['all'].append(dataset[partition], 0)
    
    dataset[study_name] = {}
    for partition in partitions:
        dataset[study_name][partition] = datasetIO.load_datamatrix('data/prepared_data/{0}/{1}/{2}.pickle'.format(study_name, orientation, partition))
        if 'all' not in dataset[study_name]:
            dataset[study_name]['all'] = copy.deepcopy(dataset[study_name][partition])
        else:
            dataset[study_name]['all'].append(dataset[study_name][partition], 0)
    
    partitions.append('all')
    
    
    # create output directories
    if not os.path.exists('results'):
        os.mkdir('results')
    if not os.path.exists('results/sdae_features'):
        os.mkdir('results/sdae_features')
    if not os.path.exists('results/sdae_features/{0}'.format(study_name)):
        os.mkdir('results/sdae_features/{0}'.format(study_name))
    if not os.path.exists('results/sdae_features/{0}/{1}'.format(study_name, orientation)):
        os.mkdir('results/sdae_features/{0}/{1}'.format(study_name, orientation))
    
    
    # load the model
    activation_function, activation_function_name = (relu, 'relu')
    with open('results/autoencoder/fat/ns5_last2_first0.05_5layers_relu_variables.pickle', 'rb') as fr:
        W, Be, Bd = pickle.load(fr)[1:] # global_step, W, bencode, bdecode
    
    
    # get embeddings and reconstructions
    sdae = {}
    for partition in partitions:
        sdae[partition] = {}
        sdae[partition]['recon'], sdae[partition]['embed'], sdae[partition]['error'] = sdae_reconstruction(dataset[partition], W, Be, Bd, activation=activation_function, apply_activation_to_output=False, return_embedding=True, return_reconstruction_error=True)
        print('{0} error: {1:1.3g}'.format(partition, sdae[partition]['error']))
    
    sdae[study_name] = {}
    for partition in partitions:
        sdae[study_name][partition] = {}
        sdae[study_name][partition]['recon'], sdae[study_name][partition]['embed'], sdae[study_name][partition]['error'] = sdae_reconstruction(dataset[study_name][partition], W, Be, Bd, activation=activation_function, apply_activation_to_output=False, return_embedding=True, return_reconstruction_error=True)
        print('{0} {1} error: {2:1.3g}'.format(study_name, partition, sdae[study_name][partition]['error']))
    
    
    # visualize embedding
    if sdae['all']['embed'].shape[1] < 5:
        for nx in range(sdae['all']['embed'].shape[1]-1):
            for ny in range(nx+1, sdae['all']['embed'].shape[1]):
                
                #tissues = np.unique(dataset['all'].rowmeta['general_tissue'])
                tissues = ['Adipose Tissue', 'Adrenal Gland', 'Blood', 'Blood Vessel', 'Brain',
                           'Breast', 'Colon', 'Esophagus', 'Heart', 'Kidney', 'Liver', 'Lung', 'Muscle',
                           'Nerve', 'Ovary', 'Pancreas', 'Pituitary', 'Prostate', 'Salivary Gland', 'Skin',
                           'Small Intestine', 'Spleen', 'Stomach', 'Testis', 'Thyroid', 'Uterus', 'Vagina']
                tissue_abbrevs = ['AT', 'AG', 'B', 'BV', 'Bn',
                                  'Bt', 'C', 'E', 'H', 'K', 'Lr', 'Lg', 'M',
                                  'N', 'O', 'Ps', 'Py', 'Pe', 'SG', 'Sk',
                                  'SI', 'Sp', 'St', 'Ts', 'Td', 'U', 'V']
                cmap = plt.get_cmap('gist_rainbow')
                colors = [cmap(float((i+0.5)/len(tissues))) for i in range(len(tissues))]
                
                fg, ax = plt.subplots(1, 1, figsize=(6.5,4.3))
                ax.set_position([0.15/6.5, 0.15/4.3, 4.0/6.5, 4.0/4.3])
                for tissue, tissue_abbrev, color in zip(tissues, tissue_abbrevs, colors):
                    if tissue == '-666':
                        continue
                #        zorder = 0
                #        alpha = 0.05
                #        color = 'k'
                    else:
                        zorder = 1
                        alpha = 0.5
                    hit = dataset['all'].rowmeta['general_tissue'] == tissue
                    hidxs = hit.nonzero()[0]
                #    ax.plot(sdae['all']['embed'].matrix[hit,nx], sdae['all']['embed'].matrix[hit,ny], linestyle='None', linewidth=0, marker='o', markerfacecolor=color, markeredgecolor=color, markersize=2, markeredgewidth=0, alpha=alpha, zorder=zorder, label='{0}, {1}'.format(tissue_abbrev, tissue))
                    ax.plot(sdae['all']['embed'].matrix[hit,nx], sdae['all']['embed'].matrix[hit,ny], linestyle='None', linewidth=0, marker='o', markerfacecolor=color, markeredgecolor=color, markersize=0.2, markeredgewidth=0, alpha=alpha, zorder=zorder, label='{0}, {1}'.format(tissue_abbrev, tissue))
                    for hidx in hidxs:
                        ax.text(sdae['all']['embed'].matrix[hidx,nx], sdae['all']['embed'].matrix[hidx,ny], tissue_abbrev, horizontalalignment='center', verticalalignment='center', fontsize=4, color=color, alpha=alpha, zorder=zorder, label='{0}, {1}'.format(tissue_abbrev, tissue))
                ax.plot(sdae[study_name]['all']['embed'].matrix[:,nx], sdae[study_name]['all']['embed'].matrix[:,ny], linestyle='None', linewidth=0, marker='x', markerfacecolor='k', markeredgecolor='k', markersize=0.2, markeredgewidth=0, alpha=1, zorder=1, label=study_name)
                for hidx in range(sdae[study_name]['all']['embed'].shape[0]):
                    ax.text(sdae[study_name]['all']['embed'].matrix[hidx,nx], sdae[study_name]['all']['embed'].matrix[hidx,ny], 'X', horizontalalignment='center', verticalalignment='center', fontsize=4, color='k', alpha=1, zorder=1, label=study_name)
                ax.set_xlim(sdae['all']['embed'].matrix[:,nx].min(), sdae['all']['embed'].matrix[:,nx].max())
                ax.set_ylim(sdae['all']['embed'].matrix[:,ny].min(), sdae['all']['embed'].matrix[:,ny].max())
                ax.legend(loc='center left', bbox_to_anchor=(1, 0.5), borderaxespad=0, frameon=False, ncol=1, numpoints=1, markerscale=40, fontsize=8, labelspacing=0.25)
                ax.tick_params(axis='both', which='major', bottom='off', top='off', labelbottom='off', labeltop='off', left='off', right='off', labelleft='off', labelright='off', pad=4)
                ax.set_frame_on(False)
                fg.savefig('results/sdae_features/{0}/{1}/sdae2d_{2}_coloredby_general_tissue_x{3!s}_y{4!s}.png'.format(study_name, orientation, activation_function_name, nx, ny), transparent=True, pad_inches=0, dpi=600)
                ax.set_xlim(sdae[study_name]['all']['embed'].matrix[:,nx].min(), sdae[study_name]['all']['embed'].matrix[:,nx].max())
                ax.set_ylim(sdae[study_name]['all']['embed'].matrix[:,ny].min(), sdae[study_name]['all']['embed'].matrix[:,ny].max())
                fg.savefig('results/sdae_features/{0}/{1}/sdae2d_{2}_coloredby_general_tissue_x{3!s}_y{4!s}_zoom.png'.format(study_name, orientation, activation_function_name, nx, ny), transparent=True, pad_inches=0, dpi=600)
                plt.close()
    
    
    # save embedding
    datasetIO.save_datamatrix('results/sdae_features/{0}/{1}/sdae2d_{2}_datamatrix.txt.gz'.format(study_name, orientation, activation_function_name), sdae[study_name]['all']['embed'])
    datasetIO.save_datamatrix('results/sdae_features/{0}/{1}/sdae2d_{2}_datamatrix.pickle'.format(study_name, orientation, activation_function_name), sdae[study_name]['all']['embed'])
    datasetIO.save_datamatrix('results/sdae_features/{0}/{1}/sdae_reconstructions_{2}_datamatrix.txt.gz'.format(study_name, orientation, activation_function_name), sdae[study_name]['all']['recon'])
    datasetIO.save_datamatrix('results/sdae_features/{0}/{1}/sdae_reconstructions_{2}_datamatrix.pickle'.format(study_name, orientation, activation_function_name), sdae[study_name]['all']['recon'])
Example #23
def main(datamatrix_path, test_index, response_variable_name, valid_index,
         valid_fraction, feature_fraction, regularization_type,
         inverse_regularization_strength, intercept_scaling,
         pos_neg_weight_ratio, evaluation_statistic, save_weights, save_folder,
         datamatrix):

    print('loading datamatrix...', flush=False)
    if datamatrix is None or isinstance(datamatrix, str):
        dm = datasetIO.load_datamatrix(datamatrix_path)
    else:
        dm = datamatrix

    print('setting random seed with test_index {0!s}...'.format(test_index),
          flush=False)
    np.random.seed(test_index)

    print('getting bootstrap sample...', flush=False)
    all_indices = np.arange(dm.shape[0])
    boot_indices = np.random.choice(dm.shape[0], dm.shape[0], replace=True)
    test_indices = all_indices[~np.in1d(all_indices, boot_indices)]
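    # sampling dm.shape[0] rows with replacement leaves roughly (1 - 1/n)^n, about
    # 37%, of the rows out of the bootstrap sample; these out-of-bag rows become
    # the test set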

    print('reserving out-of-bag samples as test set...', flush=False)
    Y = {
        'test': dm.rowmeta[response_variable_name][test_indices].astype('bool')
    }
    X = {'test': dm.matrix[test_indices, :]}

    print('setting random seed with valid_index {0!s}...'.format(valid_index),
          flush=False)
    np.random.seed(valid_index)

    print('splitting bootstrap sample into training and validation sets...',
          flush=False)
    if isinstance(valid_fraction, str) and valid_fraction.lower() in ('loo',
                                                                      'loocv'):
        valid_fraction = 'loo'
        valid_indices = all_indices
        train_indices = all_indices
    else:
        valid_indices = np.random.choice(dm.shape[0],
                                         round(valid_fraction * dm.shape[0]),
                                         replace=False)
        train_indices = all_indices[~np.in1d(all_indices, valid_indices)]

    Y['train'] = dm.rowmeta[response_variable_name][boot_indices][
        train_indices].astype('bool')
    Y['valid'] = dm.rowmeta[response_variable_name][boot_indices][
        valid_indices].astype('bool')
    X['train'] = dm.matrix[boot_indices, :][train_indices, :]
    X['valid'] = dm.matrix[boot_indices, :][valid_indices, :]

    print('fitting and evaluating models...', flush=False)
    stages = ['validation', 'testing']
    data_subsets = ['fit', 'predict']
    performance_stats = [
        'auroc', 'auprc', 'brier', 'nll', 'tp', 'fn', 'tn', 'fp', 'ap', 'an',
        'pp', 'pn', 'n', 'tpr', 'fnr', 'tnr', 'fpr', 'ppv', 'fdr', 'npv',
        'fomr', 'acc', 'mcr', 'prev', 'plr', 'nlr', 'dor', 'drr', 'darr',
        'mrr', 'marr', 'mcc', 'fnlp', 'f1', 'f1_100', 'f1_50', 'f1_25',
        'f1_10', 'f1_5', 'f1_3', 'f1_2', 'f2', 'f3', 'f5', 'f10', 'f25', 'f50',
        'f100'
    ]
    if valid_fraction == 'loo':
        X.update({
            'validation': {
                'fit': X['train'],
                'predict': X['valid']
            },
            'testing': {
                'fit': X['train'],
                'predict': X['test']
            }
        })
        Y.update({
            'validation': {
                'fit': Y['train'],
                'predict': Y['valid']
            },
            'testing': {
                'fit': Y['train'],
                'predict': Y['test']
            }
        })
    else:
        X.update({
            'validation': {
                'fit': X['train'],
                'predict': X['valid']
            },
            'testing': {
                'fit': np.append(X['train'], X['valid'], 0),
                'predict': X['test']
            }
        })
        Y.update({
            'validation': {
                'fit': Y['train'],
                'predict': Y['valid']
            },
            'testing': {
                'fit': np.append(Y['train'], Y['valid']),
                'predict': Y['test']
            }
        })
    stat_subset = {}
    for stage in stages:
        print('working on {0} stage...'.format(stage), flush=False)

        if feature_fraction < 1:
            print('performing univariate feature selection...', flush=False)
            num_features = round(feature_fraction * dm.shape[1])
            test_stats, p_values = ttest_ind(
                X[stage]['fit'][Y[stage]['fit'], :],
                X[stage]['fit'][~Y[stage]['fit'], :],
                axis=0,
                equal_var=False,
                nan_policy='propagate')
            ranks = np.argsort(p_values)
            selected_indices = ranks[:num_features]
            selected_features = dm.columnlabels[selected_indices]
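            # features are ranked by the p-value of Welch's two-sample t-test
            # (equal_var=False) comparing positive vs negative samples, and only
            # the top feature_fraction of columns is kept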
            if stage == 'testing':
                print('plotting univariate test statistics...', flush=False)
                plt.figure()
                plt.hist(test_stats, 50)
                plt.savefig(
                    '{0}/univariate_test_statistics.png'.format(save_folder),
                    transparent=True,
                    pad_inches=0,
                    dpi=100)
                plt.figure()
                plt.hist(p_values, 50)
                plt.savefig('{0}/univariate_pvalues.png'.format(save_folder),
                            transparent=True,
                            pad_inches=0,
                            dpi=100)
                plt.figure()
                plt.hist(-np.log10(p_values), 50)
                plt.savefig('{0}/univariate_nlps.png'.format(save_folder),
                            transparent=True,
                            pad_inches=0,
                            dpi=100)
        else:
            print('skipping univariate feature selection...', flush=False)
            selected_indices = np.arange(dm.shape[1], dtype='int64')
            selected_features = dm.columnlabels.copy()
        print('selected {0!s} features...'.format(selected_features.size),
              flush=False)

        print('calculating class weights...', flush=False)
        pos_weight = np.sqrt(pos_neg_weight_ratio) * (
            (Y[stage]['fit'].size) / 2 / (Y[stage]['fit'].sum())
        )  # (assign weight to class)*(adjust for unbalanced classes)
        neg_weight = (1 / pos_weight) * (
            (Y[stage]['fit'].size) / 2 / ((~Y[stage]['fit']).sum())
        )  # (assign weight to class)*(adjust for unbalanced classes)
        class_weight = {True: pos_weight, False: neg_weight}

        print('fitting model...', flush=False)
        logistic_regression_model = LogisticRegression(
            penalty=regularization_type,
            C=inverse_regularization_strength,
            intercept_scaling=intercept_scaling,
            class_weight=class_weight).fit(
                X[stage]['fit'][:, selected_indices], Y[stage]['fit'])

        if stage == 'testing':
            print('plotting feature weights...', flush=False)
            iter_feature = DataMatrix(
                rowname='iteration',
                rowlabels=np.array(
                    ['test{0!s}_valid{1!s}'.format(test_index, valid_index)],
                    dtype='object'),
                rowmeta={
                    'intercept': logistic_regression_model.intercept_,
                    'test_index': np.array([test_index], dtype='int64'),
                    'valid_index': np.array([valid_index], dtype='int64')
                },
                columnname=dm.columnname,
                columnlabels=dm.columnlabels.copy(),
                columnmeta=copy.deepcopy(dm.columnmeta),
                matrixname='feature_weights',
                matrix=np.zeros((1, dm.shape[1]), dtype='float64'))
            feature_idx = {f: i for i, f in enumerate(dm.columnlabels)}
            for feature, weight in zip(selected_features,
                                       logistic_regression_model.coef_[0, :]):
                iter_feature.matrix[0, feature_idx[feature]] = weight
            plt.figure()
            plt.hist(iter_feature.matrix[0, :], 50)
            plt.savefig('{0}/feature_weights.png'.format(save_folder),
                        transparent=True,
                        pad_inches=0,
                        dpi=100)
            if feature_fraction < 1:
                plt.figure()
                plt.hist(iter_feature.matrix[0, selected_indices], 50)
                plt.savefig(
                    '{0}/feature_weights_selected.png'.format(save_folder),
                    transparent=True,
                    pad_inches=0,
                    dpi=100)

            if save_weights:
                print('saving feature weights...', flush=False)
                datasetIO.save_datamatrix(
                    '{0}/iter_feature_datamatrix.txt.gz'.format(save_folder),
                    iter_feature)

        print('creating datamatrix for performance statistics...', flush=False)
        stat_subset[stage] = DataMatrix(
            rowname='performance_statistic',
            rowlabels=np.array(performance_stats, dtype='object'),
            rowmeta={},
            columnname='data_subset',
            columnlabels=np.array(data_subsets, dtype='object'),
            columnmeta={},
            matrixname='classifier_performance_on_data_subsets',
            matrix=np.zeros((len(performance_stats), len(data_subsets)),
                            dtype='float64'))

        for j, subset in enumerate(stat_subset[stage].columnlabels):
            print('evaluating performance on {0} subset...'.format(subset),
                  flush=False)
            if valid_fraction == 'loo' and stage == 'validation' and subset == 'predict':
                P_pred = np.zeros(X[stage][subset].shape[0], dtype='float64')
                # the loop variables are named so they do not shadow the test_index
                # function argument, which is reused later in the testing stage
                for loo_train_indices, loo_test_indices in LeaveOneOut().split(
                        X[stage][subset]):
                    loo_model = LogisticRegression(
                        penalty=regularization_type,
                        C=inverse_regularization_strength,
                        intercept_scaling=intercept_scaling,
                        class_weight=class_weight).fit(
                            X[stage]['fit'][loo_train_indices, :][:, selected_indices],
                            Y[stage]['fit'][loo_train_indices])
                    P_pred[loo_test_indices] = loo_model.predict_proba(
                        X[stage][subset][loo_test_indices, :][:, selected_indices]
                    )[:, loo_model.classes_ == 1][0][0]
            else:
                P_pred = logistic_regression_model.predict_proba(
                    X[stage][subset][:, selected_indices]
                )[:, logistic_regression_model.classes_ == 1]
            Y_pred = P_pred > 0.5

            auroc = roc_auc_score(Y[stage][subset], P_pred)
            auprc = average_precision_score(Y[stage][subset], P_pred)
            brier = brier_score_loss(Y[stage][subset], P_pred)
            nll = log_loss(Y[stage][subset], P_pred)

            tn, fp, fn, tp = confusion_matrix(Y[stage][subset], Y_pred).ravel()

            # incorporate a prior with effective sample size = n_eff, where prior represents random predictions
            n_eff = 1
            prevalence = (tp + fn) / (tn + fp + fn + tp)
            tp += n_eff * prevalence / 2
            fn += n_eff * prevalence / 2
            tn += n_eff * (1 - prevalence) / 2
            fp += n_eff * (1 - prevalence) / 2
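            # with n_eff = 1, this spreads one pseudo-observation across the
            # confusion matrix in proportion to prevalence, so the ratios computed
            # below avoid zero denominators whenever both classes are present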

            ap = tp + fn
            an = fp + tn
            pp = tp + fp
            pn = tn + fn
            n = tn + fp + fn + tp

            tpr = tp / ap  # sensitivity, recall
            fnr = fn / ap  # 1-tpr, 1-sensitivity, 1-recall
            tnr = tn / an  # specificity
            fpr = fp / an  # 1-tnr, 1-specificity

            ppv = tp / pp  # precision
            fdr = fp / pp  # 1-ppv, 1-precision
            npv = tn / pn
            fomr = fn / pn  # 1-npv

            acc = (tp + tn) / n
            mcr = (fp + fn) / n  # 1-acc
            prev = ap / n

            plr = (tp / fp) / (
                ap / an
            )  # tpr/fpr, sensitivity/(1-specificity), ratio of positives to negatives in positive predictions relative to ratio in whole sample, higher is better
            nlr = (fn / tn) / (
                ap / an
            )  # fnr/tnr, (1-sensitivity)/specificity, ratio of positives to negatives in negative predictions relative to ratio in whole sample, lower is better
            dor = (tp / fp) / (
                fn / tn
            )  # plr/nlr, ratio of positives to negatives in positive predictions, divided by ratio of positives to negatives in negative predictions
            drr = (tp / pp) / (
                fn / pn
            )  # ppv/fomr, relative risk or risk ratio, fraction of positives in positive predictions divided by fraction of positives in negative predictions
            darr = (tp / pp) - (
                fn / pn
            )  # ppv - fomr, absolute risk reduction, fraction of positives in positive predictions minus fraction of positives in negative predictions
            mrr = (tp / pp) / (
                ap / n
            )  # ppv/prev, modified (by me) relative risk or risk ratio, fraction of positives in positive predictions divided by fraction of positives in whole sample
            marr = (tp / pp) - (
                ap / n
            )  # ppv - prev, modified (by me) absolute risk reduction, fraction of positives in positive predictions minus fraction of positives in whole sample

            mcc = (tp * tn - fp * fn) / np.sqrt(
                (tp + fp) * (tp + fn) * (tn + fp) * (tn + fn))
            fnlp = -hypergeom.logsf(tp, n, ap, pp, loc=1) / np.log(10)
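            # fnlp is the negative log10 of a hypergeometric tail p-value: the
            # probability of seeing at least tp true positives among pp predicted
            # positives when ap of the n samples are actually positive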

            precision = ppv
            recall = tpr
            f1 = (1 +
                  (1**2)) * precision * recall / ((1**2) * precision + recall)
            f1_100 = (1 + (1 / 100**2)) * precision * recall / (
                (1 / 100**2) * precision + recall)
            f1_50 = (1 + (1 / 50**2)) * precision * recall / (
                (1 / 50**2) * precision + recall)
            f1_25 = (1 + (1 / 25**2)) * precision * recall / (
                (1 / 25**2) * precision + recall)
            f1_10 = (1 + (1 / 10**2)) * precision * recall / (
                (1 / 10**2) * precision + recall)
            f1_5 = (1 + (1 / 5**2)) * precision * recall / (
                (1 / 5**2) * precision + recall)
            f1_3 = (1 + (1 / 3**2)) * precision * recall / (
                (1 / 3**2) * precision + recall)
            f1_2 = (1 + (1 / 2**2)) * precision * recall / (
                (1 / 2**2) * precision + recall)
            f2 = (1 +
                  (2**2)) * precision * recall / ((2**2) * precision + recall)
            f3 = (1 +
                  (3**2)) * precision * recall / ((3**2) * precision + recall)
            f5 = (1 +
                  (5**2)) * precision * recall / ((5**2) * precision + recall)
            f10 = (1 + (10**2)) * precision * recall / (
                (10**2) * precision + recall)
            f25 = (1 + (25**2)) * precision * recall / (
                (25**2) * precision + recall)
            f50 = (1 + (50**2)) * precision * recall / (
                (50**2) * precision + recall)
            f100 = (1 + (100**2)) * precision * recall / (
                (100**2) * precision + recall)
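            # each score above is F-beta = (1 + beta^2) * precision * recall /
            # (beta^2 * precision + recall): f1_100 through f1_2 use beta = 1/100
            # to 1/2 (precision-weighted), and f2 through f100 use beta = 2 to 100
            # (recall-weighted)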

            stat_subset[stage].matrix[:, j] = [
                auroc, auprc, brier, nll, tp, fn, tn, fp, ap, an, pp, pn, n,
                tpr, fnr, tnr, fpr, ppv, fdr, npv, fomr, acc, mcr, prev, plr,
                nlr, dor, drr, darr, mrr, marr, mcc, fnlp, f1, f1_100, f1_50,
                f1_25, f1_10, f1_5, f1_3, f1_2, f2, f3, f5, f10, f25, f50, f100
            ]

        print('saving performance statistics...', flush=False)
        datasetIO.save_datamatrix(
            '{0}/stat_subset_datamatrix_{1}.txt.gz'.format(save_folder, stage),
            stat_subset[stage])

        print('printing performance statistics...', flush=False)
        print('\t'.join(['stage', stat_subset[stage].rowname] +
                        stat_subset[stage].columnlabels.tolist()),
              flush=False)
        for stat, vals in zip(stat_subset[stage].rowlabels,
                              stat_subset[stage].matrix):
            print('\t'.join([stage, stat] +
                            ['{0:1.3g}'.format(v) for v in vals]),
                  flush=False)

    print('saving evaluation statistic...', flush=False)
    objective = stat_subset['validation'].select(evaluation_statistic,
                                                 'predict')
    with open('{0}/output.json'.format(save_folder),
              mode='wt',
              encoding='utf-8',
              errors='surrogateescape') as fw:
        json.dump(objective, fw, indent=2)

    print('done logistic_regression.py', flush=False)
Example #24
def main(study_name='your_study'):

    # load your data and create datamatrix object
    with open('data/original_data/{0}/ensembl_gene_ids.txt'.format(study_name),
              mode='rt',
              encoding='utf-8',
              errors='surrogateescape') as fr:
        ensembl_gene_ids = np.array([x.strip() for x in fr.read().split('\n')],
                                    dtype='object')

    with open('data/original_data/{0}/sample_ids.txt'.format(study_name),
              mode='rt',
              encoding='utf-8',
              errors='surrogateescape') as fr:
        sample_ids = np.array([x.strip() for x in fr.read().split('\n')],
                              dtype='object')

    counts_matrix = np.loadtxt(
        'data/original_data/{0}/expression_matrix.txt.gz'.format(study_name),
        dtype='float64',
        delimiter='\t',
        ndmin=2)
    total_counts_per_sample = counts_matrix.sum(0)

    gene_sample = dataclasses.datamatrix(
        rowname='ensembl_gene_id',
        rowlabels=ensembl_gene_ids,
        rowmeta={},
        columnname='sample_id',
        columnlabels=sample_ids,
        columnmeta={'total_counts': total_counts_per_sample},
        matrixname='rnaseq_gene_counts_from_{0}'.format(study_name),
        matrix=counts_matrix)
    del ensembl_gene_ids, sample_ids, counts_matrix, total_counts_per_sample

    # scale counts
    gene_sample.matrix = np.exp(
        np.log(gene_sample.matrix) -
        np.log(gene_sample.columnmeta['total_counts'].reshape(1, -1)) +
        (np.log(4) + 7 * np.log(10)))
    gene_sample.matrixname = 'rnaseq_scaled_counts_from_{0}'.format(study_name)
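    # the exp/log expression above is counts / total_counts * 4e7 computed in log
    # space, i.e. every sample is rescaled to a common library size of 4*10^7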

    # shuffle the data
    gene_sample.reorder(np.random.permutation(gene_sample.shape[0]), 0)
    gene_sample.reorder(np.random.permutation(gene_sample.shape[1]), 1)
    print(gene_sample)

    # load the reference data
    gene_sample_ref = datasetIO.load_datamatrix(
        'data/prepared_data/fat/train.pickle').totranspose()
    print(gene_sample_ref)

    # align genes
    tobediscarded = ~np.in1d(gene_sample.rowlabels,
                             gene_sample_ref.rowmeta['ensembl_gene_id'])
    gene_sample.discard(tobediscarded, 0)
    missing_ensembl_ids = gene_sample_ref.rowmeta['ensembl_gene_id'][~np.in1d(
        gene_sample_ref.rowmeta['ensembl_gene_id'], gene_sample.rowlabels)]
    gene_sample = gene_sample.tolabels(
        rowlabels=gene_sample_ref.rowmeta['ensembl_gene_id'].copy(),
        columnlabels=[])
    gene_sample.rowlabels = gene_sample_ref.rowlabels.copy()
    gene_sample.rowname = gene_sample_ref.rowname
    for k, v in gene_sample_ref.rowmeta.items():
        gene_sample.rowmeta[k] = v.copy()
    gene_sample.rowmeta['is_missing'] = np.in1d(
        gene_sample.rowmeta['ensembl_gene_id'], missing_ensembl_ids)
    gene_sample.rowmeta['all_zero'] = (gene_sample.matrix == 0).all(1)
    print('missing data for {0!s} genes'.format(
        gene_sample.rowmeta['is_missing'].sum()))
    print('no counts for {0!s} genes'.format(
        gene_sample.rowmeta['all_zero'].sum()))
    print(gene_sample)

    # handle zeros
    nonzeromins = np.zeros(gene_sample.shape[1], dtype='float64')
    for j in range(gene_sample.shape[1]):
        nonzeromins[j] = gene_sample.matrix[gene_sample.matrix[:, j] > 0,
                                            j].min()
        gene_sample.matrix[gene_sample.matrix[:, j] == 0,
                           j] = nonzeromins[j] / 2.0
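    # zeros are replaced by half of each sample's smallest nonzero value so that
    # the log2 transform below is finite everywhere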

    # distributions
    #    plt.figure(); plt.hist(gene_sample.matrix[:,:5], 50)
    #    plt.figure(); plt.hist(((gene_sample.matrix[:5,:] - gene_sample.matrix[:5,:].mean(1, keepdims=True))/gene_sample.matrix[:5,:].std(1, ddof=1, keepdims=True)).T, 10)

    # log2
    gene_sample.matrix = np.log2(gene_sample.matrix)

    # distributions
    #    plt.figure(); plt.hist(gene_sample.matrix[:,:5], 50)
    #    plt.figure(); plt.hist(((gene_sample.matrix[:5,:] - gene_sample.matrix[:5,:].mean(1, keepdims=True))/gene_sample.matrix[:5,:].std(1, ddof=1, keepdims=True)).T, 10)

    # normalize samples
    median_shift_from_median = np.median(
        gene_sample.matrix -
        gene_sample.rowmeta['median_sample_ref'].reshape(-1, 1), 0)
    gene_sample.matrix -= median_shift_from_median.reshape(1, -1)
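    # each sample is shifted so that the median of its deviation from the
    # reference median sample (median_sample_ref, copied from the reference
    # rowmeta above) is zero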

    # distributions
    #    plt.figure(); plt.hist(gene_sample.matrix[:,:5], 50)
    #    plt.figure(); plt.hist(((gene_sample.matrix[:5,:] - gene_sample.matrix[:5,:].mean(1, keepdims=True))/gene_sample.matrix[:5,:].std(1, ddof=1, keepdims=True)).T, 10)

    # standardize the data
    gene_sample.matrix = (
        gene_sample.matrix - gene_sample.rowmeta['row_mean_ref'].reshape(
            -1, 1)) / gene_sample.rowmeta['row_stdv_ref'].reshape(-1, 1)

    # handle missing genes
    gene_sample.matrix[gene_sample.rowmeta['is_missing'], :] = 0
    #    gene_sample.matrix[gene_sample.rowmeta['is_missing'],:] = gene_sample_ref.matrix[gene_sample.rowmeta['is_missing'],:].min(1, keepdims=True)/2.0

    # distributions
    #    plt.figure(); plt.hist(gene_sample.matrix[:,:5], 50)
    #    plt.figure(); plt.hist(gene_sample.matrix[:5,:].T, 10)
    #    plt.figure(); plt.hist(gene_sample.matrix.reshape(-1), 1000)

    # transpose the data
    atb_gene = gene_sample.totranspose()

    # split the data
    test_fraction = 0.1
    tobepopped = np.random.permutation(gene_sample.shape[0]) < round(
        max([test_fraction * gene_sample.shape[0], 2.0]))
    gene_sample_test = gene_sample.pop(tobepopped, 0)
    valid_fraction = 0.1
    tobepopped = np.random.permutation(gene_sample.shape[0]) < round(
        max([valid_fraction * gene_sample.shape[0], 2.0]))
    gene_sample_valid = gene_sample.pop(tobepopped, 0)
    gene_sample_train = gene_sample
    del gene_sample, tobepopped

    # save the data
    if not os.path.exists('data/prepared_data'):
        os.mkdir('data/prepared_data')
    if not os.path.exists('data/prepared_data/{0}'.format(study_name)):
        os.mkdir('data/prepared_data/{0}'.format(study_name))
    if not os.path.exists('data/prepared_data/{0}/skinny'.format(study_name)):
        os.mkdir('data/prepared_data/{0}/skinny'.format(study_name))
    datasetIO.save_datamatrix(
        'data/prepared_data/{0}/skinny/test.pickle'.format(study_name),
        gene_sample_test)
    datasetIO.save_datamatrix(
        'data/prepared_data/{0}/skinny/valid.pickle'.format(study_name),
        gene_sample_valid)
    datasetIO.save_datamatrix(
        'data/prepared_data/{0}/skinny/train.pickle'.format(study_name),
        gene_sample_train)
    del gene_sample_test, gene_sample_valid, gene_sample_train

    # split the data
    test_fraction = 0.1
    tobepopped = np.random.permutation(atb_gene.shape[0]) < round(
        max([test_fraction * atb_gene.shape[0], 2.0]))
    atb_gene_test = atb_gene.pop(tobepopped, 0)
    valid_fraction = 0.1
    tobepopped = np.random.permutation(atb_gene.shape[0]) < round(
        max([valid_fraction * atb_gene.shape[0], 2.0]))
    atb_gene_valid = atb_gene.pop(tobepopped, 0)
    atb_gene_train = atb_gene
    del atb_gene, tobepopped

    # save the data
    if not os.path.exists('data/prepared_data'):
        os.mkdir('data/prepared_data')
    if not os.path.exists('data/prepared_data/{0}'.format(study_name)):
        os.mkdir('data/prepared_data/{0}'.format(study_name))
    if not os.path.exists('data/prepared_data/{0}/fat'.format(study_name)):
        os.mkdir('data/prepared_data/{0}/fat'.format(study_name))
    datasetIO.save_datamatrix(
        'data/prepared_data/{0}/fat/test.pickle'.format(study_name),
        atb_gene_test)
    datasetIO.save_datamatrix(
        'data/prepared_data/{0}/fat/valid.pickle'.format(study_name),
        atb_gene_valid)
    datasetIO.save_datamatrix(
        'data/prepared_data/{0}/fat/train.pickle'.format(study_name),
        atb_gene_train)
Example #25
print('values are p-values with non-significant associations (pvalue > 1e-4) imputed with pvalue=1', flush=True)
gene_atb = datasetIO.load_datamatrix('../../original_data/impc/mousegeneid_mousephenotypeid_datamatrix_trimmed.csv.gz', delimiter=',', getmetadata=False) # (3455, 295)
gene_atb.rowname = 'mgd_id'
gene_atb.columnname = 'mp_id'
gene_atb.matrixname = 'gene_phenotype_associations_from_impc'

# threshold the data
print('thresholding data...', flush=True)
print('because significant associations have p-value 1e-4 or less, relative p-values are probably not informative and it is better to threshold', flush=True)
gene_atb.matrix = np.float64(gene_atb.matrix < 1)
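# since non-significant entries were imputed with p = 1, "p < 1" selects exactly
# the significant associations, turning the matrix into 0/1 indicators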
gene_atb.matrixname += '_thresholded'
print('matrix sparsity: {0!s}, row median sparsity: {1!s}, column median sparsity: {2!s}'.format(gene_atb.matrix.sum()/gene_atb.size, np.median(gene_atb.matrix.sum(1)/gene_atb.shape[1]), np.median(gene_atb.matrix.sum(0)/gene_atb.shape[0])), flush=True)

# save thresholded data
print('saving thresholded data...', flush=True)
datasetIO.save_datamatrix('../../original_data/impc/mousegeneid_mousephenotypeid_datamatrix_trimmed_thresholded.pickle', gene_atb)
datasetIO.save_datamatrix('../../original_data/impc/mousegeneid_mousephenotypeid_datamatrix_trimmed_thresholded.txt.gz', gene_atb)

# shuffle the data
print('shuffling data...', flush=True)
gene_atb.reorder(np.random.permutation(gene_atb.shape[0]), 0)
gene_atb.reorder(np.random.permutation(gene_atb.shape[1]), 1)
print(gene_atb, flush=True)

# add hgnc metadata
print('adding hgnc metadata...', flush=True)
hgncmetadata = mapper.annotate_genes(field='mgd_id', values=gene_atb.rowlabels, metadatapath='../../mappings/hgnc/hgnc_20181016_complete_set.txt', drop_duplicates=True)
gene_atb.rowmeta.update(hgncmetadata)
gene_atb.rowname = 'ensembl_gene_id'
gene_atb.rowlabels = gene_atb.rowmeta['ensembl_gene_id'].copy()
del gene_atb.rowmeta['ensembl_gene_id']
Example #26
        print('WARNING! Rows do not match!', flush=True)

# print row metadata
print('printing row metadata...', flush=True)
for k,v in dataset.rowmeta.items():
    print(k, v.shape, v.dtype, v[:3], flush=True)

# print column metadata
print('printing column metadata...', flush=True)
for k,v in dataset.columnmeta.items():
    print(k, v.shape, v.dtype, v[:3], flush=True)


# save the data
print('saving data with deconv variables...', flush=True)
datasetIO.save_datamatrix('../../original_data/hugolo_symlnk/patient_gene_hugolo_rnaseq_prepared_plus_clinical_plus_deconv.pickle', dataset)
datasetIO.save_datamatrix('../../original_data/hugolo_symlnk/patient_gene_hugolo_rnaseq_prepared_plus_clinical_plus_deconv.txt.gz', dataset)
savefolder = '../../input_data/hugolo_transposed_plus_clinical_plus_deconv'
if not os.path.exists(savefolder):
    os.makedirs(savefolder)
datasetIO.save_splitdata(savefolder, dataset)
shutil.copyfile('../../original_data/hugolo_symlnk/patient_gene_hugolo_rnaseq_prepared_plus_clinical_plus_deconv.pickle', '{0}/datamatrix.pickle'.format(savefolder))
shutil.copyfile('../../original_data/hugolo_symlnk/patient_gene_hugolo_rnaseq_prepared_plus_clinical_plus_deconv.txt.gz', '{0}/datamatrix.txt.gz'.format(savefolder))

# discard genes
print('discarding genes...', flush=True)
dataset.discard(dataset.columnmeta['is_gene'], 1)
print(dataset, flush=True)

# save the data
print('saving data with only clinical and deconv variables...', flush=True)
Example #27
def main():

    # load dataset info
    print('loading dataset info...', flush=True)
    dataset_info_path = 'datasets/candidate_features/dataset_info.txt'
    dataset_infos = datasetIO.load_datasetinfo(dataset_info_path)

    # specify results folder
    print('specifying results folder...', flush=True)
    results_folder = 'datasets/nonredundant_features'
    if not os.path.exists(results_folder):
        os.mkdir(results_folder)

    # iterate over datasets
    print('iterating over datasets...', flush=True)
    for dataset_info in dataset_infos:

        #        # just work with hpatissuesmrna for testing/debugging the pipeline
        #        if dataset_info['abbreviation'] != 'hpatissuesmrna_cleaned':
        #            print('skipping {0}. not in testing set...'.format(dataset_info['abbreviation']), flush=True)
        #            continue

        # check if another python instance is already working on this dataset
        if os.path.exists('{0}/{1}_in_progress.txt'.format(
                results_folder, dataset_info['abbreviation'])):
            print('skipping {0}. already in progress...'.format(
                dataset_info['abbreviation']),
                  flush=True)
            continue

        # log start of processing
        with open('{0}/{1}_in_progress.txt'.format(
                results_folder, dataset_info['abbreviation']),
                  mode='wt',
                  encoding='utf-8',
                  errors='surrogateescape') as fw:
            print('working on {0}...'.format(dataset_info['abbreviation']),
                  flush=True)
            fw.write('working on {0}...'.format(dataset_info['abbreviation']))

        # load dataset
        print('loading dataset...', flush=True)
        gene_atb = datasetIO.load_datamatrix(datasetpath=dataset_info['path'])
        gene_atb.columnmeta['isrowstat'] = gene_atb.columnmeta[
            'isrowstat'].astype('int64').astype('bool')

        # decide feature similarity metric
        print('deciding feature similarity metric...', flush=True)
        if ('standardized' in dataset_info['abbreviation']
                or 'cleaned' in dataset_info['abbreviation']
            ) and (gene_atb.matrix == 0).sum() / gene_atb.size <= 0.5:
            # dataset is many-valued and filled-in
            print('    dataset is many-valued and filled-in...', flush=True)
            print('    using spearman for similarity...', flush=True)
            dataset_info['feature_similarity_metric'] = 'spearman'
            dataset_info['feature_similarity_threshold'] = np.sqrt(0.5)
        else:
            # dataset is binary or tertiary or sparse
            print('    dataset is binary, tertiary, or sparse...', flush=True)
            print('    using cosine for similarity...', flush=True)
            dataset_info['feature_similarity_metric'] = 'cosine'
            dataset_info['feature_similarity_threshold'] = np.sqrt(0.5)
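        # in both branches the threshold sqrt(0.5) ~ 0.707 treats two features as
        # redundant when their squared correlation / cosine similarity exceeds 0.5,
        # i.e. roughly when they share at least half of their variance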

        # calculate feature similarity
        print('calculating feature similarity...', flush=True)
        atb_atb = gene_atb.tosimilarity(
            axis=1, metric=dataset_info['feature_similarity_metric'])

        # prioritize feature groups
        print('prioritizing feature groups...', flush=True)
        are_similar_features = np.abs(
            atb_atb.matrix) > dataset_info['feature_similarity_threshold']
        feature_group_size = are_similar_features.sum(1).astype('float64')
        feature_group_score = (np.abs(
            atb_atb.matrix) * are_similar_features).sum(1) / feature_group_size
        feature_priority = np.zeros(gene_atb.shape[1], dtype='float64')
        feature_priority[gene_atb.columnlabels == 'mean'] = 1.0
        feature_priority[gene_atb.columnlabels == 'stdv'] = 0.5
        feature_infos = list(
            zip(np.arange(gene_atb.shape[1], dtype='int64'),
                gene_atb.columnlabels.copy(), feature_group_size.copy(),
                feature_priority.copy(), feature_group_score.copy()))
        feature_infos.sort(key=itemgetter(4), reverse=True)
        feature_infos.sort(key=itemgetter(3), reverse=True)
        feature_infos.sort(key=itemgetter(2), reverse=True)
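        # python sorts are stable, so applying the three sorts in reverse order of
        # importance ranks features primarily by group size, then by the manual
        # priority (mean/stdv first), then by mean within-group similarity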
        #        for feature_info in feature_infos:
        #            print('{0:1.3g}, {1}, {2:1.3g}, {3:1.3g}, {4:1.3g}'.format(feature_info[0], feature_info[1], feature_info[2], feature_info[3], feature_info[4]))
        sorted_feature_indices = np.array(
            [feature_info[0] for feature_info in feature_infos], dtype='int64')
        atb_atb.reorder(sorted_feature_indices, axis=0)
        atb_atb.reorder(sorted_feature_indices, axis=1)
        gene_atb.reorder(sorted_feature_indices, axis=1)
        are_similar_features = are_similar_features[
            sorted_feature_indices, :][:, sorted_feature_indices]

        # group similar features
        print('grouping similar features...', flush=True)
        tobediscarded = np.zeros(gene_atb.shape[1], dtype='bool')
        gene_atb.columnmeta['similar_features'] = np.full(gene_atb.shape[1],
                                                          '',
                                                          dtype='object')
        gene_atb.columnmeta['preferred_rowstat'] = np.full(gene_atb.shape[1],
                                                           '',
                                                           dtype='object')
        rowstats = gene_atb.columnlabels[gene_atb.columnmeta['isrowstat']]
        with open('{0}/{1}_feature_groups.txt'.format(
                results_folder, dataset_info['abbreviation']),
                  mode='wt',
                  encoding='utf-8',
                  errors='surrogateescape') as fw:
            for i, feature in enumerate(gene_atb.columnlabels):
                if ~tobediscarded[i]:
                    # find similar features
                    print('    finding features similar to feature "{0}"...'.
                          format(feature),
                          flush=True)
                    similarity_hit = are_similar_features[i, :]
                    similarity_hit = np.logical_and(
                        similarity_hit, ~tobediscarded)  # just what's new
                    similarity_hit[:i] = False
                    similar_features = gene_atb.columnlabels[similarity_hit]
                    similarity_values = atb_atb.matrix[i, similarity_hit]
                    rowstat_is_in_group = np.in1d(rowstats, similar_features)
                    gene_atb.columnmeta['similar_features'][i] = '|'.join(
                        similar_features.tolist())
                    if rowstat_is_in_group.any():
                        # replace feature with summary stat
                        gene_atb.columnmeta['preferred_rowstat'][i] = rowstats[
                            rowstat_is_in_group.nonzero()[0][0]]
                        gene_atb.matrix[:, i] = gene_atb.select(
                            [], gene_atb.columnmeta['preferred_rowstat'][i])
                        print(
                            '        replacing feature "{0}" with summary stat "{1}"...'
                            .format(
                                feature,
                                gene_atb.columnmeta['preferred_rowstat'][i]),
                            flush=True)
                    elif similarity_hit.sum() > 1:
                        # replace feature with group average
                        print(
                            '        replacing feature "{0}" with average of {1!s} features...'
                            .format(feature, similarity_hit.sum()),
                            flush=True)
                        feature_weight = atb_atb.matrix[i, similarity_hit]
                        feature_weight = feature_weight / np.sum(
                            np.abs(feature_weight))
                        gene_atb.matrix[:, i] = (
                            gene_atb.matrix[:, similarity_hit] *
                            (feature_weight.reshape(1, -1))).sum(1)
                    else:
                        print('        no similar features...', flush=True)
                    fw.write('\t'.join([
                        '{0}|{1:1.6g}'.format(f, v)
                        for f, v in zip(similar_features, similarity_values)
                    ]) + '\n')
                    similarity_hit[i] = False
                    tobediscarded = np.logical_or(tobediscarded,
                                                  similarity_hit)

        # discard features absorbed into group features
        print('discarding features absorbed into group features...',
              flush=True)
        if tobediscarded.any():
            # discard features
            print('    discarding {0!s} features. {1!s} features remaining...'.
                  format(tobediscarded.sum(), (~tobediscarded).sum()),
                  flush=True)
            gene_atb.discard(tobediscarded, axis=1)
        else:
            # keep all features
            print('    no features to discard. {0!s} features remaining...'.
                  format(gene_atb.shape[1]),
                  flush=True)

        # save if dataset has content
        print('saving if dataset has content...', flush=True)
        if gene_atb.shape[0] == 0 or gene_atb.shape[1] == 0:
            # no content
            print('    nothing to save...', flush=True)
        else:
            # save nonredundant features
            print('    saving {0!s} nonredundant features...'.format(
                gene_atb.shape[1]),
                  flush=True)
            dataset_info['path'] = '{0}/{1}.txt.gz'.format(
                results_folder, dataset_info['abbreviation'])
            dataset_info['nonredundant_genes'] = gene_atb.shape[0]
            dataset_info['nonredundant_features'] = gene_atb.shape[1]
            datasetIO.save_datamatrix(dataset_info['path'], gene_atb)
            datasetIO.append_datasetinfo(
                '{0}/dataset_info.txt'.format(results_folder), dataset_info)

    print('done.', flush=True)
Example #28
def main(d):
    # d is a dictionary containing the auto-encoder design specifications and training phase specifications

    # RESET DEFAULT GRAPH
    print('resetting default graph...', flush=True)
    tf.reset_default_graph()

    # FINISH CONFIGURATION
    print('finishing configuration...', flush=True)

    # specify distribution of initial weights
    if d['initialization_distribution'] == 'truncnorm':
        initialization_distribution = tf.truncated_normal

    # specify activation function
    if d['activation_function'] == 'tanh':
        activation_function = {'tf': tf.tanh, 'np': tsdae_apply_functions.tanh}
    elif d['activation_function'] == 'relu':
        activation_function = {
            'tf': tf.nn.relu,
            'np': tsdae_apply_functions.relu
        }
    elif d['activation_function'] == 'elu':
        activation_function = {
            'tf': tf.nn.elu,
            'np': tsdae_apply_functions.elu
        }
    elif d['activation_function'] == 'sigmoid':
        activation_function = {
            'tf': tf.sigmoid,
            'np': tsdae_apply_functions.sigmoid
        }

    # load data
    partitions = ['train', 'valid', 'test']
    dataset = {}
    for partition in partitions:
        dataset[partition] = datasetIO.load_datamatrix('{0}/{1}.pickle'.format(
            d['input_path'], partition))
        d['{0}_examples'.format(partition)] = dataset[partition].shape[0]

    # get loss weights
    # we have features with mixed variable types and mixed missingness
    # strategy is to apply weights to the data points such that each feature has a total weight of 1
    # for binary features (columnmeta['likelihood'] == 'bernoulli'), balance the weight on the positive and negative classes
    # for other features, uniform weight
    zero = 0.
    half = 0.5
    one = 1.
    posweights = 1 / 2 / (1 +
                          np.nansum(dataset['train'].matrix, 0, keepdims=True))
    posweights[:, dataset['train'].
               columnmeta['likelihood'] != 'bernoulli'] = 1 / np.sum(
                   ~np.isnan(dataset['train'].
                             matrix[:, dataset['train'].
                                    columnmeta['likelihood'] != 'bernoulli']),
                   0,
                   keepdims=True)
    negweights = 1 / 2 / (
        1 + np.sum(~np.isnan(dataset['train'].matrix), 0, keepdims=True) -
        np.nansum(dataset['train'].matrix, 0, keepdims=True))
    negweights[:, dataset['train'].
               columnmeta['likelihood'] != 'bernoulli'] = 1 / np.sum(
                   ~np.isnan(dataset['train'].
                             matrix[:, dataset['train'].
                                    columnmeta['likelihood'] != 'bernoulli']),
                   0,
                   keepdims=True)
    print('posweights nan:', np.isnan(posweights).any(), flush=True)
    print('negweights nan:', np.isnan(negweights).any(), flush=True)
    u_dataset, c_dataset = np.unique(dataset['train'].columnmeta['dataset'],
                                     return_counts=True)
    datasetweights = np.zeros((1, dataset['train'].shape[1]), dtype='float64')
    for dataset_name, dataset_count in zip(u_dataset, c_dataset):
        datasetweights[:, dataset['train'].columnmeta['dataset'] ==
                       dataset_name] = 1 / u_dataset.size / dataset_count
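    # worked example of the weights above: a bernoulli feature observed in 100
    # training rows with 10 positives gets 1/(2*(1+10)) = 1/22 per positive entry
    # and 1/(2*(1+100-10)) = 1/182 per negative entry, so each class carries
    # roughly half of the feature's total weight; non-bernoulli features get
    # 1/(number of observed rows) per entry, and datasetweights then gives every
    # source dataset the same total weight regardless of how many columns it has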

    # get parameters for marginal distributions
    # will sample from marginal distributions to impute missing values
    # as well as to replace known values with corrupted values
    # for binary features, model as bernoulli (columnmeta['likelihood'] == 'bernoulli')
    # for other features, model as gaussian
    marginalprobabilities = (
        1 + np.nansum(dataset['train'].matrix, 0, keepdims=True)) / (
            2 + np.sum(~np.isnan(dataset['train'].matrix), 0, keepdims=True)
        )  # posterior mean of beta-bernoulli with prior a=b=1
    marginalstdvs = np.nanstd(dataset['train'].matrix, 0, keepdims=True)
    isbernoullimarginal = (dataset['train'].columnmeta['likelihood'] ==
                           'bernoulli').astype('float64').reshape(1, -1)
    print('marginalprobabilities nan:',
          np.isnan(marginalprobabilities).any(),
          flush=True)
    print('marginalstdvs nan:', np.isnan(marginalstdvs).any(), flush=True)
    print('isbernoullimarginal nan:',
          np.isnan(isbernoullimarginal).any(),
          flush=True)

    # assign friendly nan value
    nanvalue = -666.666
    for partition in partitions:
        dataset[partition].matrix[np.isnan(
            dataset[partition].matrix)] = nanvalue

    # create output directory
    if not os.path.exists(d['output_path']):
        os.makedirs(d['output_path'])

    # initialize model architecture (number of layers and dimension of each layer)
    d['current_dimensions'] = d[
        'all_dimensions'][:d['current_hidden_layer'] +
                          1]  # dimensions of model up to current depth

    # specify embedding function for current training phase
    # we want the option of skipping the embedding activation function to apply only to the full model
    if not d['apply_activation_to_embedding'] and d['current_dimensions'] == d[
            'all_dimensions']:
        d['current_apply_activation_to_embedding'] = False
    else:
        d['current_apply_activation_to_embedding'] = True

    # initialize assignments of training examples to mini-batches and number of training steps for stochastic gradient descent
    d['batch_size'] = d['batch_fraction'] * d['train_examples']
    batch_ids = create_batch_ids(d['train_examples'], d['batch_size'])
    d['batches'] = np.unique(batch_ids).size
    d['steps'] = d['current_epochs'] * d['batches']

    # specify path to weights from previous training run
    d['previous_variables_path'] = '{0}/variables_layer{1!s}_finetuning{2!s}.pickle'.format(
        d['output_path'], d['previous_hidden_layer'],
        d['previous_finetuning_run'])
    d['fix_or_init'] = 'fix' if d[
        'current_finetuning_run'] == 0 else 'init'  # fix for pretraining, init for finetuning

    # specify rows and columns of figure showing data reconstructions
    d['reconstruction_rows'] = int(
        np.round(np.sqrt(np.min([100, d['valid_examples']]) / 2)))
    d['reconstruction_cols'] = 2 * d['reconstruction_rows']

    # print some design information
    print('input path: {0}'.format(d['input_path']), flush=True)
    print('output path: {0}'.format(d['output_path']), flush=True)
    print('previous variables path: {0}'.format(d['previous_variables_path']),
          flush=True)
    print('previous variables fix or init: {0}'.format(d['fix_or_init']),
          flush=True)

    # SAVE CURRENT DESIGN
    print('saving current design...', flush=True)
    with open('{0}/design_layer{1!s}_finetuning{2!s}.json'.format(
            d['output_path'], d['current_hidden_layer'],
            d['current_finetuning_run']),
              mode='wt',
              encoding='utf-8',
              errors='surrogateescape') as fw:
        json.dump(d, fw, indent=2)

    # DEFINE REPORTING VARIABLES
    print('defining reporting variables...', flush=True)
    reporting_steps = tsdae_design_functions.create_reporting_steps(
        d['steps'], d['firstcheckpoint'], d['maxstepspercheckpoint'])
    valid_losses = np.zeros(reporting_steps.size, dtype='float32')
    train_losses = np.zeros(reporting_steps.size, dtype='float32')
    valid_noisy_losses = np.zeros(reporting_steps.size, dtype='float32')
    train_noisy_losses = np.zeros(reporting_steps.size, dtype='float32')
    valid_losses_normal = np.zeros(reporting_steps.size, dtype='float32')
    train_losses_normal = np.zeros(reporting_steps.size, dtype='float32')
    valid_noisy_losses_normal = np.zeros(reporting_steps.size, dtype='float32')
    train_noisy_losses_normal = np.zeros(reporting_steps.size, dtype='float32')
    valid_losses_bernoulli = np.zeros(reporting_steps.size, dtype='float32')
    train_losses_bernoulli = np.zeros(reporting_steps.size, dtype='float32')
    valid_noisy_losses_bernoulli = np.zeros(reporting_steps.size,
                                            dtype='float32')
    train_noisy_losses_bernoulli = np.zeros(reporting_steps.size,
                                            dtype='float32')
    print('reporting steps:', reporting_steps, flush=True)

    # DEFINE COMPUTATIONAL GRAPH
    # define placeholders for input data, use None to allow feeding different numbers of examples
    print('defining placeholders...', flush=True)
    training = tf.placeholder(tf.bool, [])
    noise_prob = tf.placeholder(tf.float32, [])
    training_and_validation_data_initializer = tf.placeholder(
        tf.float32, [
            dataset['train'].shape[0] + dataset['valid'].shape[0],
            dataset['train'].shape[1]
        ])
    selection_mask = tf.placeholder(
        tf.bool, [dataset['train'].shape[0] + dataset['valid'].shape[0]])
    pos_weights_initializer = tf.placeholder(tf.float32,
                                             [1, dataset['train'].shape[1]])
    neg_weights_initializer = tf.placeholder(tf.float32,
                                             [1, dataset['train'].shape[1]])
    dataset_weights_initializer = tf.placeholder(
        tf.float32, [1, dataset['train'].shape[1]])
    marginal_probabilities_initializer = tf.placeholder(
        tf.float32, [1, dataset['train'].shape[1]])
    marginal_stdvs_initializer = tf.placeholder(tf.float32,
                                                [1, dataset['train'].shape[1]])
    is_bernoulli_marginal_initializer = tf.placeholder(
        tf.float32, [1, dataset['train'].shape[1]])
    zero_initializer = tf.placeholder(tf.float32, [])
    half_initializer = tf.placeholder(tf.float32, [])
    one_initializer = tf.placeholder(tf.float32, [])
    nan_value_initializer = tf.placeholder(tf.float32, [])

    # define variables
    # W contains the weights, bencode contains the biases for encoding, and bdecode contains the biases for decoding
    print('defining variables...', flush=True)
    training_and_validation_data = tf.Variable(
        training_and_validation_data_initializer,
        trainable=False,
        collections=[])
    pos_weights = tf.Variable(pos_weights_initializer,
                              trainable=False,
                              collections=[])
    neg_weights = tf.Variable(neg_weights_initializer,
                              trainable=False,
                              collections=[])
    dataset_weights = tf.Variable(dataset_weights_initializer,
                                  trainable=False,
                                  collections=[])
    marginal_probabilities = tf.Variable(marginal_probabilities_initializer,
                                         trainable=False,
                                         collections=[])
    marginal_stdvs = tf.Variable(marginal_stdvs_initializer,
                                 trainable=False,
                                 collections=[])
    is_bernoulli_marginal = tf.Variable(is_bernoulli_marginal_initializer,
                                        trainable=False,
                                        collections=[])
    zero_ = tf.Variable(zero_initializer, trainable=False, collections=[])
    half_ = tf.Variable(half_initializer, trainable=False, collections=[])
    one_ = tf.Variable(one_initializer, trainable=False, collections=[])
    nan_value = tf.Variable(nan_value_initializer,
                            trainable=False,
                            collections=[])
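    # passing collections=[] keeps these data/constant variables out of
    # tf.GraphKeys.GLOBAL_VARIABLES, so the global initializer below will not
    # allocate or overwrite them; presumably they are initialized individually
    # from the *_initializer placeholders with a feed_dict elsewhere in the script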
    if os.path.exists(d['previous_variables_path']):
        # update variables (if continuing from a previous training run)
        print('loading previous variables...', flush=True)
        global_step, W, bencode, bdecode = update_variables(
            d['current_dimensions'], initialization_distribution,
            d['initialization_sigma'], d['previous_variables_path'],
            d['fix_or_init'], d['include_global_step'])
    elif (d['current_hidden_layer'] == 1 and d['current_finetuning_run']
          == 0) or d['skip_layerwise_training']:
        # create variables
        global_step, W, bencode, bdecode = create_variables(
            d['current_dimensions'], initialization_distribution,
            d['initialization_sigma'])
    else:
        raise ValueError('could not find previous variables')

    # define model
    # h contains the activations from input layer to bottleneck layer
    # hhat contains the activations from bottleneck layer to output layer
    # xhat is a reference to the output layer (i.e. the reconstruction)
    print('defining model...', flush=True)
    x = tf.boolean_mask(training_and_validation_data, selection_mask)
    is_positive = tf.to_float(tf.greater(x, zero_))
    is_missing = tf.to_float(tf.equal(x, nan_value))
    loss_weights = (
        pos_weights * is_positive + neg_weights * (one_ - is_positive)
    ) * (
        one_ - is_missing
    ) * dataset_weights  # missing values won't be included in loss calculation
    loss_weights = loss_weights / tf.reduce_mean(loss_weights)
    normal_loss_weights = loss_weights * (one_ - is_bernoulli_marginal)
    bernoulli_loss_weights = loss_weights * is_bernoulli_marginal
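    # sample replacement noise per feature: zero-mean truncated-normal noise
    # scaled by the feature's marginal stdev for continuous features, and
    # Bernoulli draws at the feature's marginal probability for binary features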
    normal_noise = tf.truncated_normal(tf.shape(x), mean=zero_,
                                       stddev=one_) * marginal_stdvs
    bernoulli_noise = tf.to_float(
        tf.random_uniform(tf.shape(x), minval=zero_, maxval=one_) <=
        marginal_probabilities)
    noise = bernoulli_noise * is_bernoulli_marginal + normal_noise * (
        one_ - is_bernoulli_marginal)
    random_noise_mask = tf.to_float(
        tf.random_uniform(tf.shape(x)) <= noise_prob
    )  # replace missing values and random fraction of known values with noise
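    # structured noise: with probability noise_prob, mark an entire likelihood
    # group of an example (all Bernoulli or all Gaussian features, chosen with
    # probability one half) for corruption at once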
    structured_noise_mask = tf.to_float(
        tf.random_uniform((tf.shape(x)[tf.to_int32(zero_)], tf.to_int32(one_)))
        <= noise_prob) * tf.abs(
            tf.to_float(
                tf.random_uniform((tf.shape(x)[tf.to_int32(zero_)],
                                   tf.to_int32(one_))) <= half_) -
            is_bernoulli_marginal)
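    # combine the element-wise and structured masks with a logical OR
    # (a + b - a*b for 0/1 indicators)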
    noise_mask = random_noise_mask + structured_noise_mask - (
        random_noise_mask * structured_noise_mask)
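    # missing entries are always imputed with sampled noise (in the target x as
    # well, though they carry zero loss weight); masked entries are additionally
    # replaced in xnoisy, the encoder input, so the network must denoise them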
    x = x + is_missing * (noise - x)
    xnoisy = x + noise_mask * (noise - x)
    h, hhat, xhat_preactivation = create_autoencoder(
        xnoisy, activation_function['tf'], False,
        d['current_apply_activation_to_embedding'], d['use_batchnorm'],
        training, W, bencode, bdecode)
    # alternative formulation (single jointly normalized loss over both likelihoods):
    # normal_loss = tf.squared_difference(x, xhat_preactivation)
    # bernoulli_loss = tf.nn.sigmoid_cross_entropy_with_logits(labels=x, logits=xhat_preactivation)
    # loss = tf.reduce_sum(loss_weights*(bernoulli_loss*is_bernoulli_marginal + normal_loss*(one_-is_bernoulli_marginal)))/tf.reduce_sum(loss_weights)
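    # final loss: weighted mean squared error over Gaussian features plus
    # weighted sigmoid cross-entropy over Bernoulli features, each normalized
    # by its own total weight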
    normal_loss = tf.reduce_sum(normal_loss_weights * tf.squared_difference(
        x, xhat_preactivation)) / tf.reduce_sum(normal_loss_weights)
    bernoulli_loss = tf.reduce_sum(
        bernoulli_loss_weights * tf.nn.sigmoid_cross_entropy_with_logits(
            labels=x,
            logits=xhat_preactivation)) / tf.reduce_sum(bernoulli_loss_weights)
    loss = normal_loss + bernoulli_loss

    # define optimizer and training function
    print('defining optimizer and training function...', flush=True)
    optimizer = tf.train.AdamOptimizer(learning_rate=d['learning_rate'],
                                       epsilon=d['epsilon'],
                                       beta1=d['beta1'],
                                       beta2=d['beta2'])
    train_ops = optimizer.minimize(loss, global_step=global_step)

    # define update ops and add to train ops (if using batch norm)
    if d['use_batchnorm']:
        update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
        train_ops = [train_ops, update_ops]

    # collect batch norm variables
    if d['use_batchnorm']:
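        # tf.global_variables(scope=...) filters variables by re.match on their
        # names, so this pattern matches batch_normalization/gamma:0,
        # batch_normalization_1/gamma:0, etc.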
        bn_gammas = tf.global_variables(
            scope='batch_normalization.{0,2}/gamma:0')
        print(bn_gammas, flush=True)
        bn_betas = tf.global_variables(
            scope='batch_normalization.{0,2}/beta:0')
        bn_moving_means = tf.global_variables(
            scope='batch_normalization.{0,2}/moving_mean:0')
        bn_moving_variances = tf.global_variables(
            scope='batch_normalization.{0,2}/moving_variance:0')

    # define bottleneck layer preactivation
    # bottleneck_preactivation = tf.matmul(h[-2], W[-1]) + bencode[-1]

    # INITIALIZE TENSORFLOW SESSION
    print('initializing tensorflow session...', flush=True)
    init = tf.global_variables_initializer()
    session_config = configure_session(d['processor'],
                                       d['gpu_memory_fraction'])
    with tf.Session(config=session_config) as sess:
        sess.run(init)

        # TRAINING
        print('training...', flush=True)
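        # the training and validation matrices are stacked into a single graph
        # tensor; selection_mask later picks out either a mini-batch of
        # training rows or the full train/valid split for evaluation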
        sess.run(training_and_validation_data.initializer,
                 feed_dict={
                     training_and_validation_data_initializer:
                     np.append(dataset['train'].matrix,
                               dataset['valid'].matrix, 0)
                 })
        sess.run(pos_weights.initializer,
                 feed_dict={pos_weights_initializer: posweights})
        sess.run(neg_weights.initializer,
                 feed_dict={neg_weights_initializer: negweights})
        sess.run(dataset_weights.initializer,
                 feed_dict={dataset_weights_initializer: datasetweights})
        sess.run(marginal_probabilities.initializer,
                 feed_dict={
                     marginal_probabilities_initializer: marginalprobabilities
                 })
        sess.run(marginal_stdvs.initializer,
                 feed_dict={marginal_stdvs_initializer: marginalstdvs})
        sess.run(
            is_bernoulli_marginal.initializer,
            feed_dict={is_bernoulli_marginal_initializer: isbernoullimarginal})
        sess.run(zero_.initializer, feed_dict={zero_initializer: zero})
        sess.run(half_.initializer, feed_dict={half_initializer: half})
        sess.run(one_.initializer, feed_dict={one_initializer: one})
        sess.run(nan_value.initializer,
                 feed_dict={nan_value_initializer: nanvalue})
        validation_id = -1
        batch_and_validation_ids = np.full(dataset['train'].shape[0] +
                                           dataset['valid'].shape[0],
                                           validation_id,
                                           dtype=batch_ids.dtype)
        is_train = np.append(np.ones(dataset['train'].shape[0], dtype='bool'),
                             np.zeros(dataset['valid'].shape[0], dtype='bool'))
        is_valid = ~is_train
        training_step = 0
        i = 0
        overfitting_score = 0
        stopearly = False
        starttime = time.time()

        with open('{0}/log_layer{1!s}_finetuning{2!s}.txt'.format(
                d['output_path'], d['current_hidden_layer'],
                d['current_finetuning_run']),
                  mode='wt',
                  buffering=1) as fl:
            fl.write('\t'.join([
                'step', 'train_loss', 'valid_loss', 'train_noisy_loss',
                'valid_noisy_loss', 'train_loss_normal', 'valid_loss_normal',
                'train_noisy_loss_normal', 'valid_noisy_loss_normal',
                'train_loss_bernoulli', 'valid_loss_bernoulli',
                'train_noisy_loss_bernoulli', 'valid_noisy_loss_bernoulli',
                'time'
            ]) + '\n')

            for epoch in range(d['current_epochs']):
                if stopearly:
                    break

                # randomize assignment of training examples to batches
                np.random.shuffle(batch_ids)
                batch_and_validation_ids[is_train] = batch_ids

                for batch in range(d['batches']):
                    training_step += 1

                    # select mini-batch
                    selected = batch_and_validation_ids == batch

                    # update weights
                    sess.run(train_ops,
                             feed_dict={
                                 training: True,
                                 selection_mask: selected,
                                 noise_prob: d['noise_probability']
                             })

                    # record training and validation errors
                    if training_step == reporting_steps[i]:
                        train_losses[i], train_losses_normal[
                            i], train_losses_bernoulli[i] = sess.run(
                                [loss, normal_loss, bernoulli_loss],
                                feed_dict={
                                    training: False,
                                    selection_mask: is_train,
                                    noise_prob: 0
                                })
                        train_noisy_losses[i], train_noisy_losses_normal[
                            i], train_noisy_losses_bernoulli[i] = sess.run(
                                [loss, normal_loss, bernoulli_loss],
                                feed_dict={
                                    training: False,
                                    selection_mask: is_train,
                                    noise_prob: d['noise_probability']
                                })
                        valid_losses[i], valid_losses_normal[
                            i], valid_losses_bernoulli[i] = sess.run(
                                [loss, normal_loss, bernoulli_loss],
                                feed_dict={
                                    training: False,
                                    selection_mask: is_valid,
                                    noise_prob: 0
                                })
                        valid_noisy_losses[i], valid_noisy_losses_normal[
                            i], valid_noisy_losses_bernoulli[i] = sess.run(
                                [loss, normal_loss, bernoulli_loss],
                                feed_dict={
                                    training: False,
                                    selection_mask: is_valid,
                                    noise_prob: d['noise_probability']
                                })
                        print(
                            'step:{0:1.6g}, trn:{1:1.3g}, vld:{2:1.3g}, trnn:{3:1.3g}, vldn:{4:1.3g}, trnN:{5:1.3g}, vldN:{6:1.3g}, trnnN:{7:1.3g}, vldnN:{8:1.3g}, trnB:{9:1.3g}, vldB:{10:1.3g}, trnnB:{11:1.3g}, vldnB:{12:1.3g}, time:{13:1.6g}'
                            .format(reporting_steps[i], train_losses[i],
                                    valid_losses[i], train_noisy_losses[i],
                                    valid_noisy_losses[i],
                                    train_losses_normal[i],
                                    valid_losses_normal[i],
                                    train_noisy_losses_normal[i],
                                    valid_noisy_losses_normal[i],
                                    train_losses_bernoulli[i],
                                    valid_losses_bernoulli[i],
                                    train_noisy_losses_bernoulli[i],
                                    valid_noisy_losses_bernoulli[i],
                                    time.time() - starttime),
                            flush=True)
                        fl.write('\t'.join([
                            '{0:1.6g}'.format(x) for x in [
                                reporting_steps[i], train_losses[i],
                                valid_losses[i], train_noisy_losses[i],
                                valid_noisy_losses[i], train_losses_normal[i],
                                valid_losses_normal[i],
                                train_noisy_losses_normal[i],
                                valid_noisy_losses_normal[i],
                                train_losses_bernoulli[i],
                                valid_losses_bernoulli[i],
                                train_noisy_losses_bernoulli[i],
                                valid_noisy_losses_bernoulli[i],
                                time.time() - starttime
                            ]
                        ]) + '\n')

                        # save current weights, reconstructions, and projections
                        if training_step >= d[
                                'startsavingstep'] or training_step == reporting_steps[
                                    -1]:
                            with open(
                                    '{0}/intermediate_variables_layer{1!s}_finetuning{2!s}_step{3!s}.pickle'
                                    .format(d['output_path'],
                                            d['current_hidden_layer'],
                                            d['current_finetuning_run'],
                                            training_step), 'wb') as fw:
                                pickle.dump(
                                    (sess.run(global_step), sess.run(W),
                                     sess.run(bencode), sess.run(bdecode)), fw)
                            if d['use_batchnorm']:
                                with open(
                                        '{0}/intermediate_batchnorm_variables_layer{1!s}_finetuning{2!s}_step{3!s}.pickle'
                                        .format(d['output_path'],
                                                d['current_hidden_layer'],
                                                d['current_finetuning_run'],
                                                training_step), 'wb') as fw:
                                    pickle.dump(
                                        (sess.run(bn_gammas),
                                         sess.run(bn_betas),
                                         sess.run(bn_moving_means),
                                         sess.run(bn_moving_variances)), fw)

                            # stop early if overfitting
                            if valid_losses[i] >= 1.01 * (np.insert(
                                    valid_losses[:i], 0, np.inf).min()):
                                overfitting_score += 1
                            else:
                                overfitting_score = 0
                            if overfitting_score == d['overfitting_score_max']:
                                stopearly = True
                                print('stopping early!', flush=True)
                                break
                        i += 1

        # end tensorflow session
        print('closing tensorflow session...', flush=True)

    # ROLL BACK IF OVERFITTING
    if stopearly:
        print('rolling back...', flush=True)
        reporting_steps = reporting_steps[:i + 1]
        train_losses = train_losses[:i + 1]
        valid_losses = valid_losses[:i + 1]
        train_noisy_losses = train_noisy_losses[:i + 1]
        valid_noisy_losses = valid_noisy_losses[:i + 1]
        # selected_step = max([reporting_steps[i-d['overfitting_score_max']], d['startsavingstep']])
    else:
        print('completed all training steps...', flush=True)
        # selected_step = reporting_steps[-1]
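    # select the checkpoint with the lowest validation loss, clipped to the
    # range of steps for which intermediate variables were actually saved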
    selected_step = min([
        max([reporting_steps[np.argmin(valid_losses)], d['startsavingstep']]),
        reporting_steps[-1]
    ])
    print('selected step:{0}...'.format(selected_step), flush=True)

    # SAVE RESULTS
    print('saving results...', flush=True)
    with open(
            '{0}/optimization_path_layer{1!s}_finetuning{2!s}.pickle'.format(
                d['output_path'], d['current_hidden_layer'],
                d['current_finetuning_run']), 'wb') as fw:
        pickle.dump(
            {
                'reporting_steps': reporting_steps,
                'valid_losses': valid_losses,
                'train_losses': train_losses,
                'valid_noisy_losses': valid_noisy_losses,
                'train_noisy_losses': train_noisy_losses
            }, fw)
    if d['current_dimensions'] == d['all_dimensions'] and (
            not d['use_finetuning'] or d['current_finetuning_run'] > 0):
        shutil.copyfile(
            '{0}/intermediate_variables_layer{1!s}_finetuning{2!s}_step{3!s}.pickle'
            .format(d['output_path'], d['current_hidden_layer'],
                    d['current_finetuning_run'], selected_step),
            '{0}/variables_layer{1!s}_finetuning{2!s}.pickle'.format(
                d['output_path'], d['current_hidden_layer'],
                d['current_finetuning_run']))
        if d['use_batchnorm']:
            shutil.copyfile(
                '{0}/intermediate_batchnorm_variables_layer{1!s}_finetuning{2!s}_step{3!s}.pickle'
                .format(d['output_path'], d['current_hidden_layer'],
                        d['current_finetuning_run'], selected_step),
                '{0}/batchnorm_variables_layer{1!s}_finetuning{2!s}.pickle'.
                format(d['output_path'], d['current_hidden_layer'],
                       d['current_finetuning_run']))
    else:
        shutil.move(
            '{0}/intermediate_variables_layer{1!s}_finetuning{2!s}_step{3!s}.pickle'
            .format(d['output_path'], d['current_hidden_layer'],
                    d['current_finetuning_run'], selected_step),
            '{0}/variables_layer{1!s}_finetuning{2!s}.pickle'.format(
                d['output_path'], d['current_hidden_layer'],
                d['current_finetuning_run']))
        if d['use_batchnorm']:
            shutil.move(
                '{0}/intermediate_batchnorm_variables_layer{1!s}_finetuning{2!s}_step{3!s}.pickle'
                .format(d['output_path'], d['current_hidden_layer'],
                        d['current_finetuning_run'], selected_step),
                '{0}/batchnorm_variables_layer{1!s}_finetuning{2!s}.pickle'.
                format(d['output_path'], d['current_hidden_layer'],
                       d['current_finetuning_run']))
    with open(
            '{0}/variables_layer{1!s}_finetuning{2!s}.pickle'.format(
                d['output_path'], d['current_hidden_layer'],
                d['current_finetuning_run']), 'rb') as fr:
        W, Be, Bd = pickle.load(fr)[1:]  # global_step, W, bencode, bdecode
    if d['use_batchnorm']:
        with open(
                '{0}/batchnorm_variables_layer{1!s}_finetuning{2!s}.pickle'.
                format(d['output_path'], d['current_hidden_layer'],
                       d['current_finetuning_run']), 'rb') as fr:
            batchnorm_variables = pickle.load(
                fr)  # gammas, betas, moving_means, moving_variances
        batchnorm_encode_variables, batchnorm_decode_variables = tsdae_apply_functions.align_batchnorm_variables(
            batchnorm_variables, d['current_apply_activation_to_embedding'],
            d['apply_activation_to_output'])
    recon = {}
    embed = {}
    error = {}
    embed_preactivation = {}
    for partition in partitions:
        if d['use_batchnorm']:
            recon[partition], embed[partition], error[
                partition] = tsdae_apply_functions.encode_and_decode(
                    dataset[partition],
                    W,
                    Be,
                    Bd,
                    activation_function['np'],
                    d['current_apply_activation_to_embedding'],
                    d['apply_activation_to_output'],
                    dataset['train'].columnmeta['likelihood'] == 'bernoulli',
                    return_embedding=True,
                    return_reconstruction_error=True,
                    bn_encode_variables=batchnorm_encode_variables,
                    bn_decode_variables=batchnorm_decode_variables)
            embed_preactivation[partition] = tsdae_apply_functions.encode(
                dataset[partition],
                W,
                Be,
                activation_function['np'],
                apply_activation_to_embedding=False,
                bn_variables=batchnorm_encode_variables)
        else:
            recon[partition], embed[partition], error[
                partition] = tsdae_apply_functions.encode_and_decode(
                    dataset[partition],
                    W,
                    Be,
                    Bd,
                    activation_function['np'],
                    d['current_apply_activation_to_embedding'],
                    d['apply_activation_to_output'],
                    dataset['train'].columnmeta['likelihood'] == 'bernoulli',
                    return_embedding=True,
                    return_reconstruction_error=True)
            embed_preactivation[partition] = tsdae_apply_functions.encode(
                dataset[partition],
                W,
                Be,
                activation_function['np'],
                apply_activation_to_embedding=False)
        print('{0} reconstruction error: {1:1.3g}'.format(
            partition, error[partition]),
              flush=True)
        if d['current_dimensions'] == d['all_dimensions'] and (
                not d['use_finetuning'] or d['current_finetuning_run'] > 0):
            datasetIO.save_datamatrix(
                '{0}/{1}_embedding_layer{2!s}_finetuning{3!s}.pickle'.format(
                    d['output_path'], partition, d['current_hidden_layer'],
                    d['current_finetuning_run']), embed[partition])
            datasetIO.save_datamatrix(
                '{0}/{1}_embedding_layer{2!s}_finetuning{3!s}.txt.gz'.format(
                    d['output_path'], partition, d['current_hidden_layer'],
                    d['current_finetuning_run']), embed[partition])
            if d['current_apply_activation_to_embedding']:
                datasetIO.save_datamatrix(
                    '{0}/{1}_embedding_preactivation_layer{2!s}_finetuning{3!s}.pickle'
                    .format(d['output_path'], partition,
                            d['current_hidden_layer'],
                            d['current_finetuning_run']),
                    embed_preactivation[partition])
                datasetIO.save_datamatrix(
                    '{0}/{1}_embedding_preactivation_layer{2!s}_finetuning{3!s}.txt.gz'
                    .format(d['output_path'], partition,
                            d['current_hidden_layer'],
                            d['current_finetuning_run']),
                    embed_preactivation[partition])

    # PLOT LOSS
    print('plotting loss...', flush=True)
    fg, ax = plt.subplots(1, 1, figsize=(3.25, 2.25))
    ax.set_position([0.55 / 3.25, 0.45 / 2.25, 2.6 / 3.25, 1.7 / 2.25])
    ax.semilogx(reporting_steps,
                train_losses,
                ':r',
                linewidth=1,
                label='train')
    ax.semilogx(reporting_steps,
                valid_losses,
                '-g',
                linewidth=1,
                label='valid')
    ax.semilogx(reporting_steps,
                train_noisy_losses,
                '--b',
                linewidth=1,
                label='train,noisy')
    ax.semilogx(reporting_steps,
                valid_noisy_losses,
                '-.k',
                linewidth=1,
                label='valid,noisy')
    ax.legend(loc='best', fontsize=8)
    ax.set_ylabel('loss', fontsize=8)
    ax.set_xlabel('steps (selected step:{0!s})'.format(selected_step),
                  fontsize=8)
    ax.set_xlim(reporting_steps[0] - 1, reporting_steps[-1] + 1)
    # ax.set_ylim(0, 1)
    ax.tick_params(axis='both',
                   which='major',
                   left=True,
                   right=True,
                   bottom=True,
                   top=False,
                   labelleft=True,
                   labelright=False,
                   labelbottom=True,
                   labeltop=False,
                   labelsize=8)
    fg.savefig('{0}/optimization_path_layer{1!s}_finetuning{2!s}.png'.format(
        d['output_path'], d['current_hidden_layer'],
        d['current_finetuning_run']),
               transparent=True,
               pad_inches=0,
               dpi=600)
    plt.close()

    # PLOT RECONSTRUCTIONS
    print('plotting reconstructions...', flush=True)
    num_recons = min([
        d['reconstruction_rows'] * d['reconstruction_cols'],
        dataset['valid'].shape[0]
    ])
    x_valid = dataset['valid'].matrix[:num_recons, dataset['train'].
                                      columnmeta['likelihood'] != 'bernoulli']
    xr_valid = recon['valid'].matrix[:num_recons, dataset['train'].
                                     columnmeta['likelihood'] != 'bernoulli']
    if x_valid.shape[1] > 1000:
        x_valid = x_valid[:, :1000]
        xr_valid = xr_valid[:, :1000]
    lb = np.append(x_valid, xr_valid, 1).min(1)
    ub = np.append(x_valid, xr_valid, 1).max(1)
    fg, axs = plt.subplots(2 * d['reconstruction_rows'],
                           d['reconstruction_cols'],
                           figsize=(6.5, 6.5))
    for i, ax in enumerate(
            axs.reshape(-1)[:d['reconstruction_rows'] *
                            d['reconstruction_cols']]):
        if i < num_recons:
            ax.plot(x_valid[i, :],
                    xr_valid[i, :],
                    'ok',
                    markersize=0.5,
                    markeredgewidth=0,
                    alpha=0.1)
            ax.set_ylim(lb[i], ub[i])
            ax.set_xlim(lb[i], ub[i])
            ax.tick_params(axis='both',
                           which='major',
                           left=False,
                           right=False,
                           bottom=False,
                           top=False,
                           labelleft=False,
                           labelright=False,
                           labelbottom=False,
                           labeltop=False,
                           pad=4)
            ax.set_frame_on(False)
            ax.axvline(lb[i], linewidth=1, color='k')
            ax.axvline(ub[i], linewidth=1, color='k')
            ax.axhline(lb[i], linewidth=1, color='k')
            ax.axhline(ub[i], linewidth=1, color='k')
        else:
            fg.delaxes(ax)
    x_valid = dataset['valid'].matrix[:num_recons, dataset['train'].
                                      columnmeta['likelihood'] == 'bernoulli']
    xr_valid = recon['valid'].matrix[:num_recons, dataset['train'].
                                     columnmeta['likelihood'] == 'bernoulli']
    if x_valid.shape[1] > 1000:
        x_valid = x_valid[:, :1000]
        xr_valid = xr_valid[:, :1000]
    x_valid = x_valid.astype('bool')
    lb = -0.05
    ub = 1.05
    for i, ax in enumerate(
            axs.reshape(-1)[d['reconstruction_rows'] *
                            d['reconstruction_cols']:]):
        if i < num_recons:
            ax.boxplot(
                [xr_valid[i, ~x_valid[i, :]], xr_valid[i, x_valid[i, :]]],
                positions=[0.2, 0.8])
            ax.set_ylim(lb, ub)
            ax.set_xlim(lb, ub)
            ax.tick_params(axis='both',
                           which='major',
                           left=False,
                           right=False,
                           bottom=False,
                           top=False,
                           labelleft=False,
                           labelright=False,
                           labelbottom=False,
                           labeltop=False,
                           pad=4)
            ax.set_frame_on(False)
            ax.axvline(lb, linewidth=1, color='k')
            ax.axvline(ub, linewidth=1, color='k')
            ax.axhline(lb, linewidth=1, color='k')
            ax.axhline(ub, linewidth=1, color='k')
        else:
            fg.delaxes(ax)
    fg.savefig('{0}/reconstructions_layer{1!s}_finetuning{2!s}.png'.format(
        d['output_path'], d['current_hidden_layer'],
        d['current_finetuning_run']),
               transparent=True,
               pad_inches=0,
               dpi=1200)
    plt.close()

    # PLOT 2D EMBEDDING
    if d['current_dimensions'][-1] == 2 and (not d['use_finetuning'] or
                                             d['current_finetuning_run'] > 0):
        print('plotting 2d embedding...', flush=True)
        fg, ax = plt.subplots(1, 1, figsize=(6.5, 6.5))
        ax.set_position([0.15 / 6.5, 0.15 / 6.5, 6.2 / 6.5, 6.2 / 6.5])
        ax.plot(embed['train'].matrix[:, 0],
                embed['train'].matrix[:, 1],
                'ok',
                markersize=2,
                markeredgewidth=0,
                alpha=0.5,
                zorder=0)
        ax.plot(embed['valid'].matrix[:, 0],
                embed['valid'].matrix[:, 1],
                'or',
                markersize=2,
                markeredgewidth=0,
                alpha=1.0,
                zorder=1)
        ax.tick_params(axis='both',
                       which='major',
                       bottom=False,
                       top=False,
                       labelbottom=False,
                       labeltop=False,
                       left=False,
                       right=False,
                       labelleft=False,
                       labelright=False,
                       pad=4)
        ax.set_frame_on(False)
        fg.savefig('{0}/embedding_layer{1!s}_finetuning{2!s}.png'.format(
            d['output_path'], d['current_hidden_layer'],
            d['current_finetuning_run']),
                   transparent=True,
                   pad_inches=0,
                   dpi=600)
        plt.close()

        if d['current_apply_activation_to_embedding']:
            fg, ax = plt.subplots(1, 1, figsize=(6.5, 6.5))
            ax.set_position([0.15 / 6.5, 0.15 / 6.5, 6.2 / 6.5, 6.2 / 6.5])
            ax.plot(embed_preactivation['train'].matrix[:, 0],
                    embed_preactivation['train'].matrix[:, 1],
                    'ok',
                    markersize=2,
                    markeredgewidth=0,
                    alpha=0.5,
                    zorder=0)
            ax.plot(embed_preactivation['valid'].matrix[:, 0],
                    embed_preactivation['valid'].matrix[:, 1],
                    'or',
                    markersize=2,
                    markeredgewidth=0,
                    alpha=1.0,
                    zorder=1)
            ax.tick_params(axis='both',
                           which='major',
                           bottom=False,
                           top=False,
                           labelbottom=False,
                           labeltop=False,
                           left=False,
                           right=False,
                           labelleft=False,
                           labelright=False,
                           pad=4)
            ax.set_frame_on(False)
            fg.savefig(
                '{0}/embedding_preactivation_layer{1!s}_finetuning{2!s}.png'.
                format(d['output_path'], d['current_hidden_layer'],
                       d['current_finetuning_run']),
                transparent=True,
                pad_inches=0,
                dpi=600)
            plt.close()

    print('done training phase.', flush=True)

    return d['current_hidden_layer'], d['current_finetuning_run'], d[
        'current_epochs']
Beispiel #29
hit = np.in1d(sample_metadata['sample_id'], chosen_samples)
for field, values in sample_metadata.items():
    sample_metadata[field] = values[hit]
run_ids = run_ids[hit]

# load the counts matrix, keeping only the columns (runs) flagged by hit;
# skiprows=1 skips the header line
matrix = np.loadtxt(
    '../../original_data/GTEXv6plus/counts_gene.tsv.gz',
    dtype='float64',
    delimiter='\t',
    skiprows=1,
    usecols=hit.nonzero()[0],
    ndmin=2)

gene_tissue = dataclasses.datamatrix(
    rowname='ensembl_gene_id',
    rowlabels=ensembl_gene_ids,
    rowmeta={},
    columnname='recount2_run_id',
    columnlabels=run_ids,
    columnmeta=sample_metadata,
    matrixname='recount2_processed_rnaseq_counts_from_gtexv6',
    matrix=matrix)

datasetIO.save_datamatrix(
    '../../original_data/GTEXv6plus/gene_tissue_recount2gtexv6_chosen_samples_counts.pickle',
    gene_tissue)
datasetIO.save_datamatrix(
    '../../original_data/GTEXv6plus/gene_tissue_recount2gtexv6_chosen_samples_counts.txt.gz',
    gene_tissue)
Beispiel #30
                                           fillvalue=np.nan)
            gene_rep = gene_rep.tolabels(rowlabels=all_genes, fillvalue=np.nan)
            gene_rep.append(gene_fold, 1)
        R += 1
        print('    rep {0:1.3g} folds {1:1.3g} auroc {2:1.3g} auprc {3:1.3g}'.
              format(validation_rep, F,
                     stat_rep.select('auroc', [])[validation_rep],
                     stat_rep.select('auprc', [])[validation_rep]),
              flush=True)

# drop stat columns that were never filled (still all zeros)
stat_fold.discard((stat_fold.matrix == 0).all(0), 1)
stat_rep.discard((stat_rep.matrix == 0).all(0), 1)

# save cross-validation performance stats for folds and reps
print('saving cross-validation performance stats for folds and reps...',
      flush=True)
datasetIO.save_datamatrix(
    'datasets/useful_features/stat_fold_crossvalidation.pickle', stat_fold)
datasetIO.save_datamatrix(
    'datasets/useful_features/stat_fold_crossvalidation.txt.gz', stat_fold)
datasetIO.save_datamatrix(
    'datasets/useful_features/stat_rep_crossvalidation.pickle', stat_rep)
datasetIO.save_datamatrix(
    'datasets/useful_features/stat_rep_crossvalidation.txt.gz', stat_rep)
datasetIO.save_datamatrix(
    'datasets/useful_features/gene_rep_crossvalidation.pickle', gene_rep)
datasetIO.save_datamatrix(
    'datasets/useful_features/gene_rep_crossvalidation.txt.gz', gene_rep)

print('done.', flush=True)