def main(study_name, row_data_path=None, column_data_path=None, matrix_data_path=None,
         partition_axis=-1, dtype='float64', delimiter='\t', test_fraction=0.1,
         valid_fraction=0.1, save_text_files=False):

    # load data and create datamatrix object
    if row_data_path is None or column_data_path is None or matrix_data_path is None:
        if os.path.exists('data/original_data/{0}'.format(study_name)):
            original_files = os.listdir('data/original_data/{0}'.format(study_name))
            rowhit = ['rowdata.txt' in x for x in original_files]
            columnhit = ['columndata.txt' in x for x in original_files]
            matrixhit = ['matrixdata.txt' in x for x in original_files]
            if sum(rowhit) > 0 and sum(columnhit) > 0 and sum(matrixhit) > 0:
                row_data_path = 'data/original_data/{0}/{1}'.format(study_name, original_files[rowhit.index(True)])
                column_data_path = 'data/original_data/{0}/{1}'.format(study_name, original_files[columnhit.index(True)])
                matrix_data_path = 'data/original_data/{0}/{1}'.format(study_name, original_files[matrixhit.index(True)])
                print('LOADING DATA...', flush=True)
                print('    row_data_path: {0}'.format(row_data_path), flush=True)
                print('    column_data_path: {0}'.format(column_data_path), flush=True)
                print('    matrix_data_path: {0}'.format(matrix_data_path), flush=True)
                dataset = datasetIO.load_splitdata(row_data_path, column_data_path, matrix_data_path,
                                                   study_name, dtype, delimiter)
            else:
                hit = ['datamatrix' in x for x in original_files]
                if sum(hit) > 0:
                    datamatrix_path = 'data/original_data/{0}/{1}'.format(study_name, original_files[hit.index(True)])
                    print('LOADING DATA...', flush=True)
                    print('    datamatrix_path: {0}'.format(datamatrix_path), flush=True)
                    dataset = datasetIO.load_datamatrix(datamatrix_path)
                else:
                    raise ValueError('input data incorrectly specified')
        else:
            raise ValueError('input data incorrectly specified')
    else:
        print('LOADING DATA...', flush=True)
        print('    row_data_path: {0}'.format(row_data_path), flush=True)
        print('    column_data_path: {0}'.format(column_data_path), flush=True)
        print('    matrix_data_path: {0}'.format(matrix_data_path), flush=True)
        dataset = datasetIO.load_splitdata(row_data_path, column_data_path, matrix_data_path,
                                           study_name, dtype, delimiter)
    print('ORIGINAL', flush=True)
    print(dataset)

    # discard columns hit in fewer than 1% or more than 99% of rows, then discard empty rows
    tobediscarded = np.logical_or(dataset.matrix.sum(0) < 0.01*dataset.shape[0],
                                  dataset.matrix.sum(0) > 0.99*dataset.shape[0])
    dataset.discard(tobediscarded, 1)
    tobediscarded = dataset.matrix.sum(1) == 0
    dataset.discard(tobediscarded, 0)
    print(dataset)

    # normalize by the square root of the product of row and column sums
    rowsums = dataset.matrix.sum(1, keepdims=True)
    colsums = dataset.matrix.sum(0, keepdims=True)
    dataset.matrix = dataset.matrix/np.sqrt(rowsums*colsums)

    # shuffle the data
    dataset.reorder(np.random.permutation(dataset.shape[0]), 0)
    dataset.reorder(np.random.permutation(dataset.shape[1]), 1)

    # partition the data
    if partition_axis == 1 or partition_axis == -1:
        print('PARTITIONING TRANSPOSE...', flush=True)
        create_and_save_partitions(dataset.totranspose(), study_name, test_fraction, valid_fraction, save_text_files)
    if partition_axis == 0 or partition_axis == -1:
        print('PARTITIONING ORIGINAL...', flush=True)
        create_and_save_partitions(dataset, study_name, test_fraction, valid_fraction, save_text_files)
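# Minimal invocation sketch (not part of the pipeline), assuming the script is run from
# the repo root and that a study folder such as data/original_data/mystudy (hypothetical
# name) contains rowdata.txt, columndata.txt, and matrixdata.txt for auto-discovery:
#
#     main('mystudy', partition_axis=-1, test_fraction=0.1, valid_fraction=0.1, save_text_files=True)
#
# With partition_axis=-1, both the original matrix and its transpose are partitioned.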
def main(dictionary, year, datestamp, min_score):

    print('begin extract_term_marginal_counts_from_termite.py')
    print('dictionary: {0}'.format(dictionary))
    print('year: {0}'.format(year))
    print('datestamp: {0}'.format(datestamp))
    print('min_score: {0!s}'.format(min_score))

    # load counts datamatrix
    # this file is generated by count_term-term_pmids_from_termite.py
    print('loading counts datamatrix...')
    row_dictionary = dictionary  # 'HUCELL', 'ANAT', 'INDICATION', 'HUCELLANAT', 'HUCELLANATINDICATION'
    column_dictionary = dictionary  # 'HUCELL', 'ANAT', 'INDICATION', 'HUCELLANAT', 'HUCELLANATINDICATION'
    counts_datamatrix_path = '{0}_{1}_datamatrix_pmidcounts_year_{2}_datestamp_{3}_minscore_{4!s}.pickle'.format(
        row_dictionary, column_dictionary, year, datestamp, min_score)
    term_term = datasetIO.load_datamatrix(counts_datamatrix_path)
    print('counts_datamatrix_path: {0}'.format(counts_datamatrix_path))
    print(term_term)

    # write marginal counts to file
    print('writing marginal counts...')
    metalabels = sorted(list(term_term.rowmeta.keys()))
    marginal_counts_path = '{0}_term_marginal_pmidcounts_year_{1}_datestamp_{2}_minscore_{3!s}.txt'.format(
        dictionary, year, datestamp, min_score)
    with open(marginal_counts_path, mode='wt', encoding='utf-8', errors='surrogateescape') as fw:
        writelist = [term_term.rowname] + metalabels
        fw.write('\t'.join(writelist) + '\n')
        for i, rowlabel in enumerate(term_term.rowlabels):
            writelist = [rowlabel] + [term_term.rowmeta[k][i] if term_term.rowmeta[k].dtype == 'object'
                                      else str(term_term.rowmeta[k][i]) for k in metalabels]
            fw.write('\t'.join(writelist) + '\n')

    print('done extract_term_marginal_counts_from_termite.py')
for validation_rep in range(validation_reps):
    Y = np.zeros(0, dtype='bool')
    P = np.zeros(0, dtype='float64')
    F = 0
    for validation_fold in range(validation_folds):
        gene_model_path = 'datasets/useful_features/rep{0!s}_fold{1!s}/gene_model_selected.txt.gz'.format(
            validation_rep, validation_fold)
        stat_model_path = 'datasets/useful_features/rep{0!s}_fold{1!s}/stat_model_selected.txt.gz'.format(
            validation_rep, validation_fold)
        if os.path.exists(gene_model_path):

            # load predictions for validation and unlabelled examples
            print('loading predictions for validation and unlabelled examples...', flush=True)
            gene_model = datasetIO.load_datamatrix(gene_model_path)
            stat_model = datasetIO.load_datamatrix(stat_model_path)
            isunknown = gene_model.rowmeta['class'] == 'unknown'
            Yf = gene_model.rowmeta['class'][~isunknown] == 'positive'
            Pf = gene_model.matrix[~isunknown, :].reshape(-1)

            # evaluate performance of predictions on individual fold
            print('evaluating performance of predictions on individual fold...', flush=True)
            stat_cut = modelevaluation.get_classifier_performance_stats(
                Y=Yf, P=Pf, classifier_stats=classifier_stats, plot_curves=False, get_priority_cutoffs=True)
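# The excerpt ends mid-loop. A plausible continuation (hypothetical, inferred only from
# the Y/P/F initializations above, not confirmed by the source) would pool each fold's
# labels and scores so the whole rep can be evaluated at once:
#
#     Y = np.append(Y, Yf)   # pooled true labels across folds
#     P = np.append(P, Pf)   # pooled predicted scores across folds
#     F += 1                 # number of folds actually evaluated
#
# after the fold loop, the pooled arrays could be scored the same way:
#     stat_pooled = modelevaluation.get_classifier_performance_stats(
#         Y=Y, P=P, classifier_stats=classifier_stats, plot_curves=False, get_priority_cutoffs=True)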
def main():

    # load dataset info
    print('loading dataset info...', flush=True)
    dataset_info_path = 'datasets/candidate_features/dataset_info.txt'
    dataset_infos = datasetIO.load_datasetinfo(dataset_info_path)

    # specify results folder
    print('specifying results folder...', flush=True)
    results_folder = 'datasets/nonredundant_features'
    if not os.path.exists(results_folder):
        os.mkdir(results_folder)

    # iterate over datasets
    print('iterating over datasets...', flush=True)
    for dataset_info in dataset_infos:

        # # just work with hpatissuesmrna for testing/debugging the pipeline
        # if dataset_info['abbreviation'] != 'hpatissuesmrna_cleaned':
        #     print('skipping {0}. not in testing set...'.format(dataset_info['abbreviation']), flush=True)
        #     continue

        # check if another python instance is already working on this dataset
        if os.path.exists('{0}/{1}_in_progress.txt'.format(results_folder, dataset_info['abbreviation'])):
            print('skipping {0}. already in progress...'.format(dataset_info['abbreviation']), flush=True)
            continue

        # log start of processing
        with open('{0}/{1}_in_progress.txt'.format(results_folder, dataset_info['abbreviation']),
                  mode='wt', encoding='utf-8', errors='surrogateescape') as fw:
            print('working on {0}...'.format(dataset_info['abbreviation']), flush=True)
            fw.write('working on {0}...'.format(dataset_info['abbreviation']))

        # load dataset
        print('loading dataset...', flush=True)
        gene_atb = datasetIO.load_datamatrix(datasetpath=dataset_info['path'])
        gene_atb.columnmeta['isrowstat'] = gene_atb.columnmeta['isrowstat'].astype('int64').astype('bool')

        # decide feature similarity metric
        print('deciding feature similarity metric...', flush=True)
        if ('standardized' in dataset_info['abbreviation'] or 'cleaned' in dataset_info['abbreviation']) \
                and (gene_atb.matrix == 0).sum()/gene_atb.size <= 0.5:
            # dataset is many-valued and filled-in
            print('    dataset is many-valued and filled-in...', flush=True)
            print('    using spearman for similarity...', flush=True)
            dataset_info['feature_similarity_metric'] = 'spearman'
            dataset_info['feature_similarity_threshold'] = np.sqrt(0.5)
        else:
            # dataset is binary or tertiary or sparse
            print('    dataset is binary, tertiary, or sparse...', flush=True)
            print('    using cosine for similarity...', flush=True)
            dataset_info['feature_similarity_metric'] = 'cosine'
            dataset_info['feature_similarity_threshold'] = np.sqrt(0.5)

        # calculate feature similarity
        print('calculating feature similarity...', flush=True)
        atb_atb = gene_atb.tosimilarity(axis=1, metric=dataset_info['feature_similarity_metric'])

        # prioritize feature groups
        print('prioritizing feature groups...', flush=True)
        are_similar_features = np.abs(atb_atb.matrix) > dataset_info['feature_similarity_threshold']
        feature_group_size = are_similar_features.sum(1).astype('float64')
        feature_group_score = (np.abs(atb_atb.matrix)*are_similar_features).sum(1)/feature_group_size
        feature_priority = np.zeros(gene_atb.shape[1], dtype='float64')
        feature_priority[gene_atb.columnlabels == 'mean'] = 1.0
        feature_priority[gene_atb.columnlabels == 'stdv'] = 0.5
        feature_infos = list(zip(np.arange(gene_atb.shape[1], dtype='int64'),
                                 gene_atb.columnlabels.copy(),
                                 feature_group_size.copy(),
                                 feature_priority.copy(),
                                 feature_group_score.copy()))
        # stable sorts: order by group size, breaking ties by priority, then by group score
        feature_infos.sort(key=itemgetter(4), reverse=True)
        feature_infos.sort(key=itemgetter(3), reverse=True)
        feature_infos.sort(key=itemgetter(2), reverse=True)
        # for feature_info in feature_infos:
        #     print('{0:1.3g}, {1}, {2:1.3g}, {3:1.3g}, {4:1.3g}'.format(feature_info[0], feature_info[1], feature_info[2], feature_info[3], feature_info[4]))
        sorted_feature_indices = np.array([feature_info[0] for feature_info in feature_infos], dtype='int64')
        atb_atb.reorder(sorted_feature_indices, axis=0)
        atb_atb.reorder(sorted_feature_indices, axis=1)
        gene_atb.reorder(sorted_feature_indices, axis=1)
        are_similar_features = are_similar_features[sorted_feature_indices, :][:, sorted_feature_indices]

        # group similar features
        print('grouping similar features...', flush=True)
        tobediscarded = np.zeros(gene_atb.shape[1], dtype='bool')
        gene_atb.columnmeta['similar_features'] = np.full(gene_atb.shape[1], '', dtype='object')
        gene_atb.columnmeta['preferred_rowstat'] = np.full(gene_atb.shape[1], '', dtype='object')
        rowstats = gene_atb.columnlabels[gene_atb.columnmeta['isrowstat']]
        with open('{0}/{1}_feature_groups.txt'.format(results_folder, dataset_info['abbreviation']),
                  mode='wt', encoding='utf-8', errors='surrogateescape') as fw:
            for i, feature in enumerate(gene_atb.columnlabels):
                if not tobediscarded[i]:
                    # find similar features
                    print('    finding features similar to feature "{0}"...'.format(feature), flush=True)
                    similarity_hit = are_similar_features[i, :]
                    similarity_hit = np.logical_and(similarity_hit, ~tobediscarded)  # just what's new
                    similarity_hit[:i] = False
                    similar_features = gene_atb.columnlabels[similarity_hit]
                    similarity_values = atb_atb.matrix[i, similarity_hit]
                    rowstat_is_in_group = np.in1d(rowstats, similar_features)
                    gene_atb.columnmeta['similar_features'][i] = '|'.join(similar_features.tolist())
                    if rowstat_is_in_group.any():
                        # replace feature with summary stat
                        gene_atb.columnmeta['preferred_rowstat'][i] = rowstats[rowstat_is_in_group.nonzero()[0][0]]
                        gene_atb.matrix[:, i] = gene_atb.select([], gene_atb.columnmeta['preferred_rowstat'][i])
                        print('    replacing feature "{0}" with summary stat "{1}"...'.format(
                            feature, gene_atb.columnmeta['preferred_rowstat'][i]), flush=True)
                    elif similarity_hit.sum() > 1:
                        # replace feature with group average, weighted by signed similarity
                        print('    replacing feature "{0}" with average of {1!s} features...'.format(
                            feature, similarity_hit.sum()), flush=True)
                        feature_weight = atb_atb.matrix[i, similarity_hit]
                        feature_weight = feature_weight/np.sum(np.abs(feature_weight))
                        gene_atb.matrix[:, i] = (gene_atb.matrix[:, similarity_hit]*(feature_weight.reshape(1, -1))).sum(1)
                    else:
                        print('    no similar features...', flush=True)
                    fw.write('\t'.join(['{0}|{1:1.6g}'.format(f, v)
                                        for f, v in zip(similar_features, similarity_values)]) + '\n')
                    similarity_hit[i] = False
                    tobediscarded = np.logical_or(tobediscarded, similarity_hit)

        # discard features absorbed into group features
        print('discarding features absorbed into group features...', flush=True)
        if tobediscarded.any():
            # discard features
            print('    discarding {0!s} features. {1!s} features remaining...'.format(
                tobediscarded.sum(), (~tobediscarded).sum()), flush=True)
            gene_atb.discard(tobediscarded, axis=1)
        else:
            # keep all features
            print('    no features to discard. {0!s} features remaining...'.format(gene_atb.shape[1]), flush=True)

        # save if dataset has content
        print('saving if dataset has content...', flush=True)
        if gene_atb.shape[0] == 0 or gene_atb.shape[1] == 0:
            # no content
            print('    nothing to save...', flush=True)
        else:
            # save nonredundant features
            print('    saving {0!s} nonredundant features...'.format(gene_atb.shape[1]), flush=True)
            dataset_info['path'] = '{0}/{1}.txt.gz'.format(results_folder, dataset_info['abbreviation'])
            dataset_info['nonredundant_genes'] = gene_atb.shape[0]
            dataset_info['nonredundant_features'] = gene_atb.shape[1]
            datasetIO.save_datamatrix(dataset_info['path'], gene_atb)
            datasetIO.append_datasetinfo('{0}/dataset_info.txt'.format(results_folder), dataset_info)

    print('done.', flush=True)
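# Sketch of the similarity-weighted group average used above, on toy data (names and
# numbers are illustrative, not from the pipeline). Members anti-correlated with the
# seed feature enter with a negative weight, so they are subtracted rather than added:
#
#     import numpy as np
#
#     members = np.array([[1.0, 1.1, 0.9],
#                         [2.0, 1.9, 2.2],
#                         [0.5, 0.4, 0.6],
#                         [1.5, 1.6, 1.4],
#                         [3.0, 2.8, 3.1]])          # 5 genes x 3 similar features
#     similarities = np.array([1.0, 0.8, -0.7])      # signed similarity to the seed feature
#     weights = similarities/np.sum(np.abs(similarities))
#     representative = (members*weights.reshape(1, -1)).sum(1)
#     print(representative)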
def main(study_name, meta_label, test_groups, pretest_groups, valid_groups,
         row_data_path=None, column_data_path=None, matrix_data_path=None,
         partition_axis=0, dtype='float64', delimiter='\t', save_text_files=True):

    print('study_name: {0}'.format(study_name), flush=True)
    print('meta_label: {0}'.format(meta_label), flush=True)
    print('test_groups:', test_groups, flush=True)
    print('pretest_groups:', pretest_groups, flush=True)
    print('valid_groups:', valid_groups, flush=True)

    # load data and create datamatrix object
    if row_data_path is None or column_data_path is None or matrix_data_path is None:
        loadfolder = '../input_data/{0}'.format(study_name)
        if os.path.exists(loadfolder):
            original_files = os.listdir(loadfolder)
            rowhit = ['rowdata.txt' in x for x in original_files]
            columnhit = ['columndata.txt' in x for x in original_files]
            matrixhit = ['matrixdata.txt' in x for x in original_files]
            dmpicklehit = ['datamatrix' in x and '.pickle' in x for x in original_files]
            dmtexthit = ['datamatrix' in x and ('.txt' in x or '.csv' in x) for x in original_files]
            if sum(dmpicklehit) > 0:
                datamatrix_path = '{0}/{1}'.format(loadfolder, original_files[dmpicklehit.index(True)])
                print('LOADING DATA...', flush=True)
                print('    datamatrix_path: {0}'.format(datamatrix_path), flush=True)
                dataset = datasetIO.load_datamatrix(datamatrix_path)
            elif sum(dmtexthit) > 0:
                datamatrix_path = '{0}/{1}'.format(loadfolder, original_files[dmtexthit.index(True)])
                print('LOADING DATA...', flush=True)
                print('    datamatrix_path: {0}'.format(datamatrix_path), flush=True)
                dataset = datasetIO.load_datamatrix(datamatrix_path)
            elif sum(rowhit) > 0 and sum(columnhit) > 0 and sum(matrixhit) > 0:
                row_data_path = '{0}/{1}'.format(loadfolder, original_files[rowhit.index(True)])
                column_data_path = '{0}/{1}'.format(loadfolder, original_files[columnhit.index(True)])
                matrix_data_path = '{0}/{1}'.format(loadfolder, original_files[matrixhit.index(True)])
                print('LOADING DATA...', flush=True)
                print('    row_data_path: {0}'.format(row_data_path), flush=True)
                print('    column_data_path: {0}'.format(column_data_path), flush=True)
                print('    matrix_data_path: {0}'.format(matrix_data_path), flush=True)
                dataset = datasetIO.load_splitdata(row_data_path, column_data_path, matrix_data_path,
                                                   study_name, dtype, delimiter)
            else:
                raise ValueError('input data incorrectly specified')
        else:
            raise ValueError('input data incorrectly specified')
    else:
        print('LOADING DATA...', flush=True)
        print('    row_data_path: {0}'.format(row_data_path), flush=True)
        print('    column_data_path: {0}'.format(column_data_path), flush=True)
        print('    matrix_data_path: {0}'.format(matrix_data_path), flush=True)
        dataset = datasetIO.load_splitdata(row_data_path, column_data_path, matrix_data_path,
                                           study_name, dtype, delimiter)
    print('ORIGINAL', flush=True)
    print(dataset, flush=True)

    # shuffle the data
    dataset.reorder(np.random.permutation(dataset.shape[0]), 0)
    dataset.reorder(np.random.permutation(dataset.shape[1]), 1)

    # partition the data
    if partition_axis == 1:
        print('PARTITIONING TRANSPOSE...', flush=True)
        create_and_save_partitions(dataset.totranspose(), study_name, meta_label,
                                   test_groups, pretest_groups, valid_groups, save_text_files)
    elif partition_axis == 0:
        print('PARTITIONING ORIGINAL...', flush=True)
        create_and_save_partitions(dataset, study_name, meta_label,
                                   test_groups, pretest_groups, valid_groups, save_text_files)
    else:
        raise ValueError('invalid partition_axis')
def main():

    # load class examples
    print('loading class examples...', flush=True)
    class_examples_folder = 'targets/pharmaprojects'
    class_examples = {
        'positive': datasetIO.load_examples('{0}/positive.txt'.format(class_examples_folder)),
        'negative': datasetIO.load_examples('{0}/negative.txt'.format(class_examples_folder)),
        'unknown': datasetIO.load_examples('{0}/unknown.txt'.format(class_examples_folder))}

    # load dataset info
    print('loading dataset info...', flush=True)
    dataset_info_path = 'datasets/harmonizome/dataset_info.txt'
    dataset_infos = datasetIO.load_datasetinfo(dataset_info_path)

    # specify results folder
    print('specifying results folder...', flush=True)
    results_folder = 'datasets/candidate_features'
    if not os.path.exists(results_folder):
        os.mkdir(results_folder)

    # iterate over datasets
    print('iterating over datasets...', flush=True)
    for dataset_info in dataset_infos:

        # # just work with hpatissuesmrna for testing/debugging the pipeline
        # if dataset_info['abbreviation'] != 'hpatissuesmrna_cleaned':
        #     print('skipping {0}. not in testing set...'.format(dataset_info['abbreviation']), flush=True)
        #     continue

        # check if another python instance is already working on this dataset
        if os.path.exists('{0}/{1}_in_progress.txt'.format(results_folder, dataset_info['abbreviation'])):
            print('skipping {0}. already in progress...'.format(dataset_info['abbreviation']), flush=True)
            continue

        # log start of processing
        with open('{0}/{1}_in_progress.txt'.format(results_folder, dataset_info['abbreviation']),
                  mode='wt', encoding='utf-8', errors='surrogateescape') as fw:
            print('working on {0}...'.format(dataset_info['abbreviation']), flush=True)
            fw.write('working on {0}...'.format(dataset_info['abbreviation']))

        # load dataset
        print('loading dataset...', flush=True)
        gene_atb = datasetIO.load_datamatrix(datasetpath=dataset_info['path'])
        dataset_info['original_genes'] = gene_atb.shape[0]
        dataset_info['original_features'] = gene_atb.shape[1]

        # decide feature normalization
        print('deciding feature normalization...', flush=True)
        if ('standardized' in dataset_info['abbreviation'] or 'cleaned' in dataset_info['abbreviation']) \
                and (gene_atb.matrix == 0).sum()/gene_atb.size <= 0.5:
            # dataset is many-valued and filled-in
            print('    dataset is many-valued and filled-in...', flush=True)
            print('    z-scoring features...', flush=True)
            dataset_info['feature_normalization'] = 'z-score'
            mnv = np.nanmean(gene_atb.matrix, axis=0, keepdims=True)
            sdv = np.nanstd(gene_atb.matrix, axis=0, keepdims=True)
            gene_atb.matrix = (gene_atb.matrix - mnv)/sdv
            gene_atb.columnmeta['mean'] = mnv.reshape(-1)
            gene_atb.columnmeta['stdv'] = sdv.reshape(-1)
        else:
            # dataset is binary or tertiary or sparse
            print('    dataset is binary, tertiary, or sparse...', flush=True)
            print('    no feature normalization...', flush=True)
            dataset_info['feature_normalization'] = 'none'

        # assign class labels to genes
        print('assigning class labels to genes...', flush=True)
        gene_atb.rowmeta['class'] = np.full(gene_atb.shape[0], 'unknown', dtype='object')
        gene_atb.rowmeta['class'][np.in1d(gene_atb.rowlabels, list(class_examples['positive']))] = 'positive'
        gene_atb.rowmeta['class'][np.in1d(gene_atb.rowlabels, list(class_examples['negative']))] = 'negative'

        # add dataset mean and stdv as features
        print('adding dataset mean and stdv as features...', flush=True)
        gene_stat = dataclasses.datamatrix(
            rowname=gene_atb.rowname,
            rowlabels=gene_atb.rowlabels.copy(),
            rowmeta=copy.deepcopy(gene_atb.rowmeta),
            columnname=gene_atb.columnname,
            columnlabels=np.array(['mean', 'stdv'], dtype='object'),
            columnmeta={},
            matrixname=gene_atb.matrixname,
            matrix=np.append(gene_atb.matrix.mean(1, keepdims=True), gene_atb.matrix.std(1, keepdims=True), 1))
        gene_atb.append(gene_stat, 1)
        gene_atb.columnmeta['isrowstat'] = np.in1d(gene_atb.columnlabels, gene_stat.columnlabels)
        del gene_stat

        # identify features with little information about labelled examples
        print('identifying features with little information about labelled examples...', flush=True)
        isunknown = gene_atb.rowmeta['class'] == 'unknown'
        tobediscarded = np.logical_or.reduce(
            ((gene_atb.matrix[~isunknown, :] != 0).sum(axis=0) < 3,
             (gene_atb.matrix[~isunknown, :] != 1).sum(axis=0) < 3,
             np.isnan(gene_atb.matrix[~isunknown, :]).any(axis=0)))
        if tobediscarded.any():
            # discard features
            print('    discarding {0!s} features. {1!s} features remaining...'.format(
                tobediscarded.sum(), (~tobediscarded).sum()), flush=True)
            gene_atb.discard(tobediscarded, axis=1)
        else:
            # keep all features
            print('    no features to discard. {0!s} features remaining...'.format(gene_atb.shape[1]), flush=True)

        # save if dataset has content
        print('saving if dataset has content...', flush=True)
        if gene_atb.shape[0] == 0 or gene_atb.shape[1] == 0:
            # no content
            print('    nothing to save...', flush=True)
        else:
            # save candidate features
            print('    saving {0!s} candidate features...'.format(gene_atb.shape[1]), flush=True)
            dataset_info['path'] = '{0}/{1}.txt.gz'.format(results_folder, dataset_info['abbreviation'])
            dataset_info['candidate_genes'] = gene_atb.shape[0]
            dataset_info['candidate_features'] = gene_atb.shape[1]
            dataset_info['positive_examples'] = (gene_atb.rowmeta['class'] == 'positive').sum()
            dataset_info['negative_examples'] = (gene_atb.rowmeta['class'] == 'negative').sum()
            dataset_info['unknown_examples'] = (gene_atb.rowmeta['class'] == 'unknown').sum()
            datasetIO.save_datamatrix(dataset_info['path'], gene_atb)
            datasetIO.append_datasetinfo('{0}/dataset_info.txt'.format(results_folder), dataset_info)

    print('done.', flush=True)
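# Toy check of the candidate filter above (made-up values, not pipeline data): a feature
# survives only if, among labelled examples, it has at least 3 values != 0, at least 3
# values != 1, and no NaNs.
#
#     import numpy as np
#
#     labelled = np.array([[0.0, 0.2, np.nan],
#                          [0.0, 1.5, 2.0],
#                          [0.0, 2.5, 3.0],
#                          [1.0, 3.5, 4.0],
#                          [0.0, 0.0, 5.0]])        # 5 labelled genes x 3 features
#     tobediscarded = np.logical_or.reduce((
#         (labelled != 0).sum(axis=0) < 3,          # nearly constant at 0
#         (labelled != 1).sum(axis=0) < 3,          # nearly constant at 1
#         np.isnan(labelled).any(axis=0)))          # missing values
#     print(tobediscarded)  # [ True False  True]: col 0 has <3 nonzeros, col 2 has a NaN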
def main(dictionaries, year, datestamp, min_score, universe, n_prior, min_count):

    print('begin calc_term-term_stats_from_termite.py')
    print('dictionaries: {0}, {1}'.format(dictionaries[0], dictionaries[1]))
    print('year: {0}'.format(year))
    print('datestamp: {0}'.format(datestamp))
    print('min_score: {0!s}'.format(min_score))
    print('universe: {0}'.format(universe))
    print('n_prior: {0!s}'.format(n_prior))
    print('min_count: {0!s}'.format(min_count))

    # load counts datamatrix
    # this file is generated by count_term-term_pmids_from_termite.py
    print('loading counts datamatrix...')
    row_dictionary = dictionaries[0]  # 'HUCELL', 'ANAT', 'INDICATION', 'HUCELLANAT', 'HUCELLANATINDICATION'
    column_dictionary = dictionaries[1]  # 'HUCELL', 'ANAT', 'INDICATION', 'HUCELLANAT', 'HUCELLANATINDICATION'
    counts_datamatrix_path = '{0}_{1}_datamatrix_pmidcounts_year_{2}_datestamp_{3}_minscore_{4!s}.pickle'.format(
        row_dictionary, column_dictionary, year, datestamp, min_score)
    term_term = datasetIO.load_datamatrix(counts_datamatrix_path)
    print('counts_datamatrix_path: {0}'.format(counts_datamatrix_path))
    print(term_term)

    # find term-term pairs with sufficient counts
    print('finding term-term pairs with sufficient counts...')
    I, J = (term_term.matrix >= min_count).nonzero()
    num_sufficient = I.size
    print('term-term pairs with at least {0!s} counts: {1!s}'.format(min_count, num_sufficient))

    # convert counts to float
    print('converting counts to float...')
    term_term.matrix = np.float64(term_term.matrix)
    term_term.updatedtypeattribute()
    for field, values in term_term.rowmeta.items():
        if values.dtype == np.int64:
            term_term.rowmeta[field] = np.float64(values)
    for field, values in term_term.columnmeta.items():
        if values.dtype == np.int64:
            term_term.columnmeta[field] = np.float64(values)

    # set universe size
    print('setting universe size...')
    if universe == 'intersectionunion' or universe == 'union':
        universe_size = term_term.rowmeta['all_count_{0}'.format(universe)][0]
    elif universe == 'medline':
        universe_size = 1e8  # 3e7
        term_term.rowmeta['term_count_medline'] = term_term.rowmeta['term_count_union'].copy()
        term_term.columnmeta['term_count_medline'] = term_term.columnmeta['term_count_union'].copy()
    elif universe == 'infinity':
        universe_size = 1e16
        term_term.rowmeta['term_count_infinity'] = term_term.rowmeta['term_count_union'].copy()
        term_term.columnmeta['term_count_infinity'] = term_term.columnmeta['term_count_union'].copy()
    else:
        raise ValueError('invalid universe')

    # create matrices for select association statistics
    print('creating matrices for select association statistics...')
    selstats = ['mcc', 'mmcc', 'cos', 'mi', 'nmi', 'iqr']
    statmats = {}
    for selstat in selstats:
        statmats[selstat] = np.zeros(term_term.shape, dtype='float64')

    # calculate association statistics and write to dataframe
    print('calculating association statistics and writing to dataframe...')
    dataframe_path = '{0}_{1}_dataframe_yr_{2}_ds_{3}_ms_{4!s}_uv_{5}_np_{6!s}_mc_{7!s}.txt.gz'.format(
        row_dictionary, column_dictionary, year, datestamp, min_score, universe, n_prior, min_count)
    rowmetalabels = ['term_id', 'term_name']
    rowmetaheaders = ['{0}_id'.format(row_dictionary), '{0}_name'.format(row_dictionary)]
    columnmetalabels = ['term_id', 'term_name']
    columnmetaheaders = ['{0}_id'.format(column_dictionary), '{0}_name'.format(column_dictionary)]
    statheaders = ['tp', 'fn', 'tn', 'fp', 'ap', 'an', 'pp', 'pn', 'n',
                   'tpr', 'fnr', 'tnr', 'fpr', 'ppv', 'fdr', 'npv', 'fomr', 'acc', 'mcr', 'prev',
                   'plr', 'nlr', 'dor', 'drr', 'darr', 'mrr', 'marr', 'f1', 'mcc', 'mmcc', 'cos',
                   'fnlp', 'sig', 'lrr', 'lrr_se', 'lrr_lb95', 'lrr_ub95', 'drr_lb95', 'drr_ub95',
                   'lor', 'lor_se', 'lor_lb95', 'lor_ub95', 'dor_lb95', 'dor_ub95', 'mi', 'nmi', 'iqr']
    with gzip.open(dataframe_path, mode='wt', encoding='utf-8', errors='surrogateescape') as fw:
        writelist = ['{0}_dictidname'.format(row_dictionary)] + rowmetaheaders \
                    + ['{0}_dictidname'.format(column_dictionary)] + columnmetaheaders + statheaders
        fw.write('\t'.join(writelist) + '\n')
        for k, (i, j) in enumerate(zip(I, J)):
            if np.mod(k, 1000) == 0 or k + 1 == num_sufficient:
                print('working on term-term pair {0!s} of {1!s}...'.format(k + 1, num_sufficient))

            # confusion matrix
            tp = term_term.matrix[i, j]
            fp = term_term.rowmeta['term_count_{0}'.format(universe)][i] - tp
            fn = term_term.columnmeta['term_count_{0}'.format(universe)][j] - tp
            tn = universe_size - (tp + fp + fn)

            # incorporate a random prior with effective sample size = n_prior,
            # where prior distribution conforms to empirical marginal distributions
            Rr = (tp + fp)/(fn + tn)  # ratio of rows of confusion matrix
            Rc = (tp + fn)/(fp + tn)  # ratio of columns of confusion matrix
            # solve for tp given constraints tp/fn=Rr, fp/tn=Rr, tp/fp=Rc, fn/tn=Rc, tp+fp+fn+tn=n_prior
            tp_prior = n_prior*Rc*Rr/(Rc*Rr + Rr + Rc + 1)
            fp_prior = tp_prior/Rc
            fn_prior = tp_prior/Rr
            tn_prior = tp_prior/Rc/Rr
            tp += tp_prior
            fp += fp_prior
            fn += fn_prior
            tn += tn_prior
            ap = tp + fn
            an = fp + tn
            pp = tp + fp
            pn = tn + fn
            n = tn + fp + fn + tp

            # association statistics
            tpr = tp/ap  # sensitivity, recall
            fnr = fn/ap  # 1-tpr, 1-sensitivity, 1-recall
            tnr = tn/an  # specificity
            fpr = fp/an  # 1-tnr, 1-specificity
            ppv = tp/pp  # precision
            fdr = fp/pp  # 1-ppv, 1-precision
            npv = tn/pn
            fomr = fn/pn  # 1-npv
            acc = (tp + tn)/n
            mcr = (fp + fn)/n  # 1-acc
            prev = ap/n
            plr = (tp/fp)/(ap/an)  # tpr/fpr, sensitivity/(1-specificity), ratio of positives to negatives in positive predictions relative to ratio in whole sample, higher is better
            nlr = (fn/tn)/(ap/an)  # fnr/tnr, (1-sensitivity)/specificity, ratio of positives to negatives in negative predictions relative to ratio in whole sample, lower is better
            dor = (tp/fp)/(fn/tn)  # plr/nlr, ratio of positives to negatives in positive predictions, divided by ratio of positives to negatives in negative predictions
            drr = (tp/pp)/(fn/pn)  # ppv/fomr, relative risk or risk ratio, fraction of positives in positive predictions divided by fraction of positives in negative predictions
            darr = (tp/pp) - (fn/pn)  # ppv - fomr, absolute risk reduction, fraction of positives in positive predictions minus fraction of positives in negative predictions
            mrr = (tp/pp)/(ap/n)  # ppv/prev, modified (by me) relative risk or risk ratio, fraction of positives in positive predictions divided by fraction of positives in whole sample
            marr = (tp/pp) - (ap/n)  # ppv - prev, modified (by me) absolute risk reduction, fraction of positives in positive predictions minus fraction of positives in whole sample
            f1 = (1 + (1**2))*ppv*tpr/((1**2)*ppv + tpr)
            mcc = (tp*tn - fp*fn)/np.sqrt((tp + fp)*(tp + fn)*(tn + fp)*(tn + fn))
            mmcc = 1 - np.sqrt((fp*fn)/((tp + fp)*(tp + fn)))  # modified (by me), equivalent to 1 + mcc with tn forced to 0
            cos = tp/np.sqrt((tp + fp)*(tp + fn))  # ochiai
            fnlp = -hypergeom.logsf(tp, n, ap, pp, loc=1)/np.log(10)
            sig = fnlp > np.log10(term_term.size) - np.log10(0.05)
            lrr = np.log10(tp) - np.log10(tp + fp) - np.log10(fn) + np.log10(fn + tn)  # log10 of relative risk
            lrr_se = np.sqrt(fp/tp/(tp + fp) + tn/fn/(fn + tn))/np.log(10)  # standard error of log10 of relative risk
            lrr_lb95 = lrr - 1.96*lrr_se
            lrr_ub95 = lrr + 1.96*lrr_se
            drr_lb95 = 10**lrr_lb95
            drr_ub95 = 10**lrr_ub95
            lor = np.log10(tp) - np.log10(fp) - np.log10(fn) + np.log10(tn)  # log10 of odds ratio
            lor_se = np.sqrt(1/tp + 1/fp + 1/fn + 1/tn)/np.log(10)  # standard error of log10 of odds ratio
            lor_lb95 = lor - 1.96*lor_se
            lor_ub95 = lor + 1.96*lor_se
            dor_lb95 = 10**lor_lb95
            dor_ub95 = 10**lor_ub95
            mi, nmi, iqr = mutualinformation(tp, fp, fn, tn)  # mutual information, normalized mutual information, information quality ratio

            # write row of dataframe
            count_stats = [tp, fn, tn, fp, ap, an, pp, pn, n]
            other_stats = [tpr, fnr, tnr, fpr, ppv, fdr, npv, fomr, acc, mcr, prev, plr, nlr,
                           dor, drr, darr, mrr, marr, f1, mcc, mmcc, cos, fnlp, sig,
                           lrr, lrr_se, lrr_lb95, lrr_ub95, drr_lb95, drr_ub95,
                           lor, lor_se, lor_lb95, lor_ub95, dor_lb95, dor_ub95, mi, nmi, iqr]
            rowwritelist = [term_term.rowlabels[i]] + [
                term_term.rowmeta[l][i] if term_term.rowmeta[l].dtype == 'object'
                else str(term_term.rowmeta[l][i]) for l in rowmetalabels]
            columnwritelist = [term_term.columnlabels[j]] + [
                term_term.columnmeta[l][j] if term_term.columnmeta[l].dtype == 'object'
                else str(term_term.columnmeta[l][j]) for l in columnmetalabels]
            writelist = rowwritelist + columnwritelist + [str(s) for s in count_stats] \
                        + ['{0:1.5g}'.format(s) for s in other_stats]
            fw.write('\t'.join(writelist) + '\n')
            statmats['mcc'][i, j] = mcc
            statmats['mmcc'][i, j] = mmcc
            statmats['cos'][i, j] = cos
            statmats['mi'][i, j] = mi
            statmats['nmi'][i, j] = nmi
            statmats['iqr'][i, j] = iqr

    # save matrices for select association statistics
    print('saving matrices for select association statistics...')
    for selstat in selstats:
        term_term.matrix = statmats[selstat]
        datasetIO.save_datamatrix(
            '{0}_{1}_datamatrix_{2}_yr_{3}_ds_{4}_ms_{5!s}_uv_{6}_np_{7!s}_mc_{8!s}.txt.gz'.format(
                row_dictionary, column_dictionary, selstat, year, datestamp, min_score,
                universe, n_prior, min_count), term_term)
        datasetIO.save_datamatrix(
            '{0}_{1}_datamatrix_{2}_yr_{3}_ds_{4}_ms_{5!s}_uv_{6}_np_{7!s}_mc_{8!s}.pickle'.format(
                row_dictionary, column_dictionary, selstat, year, datestamp, min_score,
                universe, n_prior, min_count), term_term)

    print('done calc_term-term_stats_from_termite.py')
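# Quick numeric check of the prior allocation above (values arbitrary; a sketch, not
# part of the pipeline). The four pseudo-counts sum to n_prior while preserving the
# observed row and column ratios of the confusion matrix:
#
#     import numpy as np
#
#     tp, fp, fn, tn = 40.0, 10.0, 60.0, 890.0
#     n_prior = 1.0
#     Rr = (tp + fp)/(fn + tn)
#     Rc = (tp + fn)/(fp + tn)
#     tp_prior = n_prior*Rc*Rr/(Rc*Rr + Rr + Rc + 1)
#     fp_prior = tp_prior/Rc
#     fn_prior = tp_prior/Rr
#     tn_prior = tp_prior/Rc/Rr
#     assert np.isclose(tp_prior + fp_prior + fn_prior + tn_prior, n_prior)
#     assert np.isclose((tp_prior + fp_prior)/(fn_prior + tn_prior), Rr)
#     assert np.isclose((tp_prior + fn_prior)/(fp_prior + tn_prior), Rc)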
import sys
sys.path.append('../../utilities')
import numpy as np
import pandas as pd
import copy
import datasetIO
import os
import shutil
from dataclasses import datamatrix as DataMatrix

# load the data
print('loading dataset...', flush=True)
dataset = datasetIO.load_datamatrix(
    '../../original_data/pratfelip_symlnk/patient_gene_pratfelip_nanostring_prepared_plus_clinical.pickle')
print(dataset, flush=True)

# discard samples
print('discarding samples...', flush=True)
dataset.discard(dataset.rowmeta['irrecist'] == 'stable disease', 0)
print(dataset, flush=True)

# save the data
print('saving data...', flush=True)
datasetIO.save_datamatrix(
    '../../original_data/pratfelip_symlnk/patient_gene_pratfelip_nanostring_prepared_plus_clinical_no_stabledisease.pickle',
    dataset)
datasetIO.save_datamatrix(
    '../../original_data/pratfelip_symlnk/patient_gene_pratfelip_nanostring_prepared_plus_clinical_no_stabledisease.txt.gz',
    dataset)
savefolder = '../../input_data/pratfelip_transposed_plus_clinical_no_stabledisease'
if not os.path.exists(savefolder):
    os.makedirs(savefolder)
datasetIO.save_splitdata(savefolder, dataset)
import sys
sys.path.append('../../utilities')
import numpy as np
import copy
import datasetIO
import os
import shutil
from collections import defaultdict

# load the data
print('loading dataset...', flush=True)
dataset = datasetIO.load_datamatrix(
    '../../original_data/hugolo_symlnk/patient_gene_hugolo_rnaseq_prepared.pickle')
rgep_path = {
    'pfprior': '../../original_data/rgeps_symlnk/PratFelip_RGEPs.txt',
    'pfleak': '../../original_data/rgeps_symlnk/PratFelip_information_leak_RGEPs.txt',
    'knowledge_based': '../../original_data/rgeps_symlnk/GSK_knowledge_based_RGEPs.txt',
    'nanostring': '../../original_data/rgeps_symlnk/Nanostring_RGEPs.txt',
    'melanoma_single_cell': '../../original_data/rgeps_symlnk/TiroshGarraway_melanoma_single_cell_RGEPs.txt'}
def main(validation_rep=0, validation_fold=0):

    # load target clusters
    print('loading target cluster assignments...', flush=True)
    target_cluster_path = 'targets/clusters/gene_cluster_byfamily.pickle'
    gene_cluster = datasetIO.load_clusterassignments(target_cluster_path)

    # load dataset info
    print('loading dataset info...', flush=True)
    dataset_info_path = 'datasets/nonredundant_features/dataset_info.txt'
    dataset_infos = datasetIO.load_datasetinfo(dataset_info_path)

    # load validation examples
    print('loading validation examples...', flush=True)
    validation_examples_path = 'targets/validation_examples/rep{0!s}_fold{1!s}.txt'.format(
        validation_rep, validation_fold)
    with open(validation_examples_path, mode='rt', encoding='utf-8', errors='surrogateescape') as fr:
        validation_examples = fr.read().split('\n')

    # specify results folder
    print('specifying results folder...', flush=True)
    results_folder = 'datasets/generalizable_features/rep{0!s}_fold{1!s}'.format(
        validation_rep, validation_fold)
    results_folder_parts = results_folder.split('/')
    for i in range(len(results_folder_parts)):
        results_folder_part = '/'.join(results_folder_parts[:i+1])
        if not os.path.exists(results_folder_part):
            os.mkdir(results_folder_part)

    # iterate over datasets
    print('iterating over datasets...', flush=True)
    for dataset_info in dataset_infos:

        # # just work with hpatissuesmrna for testing/debugging the pipeline
        # if dataset_info['abbreviation'] != 'hpatissuesmrna_cleaned':
        #     print('skipping {0}. not in testing set...'.format(dataset_info['abbreviation']), flush=True)
        #     continue

        # check if another python instance is already working on this dataset
        if os.path.exists('{0}/{1}_in_progress.txt'.format(results_folder, dataset_info['abbreviation'])):
            print('skipping {0}. already in progress...'.format(dataset_info['abbreviation']), flush=True)
            continue

        # log start of processing
        with open('{0}/{1}_in_progress.txt'.format(results_folder, dataset_info['abbreviation']),
                  mode='wt', encoding='utf-8', errors='surrogateescape') as fw:
            print('working on {0}...'.format(dataset_info['abbreviation']), flush=True)
            fw.write('working on {0}...'.format(dataset_info['abbreviation']))

        # load dataset
        print('loading dataset...', flush=True)
        gene_atb = datasetIO.load_datamatrix(datasetpath=dataset_info['path'])

        # specify feature generalizability test parameters
        print('specifying feature generalizability test parameters...', flush=True)
        dataset_info['feature_generalizability_test_function'] = featureselection.univariate_grouppreserved_permtest
        dataset_info['feature_generalizability_test_permutations'] = 10000  # 100000
        dataset_info['feature_generalizability_test_targetclusterpath'] = target_cluster_path
        dataset_info['multiple_hypothesis_testing_correction_function'] = featureselection.multiple_hypothesis_testing_correction
        dataset_info['multiple_hypothesis_testing_correction_method'] = 'fdr_by'
        dataset_info['multiple_hypothesis_testing_correction_threshold'] = 0.05
        print('    feature_generalizability_test_function: {0}'.format(
            dataset_info['feature_generalizability_test_function']), flush=True)
        print('    feature_generalizability_test_permutations: {0!s}'.format(
            dataset_info['feature_generalizability_test_permutations']), flush=True)
        print('    feature_generalizability_test_targetclusterpath: {0}'.format(
            dataset_info['feature_generalizability_test_targetclusterpath']), flush=True)
        print('    multiple_hypothesis_testing_correction_function: {0}'.format(
            dataset_info['multiple_hypothesis_testing_correction_function']), flush=True)
        print('    multiple_hypothesis_testing_correction_method: {0}'.format(
            dataset_info['multiple_hypothesis_testing_correction_method']), flush=True)
        print('    multiple_hypothesis_testing_correction_threshold: {0!s}'.format(
            dataset_info['multiple_hypothesis_testing_correction_threshold']), flush=True)

        # exclude validation and unlabeled examples from significance calculation
        print('excluding validation and unlabeled examples from significance calculation...', flush=True)
        isvalidation = np.in1d(gene_atb.rowlabels, validation_examples)
        isunknown = gene_atb.rowmeta['class'] == 'unknown'
        istraintest = ~np.logical_or(isvalidation, isunknown)

        # compute feature generalizability with multiple hypothesis testing correction
        print('computing feature generalizability with multiple hypothesis testing correction...', flush=True)
        gene_atb.rowmeta['cluster'] = np.array(
            [gene_cluster[g] if g in gene_cluster else -1 for g in gene_atb.rowlabels], dtype='int64')
        gene_atb.columnmeta['generalizability_test_statistic_values'], gene_atb.columnmeta['generalizability_pvalues'] = \
            dataset_info['feature_generalizability_test_function'](
                X=gene_atb.matrix[istraintest, :],
                Y=(gene_atb.rowmeta['class'][istraintest] == 'positive'),
                G=gene_atb.rowmeta['cluster'][istraintest],
                numperm=dataset_info['feature_generalizability_test_permutations'])
        gene_atb.columnmeta['is_generalizable'], gene_atb.columnmeta['generalizability_pvalues_corrected'] = \
            dataset_info['multiple_hypothesis_testing_correction_function'](
                gene_atb.columnmeta['generalizability_pvalues'],
                alpha=dataset_info['multiple_hypothesis_testing_correction_threshold'],
                method=dataset_info['multiple_hypothesis_testing_correction_method'])
        gene_atb.columnmeta['generalizability_correlation_sign'] = np.sign(
            gene_atb.columnmeta['generalizability_test_statistic_values'])
        if (gene_atb.columnmeta['generalizability_pvalues']
                < 1/dataset_info['feature_generalizability_test_permutations']).any():
            print('    warning: not enough permutations to establish all pvalues...', flush=True)
        tobediscarded = np.logical_or(
            np.isnan(gene_atb.columnmeta['generalizability_pvalues']),
            np.isnan(gene_atb.columnmeta['generalizability_pvalues_corrected']))
        if tobediscarded.any():
            gene_atb.discard(tobediscarded, axis=1)

        # prioritize features
        print('prioritizing features...', flush=True)
        sortedindices = np.argsort(gene_atb.columnmeta['generalizability_pvalues_corrected'])
        gene_atb.reorder(sortedindices, axis=1)

        # save feature generalizability info
        print('saving feature generalizability info...', flush=True)
        with open('{0}/{1}_feature_generalizability_info.txt'.format(results_folder, dataset_info['abbreviation']),
                  mode='wt', encoding='utf-8', errors='surrogateescape') as fw:
            writelist = ['dataset', 'abbreviation', 'feature',
                         'generalizability_test_statistic', 'generalizability_pvalue',
                         'generalizability_pvalue_corrected', 'is_generalizable',
                         'generalizability_correlation_sign', 'preferred_rowstat', 'similar_features']
            fw.write('\t'.join(writelist) + '\n')
            for j, feature in enumerate(gene_atb.columnlabels):
                writelist = [dataset_info['name'],
                             dataset_info['abbreviation'],
                             feature,
                             '{0:1.5g}'.format(gene_atb.columnmeta['generalizability_test_statistic_values'][j]),
                             '{0:1.5g}'.format(gene_atb.columnmeta['generalizability_pvalues'][j]),
                             '{0:1.5g}'.format(gene_atb.columnmeta['generalizability_pvalues_corrected'][j]),
                             '{0:1.5g}'.format(gene_atb.columnmeta['is_generalizable'][j]),
                             '{0:1.5g}'.format(gene_atb.columnmeta['generalizability_correlation_sign'][j]),
                             gene_atb.columnmeta['preferred_rowstat'][j],
                             gene_atb.columnmeta['similar_features'][j]]
                fw.write('\t'.join(writelist) + '\n')

        # discard features that are not generalizable
        print('discarding features that are not generalizable...', flush=True)
        tobediscarded = ~gene_atb.columnmeta['is_generalizable']
        if tobediscarded.any():
            # discard features
            print('    discarding {0!s} features. {1!s} features remaining...'.format(
                tobediscarded.sum(), (~tobediscarded).sum()), flush=True)
            gene_atb.discard(tobediscarded, axis=1)
        else:
            # keep all features
            print('    no features to discard. {0!s} features remaining...'.format(gene_atb.shape[1]), flush=True)

        # save if dataset has content
        print('saving if dataset has content...', flush=True)
        if gene_atb.shape[0] == 0 or gene_atb.shape[1] == 0:
            # no content
            print('    nothing to save...', flush=True)
        else:
            # save generalizable features
            print('    saving {0!s} generalizable features...'.format(gene_atb.shape[1]), flush=True)
            dataset_info['path'] = '{0}/{1}.txt.gz'.format(results_folder, dataset_info['abbreviation'])
            dataset_info['generalizable_genes'] = gene_atb.shape[0]
            dataset_info['generalizable_features'] = gene_atb.shape[1]
            dataset_info['feature_generalizability_test_function'] = 'featureselection.univariate_grouppreserved_permtest'
            dataset_info['multiple_hypothesis_testing_correction_function'] = 'featureselection.multiple_hypothesis_testing_correction'
            datasetIO.save_datamatrix(dataset_info['path'], gene_atb)
            datasetIO.append_datasetinfo('{0}/dataset_info.txt'.format(results_folder), dataset_info)

    print('done.', flush=True)
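# The correction step above goes through the repo's featureselection wrapper with
# method 'fdr_by'. A minimal sketch of an equivalent Benjamini-Yekutieli correction
# using statsmodels, assuming the wrapper's return convention matches the
# (reject mask, corrected pvalues) pair consumed above:
#
#     import numpy as np
#     from statsmodels.stats.multitest import multipletests
#
#     pvalues = np.array([0.001, 0.02, 0.04, 0.3, 0.9])
#     reject, pvalues_corrected, _, _ = multipletests(pvalues, alpha=0.05, method='fdr_by')
#     print(reject)             # boolean mask analogous to is_generalizable
#     print(pvalues_corrected)  # analogous to generalizability_pvalues_corrected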
print('valid')
print(valid)
print('test')
print(test)
datasetIO.save_datamatrix('{0}/{1}.pickle'.format(target_path, 'train'), train)
datasetIO.save_datamatrix('{0}/{1}.pickle'.format(target_path, 'valid'), valid)
datasetIO.save_datamatrix('{0}/{1}.pickle'.format(target_path, 'test'), test)

'''
folder_name = 'GTEXv6'
source_path = 'data/prepared_data/{0}/fat'.format(folder_name)
target_path = 'data/prepared_data/{0}_tsub/fat'.format(folder_name)
os.makedirs(target_path)
os.makedirs(target_path.replace('data/prepared_data', 'results/autoencoder'))

train = datasetIO.load_datamatrix('{0}/{1}.pickle'.format(source_path, 'train'))
tobediscarded = train.rowmeta['general_tissue'] == '-666'
train.discard(tobediscarded, 0)

# regress out general_tissue group means via one-hot least squares, keep the residuals
Y = train.matrix.copy()
l = train.rowmeta['general_tissue'].copy()
L = np.unique(l)
X = np.float64(l.reshape(-1, 1) == L.reshape(1, -1))
X = np.append(X, np.ones((X.shape[0], 1), dtype='float64'), 1)
B, _, rank, singular_values = np.linalg.lstsq(X, Y, rcond=None)
Ypred = X.dot(B)
train.matrix = Y - Ypred
datasetIO.save_datamatrix('{0}/{1}.pickle'.format(target_path, 'train'), train)

valid = datasetIO.load_datamatrix('{0}/{1}.pickle'.format(source_path, 'valid'))
tobediscarded = valid.rowmeta['general_tissue'] == '-666'
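# Sketch (toy data, names illustrative) checking that the one-hot least-squares fit
# above is equivalent to subtracting each tissue group's mean expression. The design
# with an extra intercept column is rank-deficient, which lstsq handles via the
# minimum-norm solution; the fitted values are still the per-group means:
#
#     import numpy as np
#
#     rng = np.random.default_rng(0)
#     labels = np.array(['liver', 'lung', 'liver', 'lung', 'heart', 'heart'])
#     Y = rng.normal(size=(6, 4))  # 6 samples x 4 genes
#     L = np.unique(labels)
#     X = np.float64(labels.reshape(-1, 1) == L.reshape(1, -1))
#     X = np.append(X, np.ones((X.shape[0], 1)), 1)
#     B, _, rank, sv = np.linalg.lstsq(X, Y, rcond=None)
#     residual = Y - X.dot(B)
#     expected = Y.copy()
#     for t in L:
#         expected[labels == t] -= Y[labels == t].mean(0)
#     assert np.allclose(residual, expected)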
import sys
sys.path.append('../../utilities')
import numpy as np
import copy
import datasetIO
import os
import shutil
from collections import defaultdict
from dataclasses import datamatrix as DataMatrix

# load the data
print('loading dataset...', flush=True)
dataset = datasetIO.load_datamatrix(
    '../../original_data/pratfelip_symlnk/patient_gene_pratfelip_nanostring_prepared.pickle')
dataset.columnmeta[dataset.columnname] = dataset.columnlabels.copy()
dataset.columnname += '_or_clinical_variable'
dataset.columnmeta['variable_type'] = np.full(dataset.shape[1], 'gene', dtype='object')
dataset.columnmeta['is_gene'] = np.ones(dataset.shape[1], dtype='bool')
dataset.columnmeta['is_clinical_variable'] = np.zeros(dataset.shape[1], dtype='bool')
print(dataset, flush=True)

# create datamatrix of clinical variables
print('creating datamatrix of clinical variables...', flush=True)
clinical_variables = ['age_50to59', 'age_60to69', 'age_70to76', 'age_79to83', 'gender_female',
def main(dictionaries, year, datestamp, min_score, universe, n_prior, min_count, association_statistic, reference_datamatrix_path, save_predictions): print('begin benchmark_term-term_stats_from_termite.py') print('dictionaries: {0}, {1}'.format(dictionaries[0], dictionaries[1])) print('year: {0}'.format(year)) print('datestamp: {0}'.format(datestamp)) print('min_score: {0!s}'.format(min_score)) print('universe: {0}'.format(universe)) print('n_prior: {0!s}'.format(n_prior)) print('min_count: {0!s}'.format(min_count)) print('association_statistic: {0}'.format(association_statistic)) print('reference_datamatrix_path: {0}'.format(reference_datamatrix_path)) print('save_predictions: {0!s}'.format(save_predictions)) # create figures folder print('creating figures folder...') figures_folder = 'benchmark_figures' if not os.path.exists(figures_folder): os.mkdir(figures_folder) # load counts datamatrix # this file is generated by count_term-term_pmids_from_termite.py print('loading counts datamatrix...') row_dictionary = dictionaries[ 0] # 'HUCELL', 'ANAT', 'INDICATION', 'HUCELLANAT', 'HUCELLANATINDICATION' column_dictionary = dictionaries[ 1] # 'HUCELL', 'ANAT', 'INDICATION', 'HUCELLANAT', 'HUCELLANATINDICATION' counts_datamatrix_path = '{0}_{1}_datamatrix_pmidcounts_year_{2}_datestamp_{3}_minscore_{4!s}.pickle'.format( row_dictionary, column_dictionary, year, datestamp, min_score) term_term_counts_all = datasetIO.load_datamatrix(counts_datamatrix_path) print('counts_datamatrix_path: {0}'.format(counts_datamatrix_path)) print(term_term_counts_all) # load association statistic datamatrix # this file is generated by calc_term-term_stats_from_termite.py print('loading association statistic datamatrix...') stats_datamatrix_path = '{0}_{1}_datamatrix_{2}_yr_{3}_ds_{4}_ms_{5!s}_uv_{6}_np_{7!s}_mc_{8!s}.pickle'.format( row_dictionary, column_dictionary, association_statistic, year, datestamp, min_score, universe, n_prior, min_count) term_term_stats_all = datasetIO.load_datamatrix(stats_datamatrix_path) print('stats_datamatrix_path: {0}'.format(stats_datamatrix_path)) print(term_term_stats_all) # load reference datamatrix of positive and negative examples print('loading reference datamatrix of positive and negative examples...') term_term_ref = datasetIO.load_datamatrix(reference_datamatrix_path) print('reference_datamatrix_path: {0}'.format(reference_datamatrix_path)) print(term_term_ref) # align datamatrices to reference print('aligning datamatrices to reference...') term_term_counts = term_term_counts_all.tolabels( rowlabels=term_term_ref.rowlabels.copy(), columnlabels=term_term_ref.columnlabels.copy()) term_term_stats = term_term_stats_all.tolabels( rowlabels=term_term_ref.rowlabels.copy(), columnlabels=term_term_ref.columnlabels.copy()) # find term-term pairs with sufficient counts print('finding term-term pairs with sufficient counts...') I, J = (term_term_counts.matrix >= min_count).nonzero() num_sufficient = I.size print('term-term pairs with at least {0!s} counts: {1!s}'.format( min_count, num_sufficient)) # find row_term_dicts and column_term_dicts print('finding row_term_dicts and column_term_dicts') row_term_dicts = np.unique(term_term_stats.rowmeta['term_dict']) column_term_dicts = np.unique(term_term_stats.columnmeta['term_dict']) # calculate performance on reference examples and write to dataframe print( 'calculating performance on reference examples and writing to dataframe...' 
) dataframe_path = 'benchmark_term-term_stats_dataframe.txt' metaheaders = [ 'row_dictionary', 'column_dictionary', 'year', 'datestamp', 'min_score', 'universe', 'n_prior', 'min_count', 'association_statistic', 'reference_datamatrix_path', 'row_term_dict', 'column_term_dict' ] statheaders = [ 'tp', 'fn', 'tn', 'fp', 'ap', 'an', 'pp', 'pn', 'n', 'auroc', 'auprc', 'tpr', 'fnr', 'tnr', 'fpr', 'ppv', 'fdr', 'npv', 'fomr', 'acc', 'mcr', 'prev', 'plr', 'nlr', 'dor', 'drr', 'darr', 'mrr', 'marr', 'f1', 'mcc', 'cos', 'fnlp', 'lrr', 'lrr_se', 'lrr_lb95', 'lrr_ub95', 'drr_lb95', 'drr_ub95', 'lor', 'lor_se', 'lor_lb95', 'lor_ub95', 'dor_lb95', 'dor_ub95', 'mi', 'nmi', 'iqr', 'min_value_association_statistic' ] with open(dataframe_path, mode='at', encoding='utf-8', errors='surrogateescape') as fw: writelist = metaheaders + statheaders fw.write('\t'.join(writelist) + '\n') for row_term_dict in row_term_dicts: row_hidxs = (term_term_stats.rowmeta['term_dict'] == row_term_dict ).nonzero()[0] for column_term_dict in column_term_dicts: print('working on {0}-{1} associations...'.format( row_term_dict, column_term_dict)) # get scores and labels print('getting scores and labels...') column_hidxs = (term_term_stats.columnmeta['term_dict'] == column_term_dict).nonzero()[0] hit = np.logical_and(np.in1d(I, row_hidxs), np.in1d(J, column_hidxs)) Y = term_term_ref.matrix[I[hit], J[hit]] X = (term_term_stats.matrix[I[hit], J[hit]]).reshape(-1, 1) X_prime = X.copy() if association_statistic == 'mcc': X_prime = (X_prime + 1) / 2 xpmin = (X_prime[X_prime > 0]).min() / 2 xpmax = 1 - (1 - (X_prime[X_prime < 1]).max()) / 2 X_prime[X_prime == 0] = xpmin X_prime[X_prime == 1] = xpmax logitX = np.log10(X_prime / (1 - X_prime)) # save score histograms print('saving score histograms...') values = X.reshape(-1) title = 'uv_{0}_rd{1}_cd{2}, pos:{3:1.3g}, neg:{4:1.3g}'.format( universe[:5], row_term_dict[:5], column_term_dict[:5], np.median(values[Y]), np.median(values[~Y])) save_path = '{0}/{1}_{2}_hist_{3}_yr_{4}_ds_{5}_ms_{6!s}_uv_{7}_np_{8!s}_mc_{9!s}_rd_{10}_cd_{11}.png'.format( figures_folder, row_dictionary, column_dictionary, association_statistic, year, datestamp, min_score, universe, n_prior, min_count, row_term_dict, column_term_dict) densities, edges = plot_step_density( { 'positive': ('-r', values[Y]), 'negative': (':b', values[~Y]) }, association_statistic, title, save_path, 'auto', (values.min(), values.max()), False) save_path = '{0}/{1}_{2}_zoomhist_{3}_yr_{4}_ds_{5}_ms_{6!s}_uv_{7}_np_{8!s}_mc_{9!s}_rd_{10}_cd_{11}.png'.format( figures_folder, row_dictionary, column_dictionary, association_statistic, year, datestamp, min_score, universe, n_prior, min_count, row_term_dict, column_term_dict) densities, edges = plot_step_density( { 'positive': ('-r', values[Y]), 'negative': (':b', values[~Y]) }, association_statistic, title, save_path, 'auto', (values.min(), values.max()), False, (np.percentile(values, 2.5), np.percentile(values, 97.5))) values = logitX.reshape(-1) title = 'uv_{0}_rd{1}_cd{2}, pos:{3:1.3g}, neg:{4:1.3g}'.format( universe[:5], row_term_dict[:5], column_term_dict[:5], np.median(values[Y]), np.median(values[~Y])) save_path = '{0}/{1}_{2}_hist_LOGIT{3}_yr_{4}_ds_{5}_ms_{6!s}_uv_{7}_np_{8!s}_mc_{9!s}_rd_{10}_cd_{11}.png'.format( figures_folder, row_dictionary, column_dictionary, association_statistic, year, datestamp, min_score, universe, n_prior, min_count, row_term_dict, column_term_dict) densities, edges = plot_step_density( { 'positive': ('-r', values[Y]), 'negative': (':b', values[~Y]) }, 'logit ' + 
association_statistic, title, save_path, 'auto', (values.min(), values.max()), False) save_path = '{0}/{1}_{2}_zoomhist_LOGIT{3}_yr_{4}_ds_{5}_ms_{6!s}_uv_{7}_np_{8!s}_mc_{9!s}_rd_{10}_cd_{11}.png'.format( figures_folder, row_dictionary, column_dictionary, association_statistic, year, datestamp, min_score, universe, n_prior, min_count, row_term_dict, column_term_dict) densities, edges = plot_step_density( { 'positive': ('-r', values[Y]), 'negative': (':b', values[~Y]) }, 'logit ' + association_statistic, title, save_path, 'auto', (values.min(), values.max()), False, (np.percentile(values, 2.5), np.percentile(values, 97.5))) # fit logistic regression classifier print('fitting logistic regression classifier...') robust_scaler = RobustScaler().fit(logitX) Z = robust_scaler.transform(logitX) logistic_regression_model = LogisticRegression( penalty='l2', C=1e3, intercept_scaling=1.0, class_weight='balanced').fit(Z, Y) if logistic_regression_model.classes_[1] == 1: decision_function = logistic_regression_model.decision_function( Z) else: decision_function = -logistic_regression_model.decision_function( Z) Y_pred = decision_function > 0 min_value_association_statistic = (X.reshape(-1)[Y_pred]).min() # save decision function and predicted probability histograms print( 'saving decision function and predicted probability histograms...' ) values = decision_function.reshape(-1) title = 'uv_{0}_rd{1}_cd{2}, pos:{3:1.3g}, neg:{4:1.3g}'.format( universe[:5], row_term_dict[:5], column_term_dict[:5], np.median(values[Y]), np.median(values[~Y])) save_path = '{0}/{1}_{2}_hist_DF{3}_yr_{4}_ds_{5}_ms_{6!s}_uv_{7}_np_{8!s}_mc_{9!s}_rd_{10}_cd_{11}.png'.format( figures_folder, row_dictionary, column_dictionary, association_statistic, year, datestamp, min_score, universe, n_prior, min_count, row_term_dict, column_term_dict) densities, edges = plot_step_density( { 'positive': ('-r', values[Y]), 'negative': (':b', values[~Y]) }, 'decision fun ' + association_statistic, title, save_path, 'auto', (values.min(), values.max()), False) save_path = '{0}/{1}_{2}_zoomhist_DF{3}_yr_{4}_ds_{5}_ms_{6!s}_uv_{7}_np_{8!s}_mc_{9!s}_rd_{10}_cd_{11}.png'.format( figures_folder, row_dictionary, column_dictionary, association_statistic, year, datestamp, min_score, universe, n_prior, min_count, row_term_dict, column_term_dict) densities, edges = plot_step_density( { 'positive': ('-r', values[Y]), 'negative': (':b', values[~Y]) }, 'decision fun ' + association_statistic, title, save_path, 'auto', (values.min(), values.max()), False, (np.percentile(values, 2.5), np.percentile(values, 97.5))) values = (1 / (1 + np.exp(-decision_function))).reshape(-1) title = 'uv_{0}_rd{1}_cd{2}, pos:{3:1.3g}, neg:{4:1.3g}'.format( universe[:5], row_term_dict[:5], column_term_dict[:5], np.median(values[Y]), np.median(values[~Y])) save_path = '{0}/{1}_{2}_hist_PP{3}_yr_{4}_ds_{5}_ms_{6!s}_uv_{7}_np_{8!s}_mc_{9!s}_rd_{10}_cd_{11}.png'.format( figures_folder, row_dictionary, column_dictionary, association_statistic, year, datestamp, min_score, universe, n_prior, min_count, row_term_dict, column_term_dict) densities, edges = plot_step_density( { 'positive': ('-r', values[Y]), 'negative': (':b', values[~Y]) }, 'pred prob ' + association_statistic, title, save_path, 'auto', (0, 1), False) save_path = '{0}/{1}_{2}_zoomhist_PP{3}_yr_{4}_ds_{5}_ms_{6!s}_uv_{7}_np_{8!s}_mc_{9!s}_rd_{10}_cd_{11}.png'.format( figures_folder, row_dictionary, column_dictionary, association_statistic, year, datestamp, min_score, universe, n_prior, min_count, row_term_dict, 
column_term_dict) densities, edges = plot_step_density( { 'positive': ('-r', values[Y]), 'negative': (':b', values[~Y]) }, 'pred prob ' + association_statistic, title, save_path, 'auto', (0, 1), False, (np.percentile(values, 2.5), np.percentile(values, 97.5))) # compute roc and pr curves print('computing roc and pr curves...') fpr, tpr, thresholds = roc_curve(Y, decision_function) precision, recall, thresholds = precision_recall_curve( Y, decision_function) auroc = roc_auc_score(Y, decision_function) auprc = average_precision_score(Y, decision_function) # save roc and pr curves print('saving roc and pr curves...') title = 'uv_{0}_as_{1}_rd{2}_cd{3}, auc:{4:1.3g}'.format( universe[:5], association_statistic, row_term_dict[:5], column_term_dict[:5], auprc) save_path = '{0}/{1}_{2}_prc_{3}_yr_{4}_ds_{5}_ms_{6!s}_uv_{7}_np_{8!s}_mc_{9!s}_rd_{10}_cd_{11}.png'.format( figures_folder, row_dictionary, column_dictionary, association_statistic, year, datestamp, min_score, universe, n_prior, min_count, row_term_dict, column_term_dict) fg, ax = plt.subplots(1, 1, figsize=(3, 2)) ax.plot(recall, precision, '-k', linewidth=1) ax.set_position([0.55 / 3, 0.35 / 2, 2.1 / 3, 1.3 / 2]) # left, bottom, width, height ax.set_title(title, fontsize=8) ax.set_ylabel('Precision', fontsize=8, labelpad=4) ax.set_xlabel('Recall', fontsize=8, labelpad=2) ax.set_ylim((0, 1)) ax.set_xlim((0, 1)) ax.tick_params(axis='both', which='major', bottom=True, top=False, left=True, right=False, labelbottom=True, labeltop=False, labelleft=True, labelright=False, labelsize=8) ax.ticklabel_format(axis='both', style='sci', scilimits=(-3, 3), fontsize=8) ax.yaxis.offsetText.set_fontsize(8) ax.xaxis.offsetText.set_fontsize(8) fg.savefig(save_path, transparent=True, pad_inches=0, dpi=300) plt.close() title = 'uv_{0}_as_{1}_rd{2}_cd{3}, auc:{4:1.3g}'.format( universe[:5], association_statistic, row_term_dict[:5], column_term_dict[:5], auroc) save_path = '{0}/{1}_{2}_roc_{3}_yr_{4}_ds_{5}_ms_{6!s}_uv_{7}_np_{8!s}_mc_{9!s}_rd_{10}_cd_{11}.png'.format( figures_folder, row_dictionary, column_dictionary, association_statistic, year, datestamp, min_score, universe, n_prior, min_count, row_term_dict, column_term_dict) fg, ax = plt.subplots(1, 1, figsize=(3, 2)) ax.plot(fpr, tpr, '-k', linewidth=1) ax.set_position([0.55 / 3, 0.35 / 2, 2.1 / 3, 1.3 / 2]) # left, bottom, width, height ax.set_title(title, fontsize=8) ax.set_ylabel('Precision', fontsize=8, labelpad=4) ax.set_xlabel('Recall', fontsize=8, labelpad=2) ax.set_ylim((0, 1)) ax.set_xlim((0, 1)) ax.tick_params(axis='both', which='major', bottom=True, top=False, left=True, right=False, labelbottom=True, labeltop=False, labelleft=True, labelright=False, labelsize=8) ax.ticklabel_format(axis='both', style='sci', scilimits=(-3, 3), fontsize=8) ax.yaxis.offsetText.set_fontsize(8) ax.xaxis.offsetText.set_fontsize(8) fg.savefig(save_path, transparent=True, pad_inches=0, dpi=300) plt.close() # save predictions for all term-term pairs if save_predictions: print('saving predictions for all term-term pairs...') predictions = {} X_all = term_term_stats_all.matrix.reshape(-1, 1) if association_statistic == 'mcc': X_all = (X_all + 1) / 2 xamin = (X_all[X_all > 0]).min() / 2 xamax = 1 - (1 - (X_all[X_all < 1]).max()) / 2 X_all[X_all == 0] = xamin X_all[X_all == 1] = xamax logitX_all = np.log10(X_all / (1 - X_all)) Z_all = robust_scaler.transform(logitX_all) if logistic_regression_model.classes_[1] == 1: predictions[ 'decision_function'] = logistic_regression_model.decision_function( Z_all) else: 
predictions[ 'decision_function'] = -logistic_regression_model.decision_function( Z_all) predictions['probability_positive'] = 1 / ( 1 + np.exp(-predictions['decision_function'])) if not np.all(np.diff(thresholds) > 0): raise ValueError('thresholds not increasing') predictions['precision'] = np.interp( predictions['decision_function'], thresholds, precision[:-1]) predictions['recall'] = np.interp( predictions['decision_function'], thresholds, recall[:-1]) I0, J0 = (term_term_counts_all.matrix < min_count).nonzero() IA, JA = (term_term_counts_all.matrix >= min_count).nonzero() new_stats = [ '{0}_dictidname'.format(row_dictionary), '{0}_dictidname'.format(column_dictionary) ] new_stat_mat = np.concatenate( (term_term_counts_all.rowlabels[IA].reshape(-1, 1), term_term_counts_all.columnlabels[JA].reshape(-1, 1)), 1) for stat, values in predictions.items(): term_term_stats_all.matrix = values.reshape( term_term_stats_all.shape[0], term_term_stats_all.shape[1]) term_term_stats_all.matrix[I0, J0] = 0 datasetIO.save_datamatrix( '{0}_{1}_datamatrix_{2}_yr_{3}_ds_{4}_ms_{5!s}_uv_{6}_np_{7!s}_mc_{8!s}_as_{9}_rd_{10}_cd_{11}.txt.gz' .format(row_dictionary, column_dictionary, stat, year, datestamp, min_score, universe, n_prior, min_count, association_statistic, row_term_dict, column_term_dict), term_term_stats_all) datasetIO.save_datamatrix( '{0}_{1}_datamatrix_{2}_yr_{3}_ds_{4}_ms_{5!s}_uv_{6}_np_{7!s}_mc_{8!s}_as_{9}_rd_{10}_cd_{11}.pickle' .format(row_dictionary, column_dictionary, stat, year, datestamp, min_score, universe, n_prior, min_count, association_statistic, row_term_dict, column_term_dict), term_term_stats_all) new_stats.append(stat) new_stat_mat = np.append( new_stat_mat, (term_term_stats_all.matrix[IA, JA]).reshape(-1, 1), 1) new_df = pd.DataFrame(data=new_stat_mat, columns=new_stats) dataframe_path = '{0}_{1}_dataframe_yr_{2}_ds_{3}_ms_{4!s}_uv_{5}_np_{6!s}_mc_{7!s}.txt.gz'.format( row_dictionary, column_dictionary, year, datestamp, min_score, universe, n_prior, min_count) joined_dataframe_path = '{0}_{1}_dataframe_yr_{2}_ds_{3}_ms_{4!s}_uv_{5}_np_{6!s}_mc_{7!s}_as_{8}_rd_{9}_cd_{10}.txt.gz'.format( row_dictionary, column_dictionary, year, datestamp, min_score, universe, n_prior, min_count, association_statistic, row_term_dict, column_term_dict) df = pd.read_table(dataframe_path, compression='gzip', index_col=False) joined_df = df.set_index(new_stats[:2]).join( new_df.set_index(new_stats[:2])) joined_df.sort_values(by=association_statistic, ascending=False, inplace=True) joined_df.to_csv(joined_dataframe_path, sep='\t', compression='gzip') # compute classifier performance statistics # note, these are in-sample statistics # we are not worried about overfitting # because we only have one feature # and we are not trying to build a rigorous ML model # we are simply trying to answer the question, # given a reference set of positive and negative examples, # which association statistic ranks term-term pairs the best? 
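# The np.interp calls above map decision-function values for all term-term pairs onto the
# precision and recall curves computed from the reference set; np.interp needs an increasing
# x-grid, hence the np.diff(thresholds) check. A minimal sketch of that interpolation, kept
# as a comment and using hypothetical toy labels and scores rather than pipeline data:
# toy_y = np.array([0, 0, 1, 1, 0, 1, 1, 0])
# toy_scores = np.array([-2.0, -1.0, -0.5, 0.5, 0.2, 1.0, 2.0, -1.5])
# toy_precision, toy_recall, toy_thresholds = precision_recall_curve(toy_y, toy_scores)
# # precision and recall have one more entry than thresholds; the trailing (1, 0) pair has
# # no threshold, so it is dropped before interpolating at new decision-function values
# est_precision = np.interp(np.array([-0.7, 0.0, 1.5]), toy_thresholds, toy_precision[:-1])
# est_recall = np.interp(np.array([-0.7, 0.0, 1.5]), toy_thresholds, toy_recall[:-1])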
print('computing classifier performance statistics...') tn, fp, fn, tp = confusion_matrix(Y, Y_pred).ravel() # incorporate a random prior with effective sample size = n_prior prevalence = (tp + fn) / (tn + fp + fn + tp) tp += n_prior * prevalence / 2 fn += n_prior * prevalence / 2 tn += n_prior * (1 - prevalence) / 2 fp += n_prior * (1 - prevalence) / 2 ap = tp + fn an = fp + tn pp = tp + fp pn = tn + fn n = tn + fp + fn + tp tpr = tp / ap # sensitivity, recall fnr = fn / ap # 1-tpr, 1-sensitivity, 1-recall tnr = tn / an # specificity fpr = fp / an # 1-tnr, 1-specificity ppv = tp / pp # precision fdr = fp / pp # 1-ppv, 1-precision npv = tn / pn fomr = fn / pn # 1-npv acc = (tp + tn) / n mcr = (fp + fn) / n # 1-acc prev = ap / n plr = (tp / fp) / ( ap / an ) # tpr/fpr, sensitivity/(1-specificity), ratio of positives to negatives in positive predictions relative to ratio in whole sample, higher is better nlr = (fn / tn) / ( ap / an ) # fnr/tnr, (1-sensitivity)/specificity, ratio of positives to negatives in negative predictions relative to ratio in whole sample, lower is better dor = (tp / fp) / ( fn / tn ) # plr/nlr, ratio of positives to negatives in positive predictions, divided by ratio of positives to negatives in negative predictions drr = (tp / pp) / ( fn / pn ) # ppv/fomr, relative risk or risk ratio, fraction of positives in positive predictions divided by fraction of positives in negative predictions darr = (tp / pp) - ( fn / pn ) # ppv - fomr, absolute risk reduction, fraction of positives in positive predictions minus fraction of positives in negative predictions mrr = (tp / pp) / ( ap / n ) # ppv/prev, modified (by me) relative risk or risk ratio, fraction of positives in positive predictions divided by fraction of positives in whole sample marr = (tp / pp) - ( ap / n ) # ppv - prev, modified (by me) absolute risk reduction, fraction of positives in positive predictions minus fraction of positives in whole sample f1 = (1 + (1**2)) * ppv * tpr / ((1**2) * ppv + tpr) mcc = (tp * tn - fp * fn) / np.sqrt( (tp + fp) * (tp + fn) * (tn + fp) * (tn + fn)) cos = tp / np.sqrt((tp + fp) * (tp + fn)) # ochiai fnlp = -hypergeom.logsf(tp, n, ap, pp, loc=1) / np.log(10) lrr = np.log10(tp) - np.log10(tp + fp) - np.log10( fn) + np.log10(fn + tn) # log10 of relative risk lrr_se = np.sqrt( fp / tp / (tp + fp) + tn / fn / (fn + tn)) / np.log( 10) # standard error of log10 of relative risk lrr_lb95 = lrr - 1.96 * lrr_se lrr_ub95 = lrr + 1.96 * lrr_se drr_lb95 = 10**lrr_lb95 drr_ub95 = 10**lrr_ub95 lor = np.log10(tp) - np.log10(fp) - np.log10(fn) + np.log10( tn) # log10 of odds ratio lor_se = np.sqrt(1 / tp + 1 / fp + 1 / fn + 1 / tn) / np.log( 10) # standard error of log10 of odds ratio lor_lb95 = lor - 1.96 * lor_se lor_ub95 = lor + 1.96 * lor_se dor_lb95 = 10**lor_lb95 dor_ub95 = 10**lor_ub95 mi, nmi, iqr = mutualinformation( tp, fp, fn, tn ) # mutual information, normalized mutual information, information quality ratio # write to dataframe print('writing to dataframe...') count_stats = [tp, fn, tn, fp, ap, an, pp, pn, n] other_stats = [ auroc, auprc, tpr, fnr, tnr, fpr, ppv, fdr, npv, fomr, acc, mcr, prev, plr, nlr, dor, drr, darr, mrr, marr, f1, mcc, cos, fnlp, lrr, lrr_se, lrr_lb95, lrr_ub95, drr_lb95, drr_ub95, lor, lor_se, lor_lb95, lor_ub95, dor_lb95, dor_ub95, mi, nmi, iqr, min_value_association_statistic ] writelist = [ row_dictionary, column_dictionary, year, datestamp, str(min_score), universe, str(n_prior), str(min_count), association_statistic, reference_datamatrix_path, row_term_dict, 
column_term_dict ] writelist += [str(s) for s in count_stats] writelist += ['{0:1.5g}'.format(s) for s in other_stats] fw.write('\t'.join(writelist) + '\n') print('done benchmark_term-term_stats_from_termite.py')
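# A compact, self-contained sketch (hypothetical toy counts, not pipeline output) of the
# confidence-interval math used for the odds ratio above: work in log10 space, where the
# delta-method standard error of the log10 odds ratio is sqrt(1/tp + 1/fp + 1/fn + 1/tn)/ln(10),
# take +/- 1.96 standard errors, then exponentiate back; the relative-risk interval follows
# the same pattern with its own standard error.
import numpy as np
tp, fp, fn, tn = 30., 10., 20., 40.  # hypothetical 2x2 confusion counts
lor = np.log10(tp) - np.log10(fp) - np.log10(fn) + np.log10(tn)  # log10 odds ratio
lor_se = np.sqrt(1/tp + 1/fp + 1/fn + 1/tn)/np.log(10)  # standard error of log10 odds ratio
dor_lb95, dor_ub95 = 10**(lor - 1.96*lor_se), 10**(lor + 1.96*lor_se)
print('odds ratio: {0:1.3g} (95% CI {1:1.3g} to {2:1.3g})'.format(10**lor, dor_lb95, dor_ub95))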
""" @author: ar988996 """ import sys sys.path.append('../../utilities') import numpy as np import datasetIO import os # load the data print('loading datamatrix 1...', flush=True) dm1_name = 'GTEXv6' dm1_likelihood = 'normal' dm1 = datasetIO.load_datamatrix('../../original_data/GTEXv6/gene_tissue_recount2gtexv6_chosen_samples_scaledcounts_prepared.pickle') dm1.rowmeta['row_mean_ref'] = dm1.rowmeta['row_mean_ref'].reshape(-1) # must be 1d to be compatible with tolabels method later dm1.rowmeta['row_stdv_ref'] = dm1.rowmeta['row_stdv_ref'].reshape(-1) # must be 1d to be compatible with tolabels method later print(dm1, flush=True) print('loading datamatrix 2...', flush=True) dm2_name = 'impc' dm2_likelihood = 'bernoulli' dm2 = datasetIO.load_datamatrix('../../original_data/impc/gene_phenotype_impc_trimmed_thresholded_propagated_prepared.pickle') print(dm2, flush=True) # align rows print('aligning rows...', flush=True) u_rowlabels = np.union1d(dm1.rowlabels, dm2.rowlabels) i_rowlabels = np.intersect1d(dm1.rowlabels, dm2.rowlabels) in_dm1 = np.in1d(u_rowlabels, dm1.rowlabels) in_dm2 = np.in1d(u_rowlabels, dm2.rowlabels)
def main(project_name, hyperparameters, evaluation_statistics, selection_criteria, sigma_multipliers): min_num_hp_combinations = 100 num_gp_optimizer_restarts = 0 # 4 outlier_sigma_multiplier = 6 xline = np.linspace(0, 1, 100, dtype='float64') yline = np.linspace(0, 1, 100, dtype='float64') xmat, ymat = np.meshgrid(xline, yline) Xarr = np.append(xmat.reshape(-1,1), ymat.reshape(-1,1), 1) fxy = 2*Xarr[:,0]*Xarr[:,1]/(Xarr[:,0] + Xarr[:,1] + 1e-6) si = np.argsort(fxy) fxy = fxy[si] Xarr = Xarr[si,:] grid_indices = np.argsort(si) kernel = SumKernel(WhiteKernel(noise_level=1.0, noise_level_bounds=(1e-6, 1e3)), ProductKernel(ConstantKernel(constant_value=1.0, constant_value_bounds=(1e-6, 1e3)), RBFKernel(length_scale=np.array([1.0, 1.0], dtype='float64'), length_scale_bounds=(1e-2, 1e2)))) project_folder = '../../hp_search/{0}'.format(project_name) print('project: {0}...'.format(project_name), flush=True) print('project_folder: {0}...'.format(project_folder), flush=True) search_folders = ['{0}/{1}'.format(project_folder, f) for f in os.listdir(project_folder) if f[:10] == 'hp_search_'] search_ids = [int(f.rsplit('_', maxsplit=1)[-1]) for f in search_folders] print('found {0!s} search folders.'.format(len(search_folders)), flush=True) for search_id, search_folder in zip(search_ids, search_folders): print('working on search_folder: {0}...'.format(search_folder), flush=True) search_data_path = '{0}/hp_search_data.txt'.format(search_folder) search_data_path_with_stats = '{0}/hp_search_data_with_performance_stats.txt'.format(search_folder) print('search_data_path: {0}'.format(search_data_path), flush=True) if os.path.exists(search_data_path) and os.path.getsize(search_data_path) > 0: print('loading search data...', flush=True) df = pd.read_table(search_data_path, index_col=False) if df.shape[0] >= min_num_hp_combinations: print('appending performance stats...', flush=True) if os.path.exists(search_data_path_with_stats) and os.path.getsize(search_data_path_with_stats) > 0: df = pd.read_table(search_data_path_with_stats, index_col=False) else: for stage in ['validation', 'testing']: print('working on {0} stage...'.format(stage), flush=True) for rowidx, combination_id in enumerate(df.combination_id): combination_folder = '{0}/hp_combination_{1!s}'.format(search_folder, combination_id) performance_data_path = '{0}/stat_subset_datamatrix_{1}.txt.gz'.format(combination_folder, stage) if os.path.exists(performance_data_path): stat_subset = datasetIO.load_datamatrix(performance_data_path) if 'stat_mat' not in locals(): stat_mat = np.full((df.shape[0], stat_subset.size), np.nan, dtype='float64') stat_cols = (stage + '_' + stat_subset.rowlabels.reshape(-1,1) + '_' + stat_subset.columnlabels.reshape(1,-1)).reshape(-1) stat_mat[rowidx,:] = stat_subset.matrix.reshape(-1) stat_df = pd.DataFrame(data=stat_mat, columns=stat_cols) stat_df['combination_id'] = df.combination_id.values df = df.set_index('combination_id').join(stat_df.set_index('combination_id')).reset_index() del stat_mat, stat_cols, stat_df df.to_csv(search_data_path_with_stats, sep='\t', index=False) if '{0}_search_domain'.format(hyperparameters[0]) not in df.columns: df['{0}_search_domain'.format(hyperparameters[0])] = 0.5 if '{0}_search_domain'.format(hyperparameters[1]) not in df.columns: df['{0}_search_domain'.format(hyperparameters[1])] = 0.5 if '{0}_model_space'.format(hyperparameters[0]) not in df.columns: df['{0}_model_space'.format(hyperparameters[0])] = 1 if '{0}_model_space'.format(hyperparameters[1]) not in df.columns:
df['{0}_model_space'.format(hyperparameters[1])] = 1 for evaluation_statistic in evaluation_statistics: print('working on performance evaluation statistic: {0}...'.format(evaluation_statistic), flush=True) C = df['combination_id'].values Y_fit = df['validation_{0}_fit'.format(evaluation_statistic)].values Y_fit = np.log10(Y_fit/(1-Y_fit)) Y_predict = df['validation_{0}_predict'.format(evaluation_statistic)].values Y_predict = np.log10(Y_predict/(1-Y_predict)) Y_diff = Y_fit - Y_predict X_1 = df['{0}_search_domain'.format(hyperparameters[0])].values X_2 = df['{0}_search_domain'.format(hyperparameters[1])].values keep = np.isfinite(np.concatenate((Y_fit.reshape(-1,1), Y_predict.reshape(-1,1), Y_diff.reshape(-1,1), X_1.reshape(-1,1), X_2.reshape(-1,1)), 1)).all(1) C = C[keep] Y_fit = Y_fit[keep] Y_predict = Y_predict[keep] Y_diff = Y_diff[keep] X_1 = X_1[keep] X_2 = X_2[keep] X = np.append(X_1.reshape(-1,1), X_2.reshape(-1,1), 1) print('fitting Y_predict...', flush=True) is_outlier = np.zeros(Y_predict.size, dtype='bool') prev_outliers = -1 curr_outliers = 0 num_fits = 0 while curr_outliers - prev_outliers > 0 and not is_outlier.all(): gp_predict = GaussianProcessRegressor(kernel=kernel, alpha=0, n_restarts_optimizer=num_gp_optimizer_restarts, normalize_y=True).fit(X[~is_outlier,:], Y_predict[~is_outlier]) Y_predict_hat_mean, Y_predict_hat_stdv = gp_predict.predict(X, return_std=True) is_outlier = np.abs(Y_predict - Y_predict_hat_mean) > outlier_sigma_multiplier*Y_predict_hat_stdv prev_outliers = curr_outliers curr_outliers = is_outlier.sum() num_fits += 1 print('num_fits', num_fits, 'curr_outliers', curr_outliers, 'prev_outliers', prev_outliers, flush=True) Y_predict_hat_mean, Y_predict_hat_stdv = gp_predict.predict(Xarr, return_std=True) plt.imsave('{0}/{1}_predict_hat_mean_4.png'.format(search_folder, evaluation_statistic), Y_predict_hat_mean[grid_indices].reshape(xmat.shape[0], xmat.shape[1])) plt.imsave('{0}/{1}_predict_hat_stdv_4.png'.format(search_folder, evaluation_statistic), Y_predict_hat_stdv[grid_indices].reshape(xmat.shape[0], xmat.shape[1])) print('fitting Y_diff...', flush=True) is_outlier = np.zeros(Y_diff.size, dtype='bool') prev_outliers = -1 curr_outliers = 0 num_fits = 0 while curr_outliers - prev_outliers > 0 and not is_outlier.all(): gp_diff = GaussianProcessRegressor(kernel=kernel, alpha=0, n_restarts_optimizer=num_gp_optimizer_restarts, normalize_y=True).fit(X[~is_outlier,:], Y_diff[~is_outlier]) Y_diff_hat_mean, Y_diff_hat_stdv = gp_diff.predict(X, return_std=True) is_outlier = np.abs(Y_diff - Y_diff_hat_mean) > outlier_sigma_multiplier*Y_diff_hat_stdv prev_outliers = curr_outliers curr_outliers = is_outlier.sum() num_fits += 1 print('num_fits', num_fits, 'curr_outliers', curr_outliers, 'prev_outliers', prev_outliers, flush=True) Y_diff_hat_mean, Y_diff_hat_stdv = gp_diff.predict(Xarr, return_std=True) plt.imsave('{0}/{1}_diff_hat_mean_4.png'.format(search_folder, evaluation_statistic), Y_diff_hat_mean[grid_indices].reshape(xmat.shape[0], xmat.shape[1])) plt.imsave('{0}/{1}_diff_hat_stdv_4.png'.format(search_folder, evaluation_statistic), Y_diff_hat_stdv[grid_indices].reshape(xmat.shape[0], xmat.shape[1])) for selection_criterion in selection_criteria: print('working on selection criterion: {0}...'.format(selection_criterion), flush=True) for sigma_multiplier in sigma_multipliers: print('working on sigma multiplier: {0}...'.format(sigma_multiplier), flush=True) if selection_criterion == 'optimistic_max': # find hp combinations where Y_predict_hat_mean_max is within 
confidence interval of Y_predict_hat_mean Y_predict_hat_mean_max = Y_predict_hat_mean.max() Y_predict_hat_stdv_max = Y_predict_hat_stdv[Y_predict_hat_mean == Y_predict_hat_mean_max].mean() hit = (Y_predict_hat_mean_max - Y_predict_hat_mean) <= (sigma_multiplier*Y_predict_hat_stdv + 1e-6) # among these hits, find hp combinations where zero, or if no hits then Y_diff_hat_mean_min, is within confidence interval of Y_diff_hat_mean Y_diff_hat_mean_min = np.min(np.abs(Y_diff_hat_mean[hit])) Y_diff_hat_stdv_min = Y_diff_hat_stdv[hit][np.abs(Y_diff_hat_mean[hit]) == Y_diff_hat_mean_min].mean() hit2 = np.logical_and(hit, np.abs(Y_diff_hat_mean) <= (sigma_multiplier*Y_diff_hat_stdv + 1e-6)) if not hit2.any(): hit2 = np.logical_and(hit, (np.abs(Y_diff_hat_mean) - Y_diff_hat_mean_min) <= (sigma_multiplier*Y_diff_hat_stdv + 1e-6)) hit = hit2 # choose least regularized hp combination among the hits (***assumes lower index corresponds to simpler model***) fxy_max = fxy[hit].max() hit = np.logical_and(hit, (fxy_max - fxy) <= 1e-6) hidx = hit.nonzero()[0][-1] elif selection_criterion == 'conservative_max': # find hp combinations where Y_predict_hat_mean_max is within confidence interval of Y_predict_hat_mean Y_predict_hat_mean_max = Y_predict_hat_mean.max() Y_predict_hat_stdv_max = Y_predict_hat_stdv[Y_predict_hat_mean == Y_predict_hat_mean_max].mean() hit = (Y_predict_hat_mean_max - Y_predict_hat_mean) <= (sigma_multiplier*Y_predict_hat_stdv + 1e-6) # among these hits, find hp combinations where zero, or if no hits then Y_diff_hat_mean_min, is within confidence interval of Y_diff_hat_mean Y_diff_hat_mean_min = np.min(np.abs(Y_diff_hat_mean[hit])) Y_diff_hat_stdv_min = Y_diff_hat_stdv[hit][np.abs(Y_diff_hat_mean[hit]) == Y_diff_hat_mean_min].mean() hit2 = np.logical_and(hit, np.abs(Y_diff_hat_mean) <= (sigma_multiplier*Y_diff_hat_stdv + 1e-6)) if not hit2.any(): hit2 = np.logical_and(hit, (np.abs(Y_diff_hat_mean) - Y_diff_hat_mean_min) <= (sigma_multiplier*Y_diff_hat_stdv + 1e-6)) hit = hit2 # choose simplest hp combination among the hits (***assumes lower index corresponds to simpler model***) fxy_max = fxy[hit].max() hit = np.logical_and(hit, (fxy_max - fxy) <= 1e-6) hidx = hit.nonzero()[0][0] elif selection_criterion == 'optimistic_match': # find hp combinations where zero, or if no hits then Y_diff_hat_mean_min, is within confidence interval of Y_diff_hat_mean Y_diff_hat_mean_min = np.min(np.abs(Y_diff_hat_mean)) Y_diff_hat_stdv_min = Y_diff_hat_stdv[np.abs(Y_diff_hat_mean) == Y_diff_hat_mean_min].mean() hit = np.abs(Y_diff_hat_mean) <= (sigma_multiplier*Y_diff_hat_stdv + 1e-6) if not hit.any(): hit = (np.abs(Y_diff_hat_mean) - Y_diff_hat_mean_min) <= (sigma_multiplier*Y_diff_hat_stdv + 1e-6) # among these hits, find hp combinations where Y_predict_hat_mean_max is within confidence interval of Y_predict_hat_mean Y_predict_hat_mean_max = Y_predict_hat_mean[hit].max() Y_predict_hat_stdv_max = Y_predict_hat_stdv[hit][Y_predict_hat_mean[hit] == Y_predict_hat_mean_max].mean() hit = np.logical_and(hit, (Y_predict_hat_mean_max - Y_predict_hat_mean) <= (sigma_multiplier*Y_predict_hat_stdv + 1e-6)) # choose least regularized hp combination among the hits (***assumes lower index corresponds to simpler model***) fxy_max = fxy[hit].max() hit = np.logical_and(hit, (fxy_max - fxy) <= 1e-6) hidx = hit.nonzero()[0][-1] elif selection_criterion == 'conservative_match': # find hp combinations where zero, or if no hits then Y_diff_hat_mean_min, is within confidence interval of Y_diff_hat_mean
Y_diff_hat_mean_min = np.min(np.abs(Y_diff_hat_mean)) Y_diff_hat_stdv_min = Y_diff_hat_stdv[np.abs(Y_diff_hat_mean) == Y_diff_hat_mean_min].mean() hit = np.abs(Y_diff_hat_mean) <= (sigma_multiplier*Y_diff_hat_stdv + 1e-6) if not hit.any(): hit = (np.abs(Y_diff_hat_mean) - Y_diff_hat_mean_min) <= (sigma_multiplier*Y_diff_hat_stdv + 1e-6) # among these hits, find hp combinations where Y_predict_hat_mean_max is within confidence interval of Y_predict_hat_mean Y_predict_hat_mean_max = Y_predict_hat_mean[hit].max() Y_predict_hat_stdv_max = Y_predict_hat_stdv[hit][Y_predict_hat_mean[hit] == Y_predict_hat_mean_max].mean() hit = np.logical_and(hit, (Y_predict_hat_mean_max - Y_predict_hat_mean) <= (sigma_multiplier*Y_predict_hat_stdv + 1e-6)) # choose simplest hp combination among the hits (***assumes lower index corresponds to simpler model***) fxy_max = fxy[hit].max() hit = np.logical_and(hit, (fxy_max - fxy) <= 1e-6) hidx = hit.nonzero()[0][0] elif selection_criterion == 'optimistic_max_0': # find hp combinations where Y_predict_hat_mean_max is within confidence interval of Y_predict_hat_mean Y_predict_hat_mean_max = Y_predict_hat_mean.max() Y_predict_hat_stdv_max = Y_predict_hat_stdv[Y_predict_hat_mean == Y_predict_hat_mean_max].mean() hit = (Y_predict_hat_mean_max - Y_predict_hat_mean) <= (sigma_multiplier*Y_predict_hat_stdv + 1e-6) # choose least regularized hp combination among the hits (***assumes lower index corresponds to simpler model***) fxy_max = fxy[hit].max() hit = np.logical_and(hit, (fxy_max - fxy) <= 1e-6) hidx = hit.nonzero()[0][-1] elif selection_criterion == 'conservative_max_0': # find hp combinations where Y_predict_hat_mean_max is within confidence interval of Y_predict_hat_mean Y_predict_hat_mean_max = Y_predict_hat_mean.max() Y_predict_hat_stdv_max = Y_predict_hat_stdv[Y_predict_hat_mean == Y_predict_hat_mean_max].mean() hit = (Y_predict_hat_mean_max - Y_predict_hat_mean) <= (sigma_multiplier*Y_predict_hat_stdv + 1e-6) # choose simplest hp combination among the hits (***assumes lower index corresponds to simpler model***) fxy_max = fxy[hit].max() hit = np.logical_and(hit, (fxy_max - fxy) <= 1e-6) hidx = hit.nonzero()[0][0] elif selection_criterion == 'optimistic_match_0': # find hp combinations where zero, or if no hits then Y_diff_hat_mean_min, is within confidence interval of Y_diff_hat_mean Y_diff_hat_mean_min = np.min(np.abs(Y_diff_hat_mean)) Y_diff_hat_stdv_min = Y_diff_hat_stdv[np.abs(Y_diff_hat_mean) == Y_diff_hat_mean_min].mean() hit = np.abs(Y_diff_hat_mean) <= sigma_multiplier*Y_diff_hat_stdv + 1e-6 if not hit.any(): hit = (np.abs(Y_diff_hat_mean) - Y_diff_hat_mean_min) <= (sigma_multiplier*Y_diff_hat_stdv + 1e-6) # choose least regularized hp combination among the hits (***assumes lower index corresponds to simpler model***) fxy_max = fxy[hit].max() hit = np.logical_and(hit, (fxy_max - fxy) <= 1e-6) hidx = hit.nonzero()[0][-1] elif selection_criterion == 'conservative_match_0': # find hp combinations where zero, or if no hits then Y_diff_hat_mean_min, is within confidence interval of Y_diff_hat_mean Y_diff_hat_mean_min = np.min(np.abs(Y_diff_hat_mean)) Y_diff_hat_stdv_min = Y_diff_hat_stdv[np.abs(Y_diff_hat_mean) == Y_diff_hat_mean_min].mean() hit = np.abs(Y_diff_hat_mean) <= sigma_multiplier*Y_diff_hat_stdv + 1e-6 if not hit.any(): hit = (np.abs(Y_diff_hat_mean) - Y_diff_hat_mean_min) <= (sigma_multiplier*Y_diff_hat_stdv + 1e-6) # choose simplest hp combination among the hits (***assumes lower index corresponds to simpler model***)
fxy_max = fxy[hit].max() hit = np.logical_and(hit, (fxy_max - fxy) <= 1e-6) hidx = hit.nonzero()[0][0] else: raise ValueError('invalid selection_criterion') X_1_hit, X_2_hit = Xarr[hidx,:] d2 = (df['{0}_search_domain'.format(hyperparameters[0])].values - X_1_hit)**2 + (df['{0}_search_domain'.format(hyperparameters[1])].values - X_2_hit)**2 selidx = np.argmin(d2) combination_id = df['combination_id'][selidx] combination_folder = '{0}/hp_combination_{1!s}'.format(search_folder, combination_id) selected_df = df[df.combination_id == combination_id].copy() selected_df['search_id'] = search_id selected_df['evaluation_statistic'] = evaluation_statistic selected_df['selection_criterion'] = selection_criterion selected_df['sigma_multiplier'] = sigma_multiplier selected_df['Y_diff_hat_stdv_min'] = Y_diff_hat_stdv_min selected_df['Y_diff_hat_mean_min'] = Y_diff_hat_mean_min selected_df['Y_predict_hat_mean_max'] = Y_predict_hat_mean_max selected_df['Y_predict_hat_stdv_max'] = Y_predict_hat_stdv_max selected_df['Y_predict_hat_stdv_hit'] = Y_predict_hat_stdv[hidx] selected_df['Y_predict_hat_mean_hit'] = Y_predict_hat_mean[hidx] selected_df['Y_diff_hat_stdv_hit'] = Y_diff_hat_stdv[hidx] selected_df['Y_diff_hat_mean_hit'] = Y_diff_hat_mean[hidx] selected_df['X_1_hit'] = X_1_hit selected_df['X_2_hit'] = X_2_hit kernel_params = gp_predict.kernel_.get_params() selected_df['kernel_noise_stdv'] = np.sqrt(kernel_params['k1__noise_level']) selected_df['kernel_amplitude'] = kernel_params['k2__k1__constant_value'] selected_df['kernel_X_1_length_scale'], selected_df['kernel_X_2_length_scale'] = kernel_params['k2__k2__length_scale'] print('Y_predict_hat_mean_max: {0:1.3g}'.format(selected_df['Y_predict_hat_mean_max'].values[0]), flush=True) print('Y_predict_hat_stdv_max: {0:1.3g}'.format(selected_df['Y_predict_hat_stdv_max'].values[0]), flush=True) print('kernel_noise_stdv: {0:1.3g}'.format(selected_df['kernel_noise_stdv'].values[0]), flush=True) print('kernel_amplitude: {0:1.3g}'.format(selected_df['kernel_amplitude'].values[0]), flush=True) print('kernel_X_1_length_scale: {0:1.3g}'.format(selected_df['kernel_X_1_length_scale'].values[0]), flush=True) print('kernel_X_2_length_scale: {0:1.3g}'.format(selected_df['kernel_X_2_length_scale'].values[0]), flush=True) print('selected combination_id: {0!s}'.format(combination_id), flush=True) print('selected combination_folder: {0}'.format(combination_folder), flush=True) print('selected {0}_model_space: {1:1.3g}'.format(hyperparameters[0], selected_df['{0}_model_space'.format(hyperparameters[0])].values[0]), flush=True) print('selected {0}_model_space: {1:1.3g}'.format(hyperparameters[1], selected_df['{0}_model_space'.format(hyperparameters[1])].values[0]), flush=True) print('selected validation_{0}_fit: {1:1.3g}'.format(evaluation_statistic, selected_df['validation_{0}_fit'.format(evaluation_statistic)].values[0]), flush=True) print('selected validation_{0}_predict: {1:1.3g}'.format(evaluation_statistic, selected_df['validation_{0}_predict'.format(evaluation_statistic)].values[0]), flush=True) print('selected testing_{0}_fit: {1:1.3g}'.format(evaluation_statistic, selected_df['testing_{0}_fit'.format(evaluation_statistic)].values[0]), flush=True) print('selected testing_{0}_predict: {1:1.3g}'.format(evaluation_statistic, selected_df['testing_{0}_predict'.format(evaluation_statistic)].values[0]), flush=True) print('selected validation_ppv_fit: {0:1.3g}'.format(selected_df['validation_ppv_fit'].values[0]), flush=True) print('selected validation_ppv_predict: 
{0:1.3g}'.format(selected_df['validation_ppv_predict'].values[0]), flush=True) print('selected testing_ppv_fit: {0:1.3g}'.format(selected_df['testing_ppv_fit'].values[0]), flush=True) print('selected testing_ppv_predict: {0:1.3g}'.format(selected_df['testing_ppv_predict'].values[0]), flush=True) print('selected validation_tpr_fit: {0:1.3g}'.format(selected_df['validation_tpr_fit'].values[0]), flush=True) print('selected validation_tpr_predict: {0:1.3g}'.format(selected_df['validation_tpr_predict'].values[0]), flush=True) print('selected testing_tpr_fit: {0:1.3g}'.format(selected_df['testing_tpr_fit'].values[0]), flush=True) print('selected testing_tpr_predict: {0:1.3g}'.format(selected_df['testing_tpr_predict'].values[0]), flush=True) feature_weights_path = '{0}/iter_feature_datamatrix.txt.gz'.format(combination_folder) if os.path.exists(feature_weights_path) and os.path.getsize(feature_weights_path) > 0: iter_feature = datasetIO.load_datamatrix(feature_weights_path) iter_feature.rowmeta[iter_feature.rowname] = iter_feature.rowlabels.copy() iter_feature.rowmeta['combination_id'] = selected_df['combination_id'].values.copy() iter_feature.rowmeta['search_id'] = selected_df['search_id'].values.copy() iter_feature.rowmeta['evaluation_statistic'] = selected_df['evaluation_statistic'].values.copy() iter_feature.rowmeta['selection_criterion'] = selected_df['selection_criterion'].values.copy() iter_feature.rowmeta['sigma_multiplier'] = selected_df['sigma_multiplier'].values.copy() iter_feature.rowname = 'combination_id|search_id|evaluation_statistic|selection_criterion|sigma_multiplier' iter_feature.rowlabels = np.array(['{0!s}|{1!s}|{2}|{3}|{4!s}'.format(ci, si, es, sc, sm) for ci, si, es, sc, sm in zip(iter_feature.rowmeta['combination_id'], iter_feature.rowmeta['search_id'], iter_feature.rowmeta['evaluation_statistic'], iter_feature.rowmeta['selection_criterion'], iter_feature.rowmeta['sigma_multiplier'])], dtype='object') if 'feature_weights_dm' not in locals(): feature_weights_dm = iter_feature else: feature_weights_dm.append(iter_feature, 0) del iter_feature if 'collected_df' not in locals(): collected_df = selected_df else: collected_df = collected_df.append(selected_df, ignore_index=True) del selected_df else: print('missing combination data for search_id {0!s}. there are only {1!s} combinations'.format(search_id, df.shape[0]), flush=True) else: print('missing search data for search_id {0!s}'.format(search_id), flush=True) if np.mod(search_id, 10) == 0: collected_df.to_csv('{0}_selected_hyperparameters_gp_multi_4.csv'.format(project_name), index=False) datasetIO.save_datamatrix('{0}_selected_hyperparameters_gp_multi_feature_weights_4.txt.gz'.format(project_name), feature_weights_dm) collected_df.to_csv('{0}_selected_hyperparameters_gp_multi_4.csv'.format(project_name), index=False) datasetIO.save_datamatrix('{0}_selected_hyperparameters_gp_multi_feature_weights_4.txt.gz'.format(project_name), feature_weights_dm) print('done select_hyperparameters_gp.py', flush=True)
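# A minimal sketch, on synthetic data, of the GP machinery used above. This assumes
# SumKernel/ProductKernel/RBFKernel are aliases for sklearn's kernel classes (kernel sums
# and products can equivalently be written with + and *); the data and names below are
# illustrative only, and n_restarts_optimizer is left at its default.
import numpy as np
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import WhiteKernel, ConstantKernel, RBF
rng = np.random.RandomState(0)
X_toy = rng.rand(50, 2)  # 50 hyperparameter combinations in the unit square
y_toy = np.sin(3*X_toy[:, 0])*np.cos(3*X_toy[:, 1]) + 0.05*rng.randn(50)
kernel_toy = WhiteKernel(1.0, (1e-6, 1e3)) + ConstantKernel(1.0, (1e-6, 1e3))*RBF(np.array([1.0, 1.0]), (1e-2, 1e2))
gp_toy = GaussianProcessRegressor(kernel=kernel_toy, alpha=0, normalize_y=True).fit(X_toy, y_toy)
y_mean_toy, y_stdv_toy = gp_toy.predict(X_toy, return_std=True)  # per-point predictive mean and stdv
is_outlier_toy = np.abs(y_toy - y_mean_toy) > 6*y_stdv_toy  # one pass of the outlier rule used above
print(is_outlier_toy.sum(), 'outliers flagged')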
def main(d): # d is a dictionary containing the auto-encoder design specifications and training phase specifications # RESET DEFAULT GRAPH print('resetting default graph...', flush=True) tf.reset_default_graph() # FINISH CONFIGURATION print('finishing configuration...', flush=True) # specify distribution of initial weights if d['initialization_distribution'] == 'truncnorm': initialization_distribution = tf.truncated_normal # specify activation function if d['activation_function'] == 'tanh': activation_function = {'tf': tf.tanh, 'np': tsdae_apply_functions.tanh} elif d['activation_function'] == 'relu': activation_function = { 'tf': tf.nn.relu, 'np': tsdae_apply_functions.relu } elif d['activation_function'] == 'elu': activation_function = { 'tf': tf.nn.elu, 'np': tsdae_apply_functions.elu } elif d['activation_function'] == 'sigmoid': activation_function = { 'tf': tf.sigmoid, 'np': tsdae_apply_functions.sigmoid } # load data partitions = ['train', 'valid', 'test'] dataset = {} for partition in partitions: dataset[partition] = datasetIO.load_datamatrix('{0}/{1}.pickle'.format( d['input_path'], partition)) d['{0}_examples'.format(partition)] = dataset[partition].shape[0] # get loss weights # we have features with mixed variable types and mixed missingness # strategy is to apply weights to the data points such that each feature has total weight of 1 # for binary features (columnmeta['likelihood'] == 'bernoulli'), balance the weight on the positive and negative classes # for other features, uniform weight zero = 0. half = 0.5 one = 1. posweights = 1 / 2 / (1 + np.nansum(dataset['train'].matrix, 0, keepdims=True)) posweights[:, dataset['train']. columnmeta['likelihood'] != 'bernoulli'] = 1 / np.sum( ~np.isnan(dataset['train']. matrix[:, dataset['train']. columnmeta['likelihood'] != 'bernoulli']), 0, keepdims=True) negweights = 1 / 2 / ( 1 + np.sum(~np.isnan(dataset['train'].matrix), 0, keepdims=True) - np.nansum(dataset['train'].matrix, 0, keepdims=True)) negweights[:, dataset['train']. columnmeta['likelihood'] != 'bernoulli'] = 1 / np.sum( ~np.isnan(dataset['train']. matrix[:, dataset['train'].
columnmeta['likelihood'] != 'bernoulli']), 0, keepdims=True) print('posweights nan:', np.isnan(posweights).any(), flush=True) print('negweights nan:', np.isnan(negweights).any(), flush=True) u_dataset, c_dataset = np.unique(dataset['train'].columnmeta['dataset'], return_counts=True) datasetweights = np.zeros((1, dataset['train'].shape[1]), dtype='float64') for dataset_name, dataset_count in zip(u_dataset, c_dataset): datasetweights[:, dataset['train'].columnmeta['dataset'] == dataset_name] = 1 / u_dataset.size / dataset_count # get parameters for marginal distributions # will sample from marginal distributions to impute missing values # as well as to replace known values with corrupted values # for binary features, model as bernoulli (columnmeta['likelihood'] == 'bernoulli') # for other features, model as gaussian marginalprobabilities = ( 1 + np.nansum(dataset['train'].matrix, 0, keepdims=True)) / ( 2 + np.sum(~np.isnan(dataset['train'].matrix), 0, keepdims=True) ) # posterior mean of beta-bernoulli with prior a=b=1 marginalstdvs = np.nanstd(dataset['train'].matrix, 0, keepdims=True) isbernoullimarginal = (dataset['train'].columnmeta['likelihood'] == 'bernoulli').astype('float64').reshape(1, -1) print('marginalprobabilities nan:', np.isnan(marginalprobabilities).any(), flush=True) print('marginalstdvs nan:', np.isnan(marginalstdvs).any(), flush=True) print('isbernoullimarginal nan:', np.isnan(isbernoullimarginal).any(), flush=True) # assign friendly nan value nanvalue = -666.666 for partition in partitions: dataset[partition].matrix[np.isnan( dataset[partition].matrix)] = nanvalue # create output directory if not os.path.exists(d['output_path']): os.makedirs(d['output_path']) # initialize model architecture (number of layers and dimension of each layer) d['current_dimensions'] = d[ 'all_dimensions'][:d['current_hidden_layer'] + 1] # dimensions of model up to current depth # specify embedding function for current training phase # we want the option of skipping the embedding activation function to apply only to the full model if not d['apply_activation_to_embedding'] and d['current_dimensions'] == d[ 'all_dimensions']: d['current_apply_activation_to_embedding'] = False else: d['current_apply_activation_to_embedding'] = True # initialize assignments of training examples to mini-batches and number of training steps for stochastic gradient descent d['batch_size'] = d['batch_fraction'] * d['train_examples'] batch_ids = create_batch_ids(d['train_examples'], d['batch_size']) d['batches'] = np.unique(batch_ids).size d['steps'] = d['current_epochs'] * d['batches'] # specify path to weights from previous training run d['previous_variables_path'] = '{0}/variables_layer{1!s}_finetuning{2!s}.pickle'.format( d['output_path'], d['previous_hidden_layer'], d['previous_finetuning_run']) d['fix_or_init'] = 'fix' if d[ 'current_finetuning_run'] == 0 else 'init' # fix for pretraining, init for finetuning # specify rows and columns of figure showing data reconstructions d['reconstruction_rows'] = int( np.round(np.sqrt(np.min([100, d['valid_examples']]) / 2))) d['reconstruction_cols'] = 2 * d['reconstruction_rows'] # print some design information print('input path: {0}'.format(d['input_path']), flush=True) print('output path: {0}'.format(d['output_path']), flush=True) print('previous variables path: {0}'.format(d['previous_variables_path']), flush=True) print('previous variables fix or init: {0}'.format(d['fix_or_init']), flush=True) # SAVE CURRENT DESIGN print('saving current design...', flush=True) with 
open('{0}/design_layer{1!s}_finetuning{2!s}.json'.format( d['output_path'], d['current_hidden_layer'], d['current_finetuning_run']), mode='wt', encoding='utf-8', errors='surrogateescape') as fw: json.dump(d, fw, indent=2) # DEFINE REPORTING VARIABLES print('defining reporting variables...', flush=True) reporting_steps = tsdae_design_functions.create_reporting_steps( d['steps'], d['firstcheckpoint'], d['maxstepspercheckpoint']) valid_losses = np.zeros(reporting_steps.size, dtype='float32') train_losses = np.zeros(reporting_steps.size, dtype='float32') valid_noisy_losses = np.zeros(reporting_steps.size, dtype='float32') train_noisy_losses = np.zeros(reporting_steps.size, dtype='float32') valid_losses_normal = np.zeros(reporting_steps.size, dtype='float32') train_losses_normal = np.zeros(reporting_steps.size, dtype='float32') valid_noisy_losses_normal = np.zeros(reporting_steps.size, dtype='float32') train_noisy_losses_normal = np.zeros(reporting_steps.size, dtype='float32') valid_losses_bernoulli = np.zeros(reporting_steps.size, dtype='float32') train_losses_bernoulli = np.zeros(reporting_steps.size, dtype='float32') valid_noisy_losses_bernoulli = np.zeros(reporting_steps.size, dtype='float32') train_noisy_losses_bernoulli = np.zeros(reporting_steps.size, dtype='float32') print('reporting steps:', reporting_steps, flush=True) # DEFINE COMPUTATIONAL GRAPH # define placeholders for input data, use None to allow feeding different numbers of examples print('defining placeholders...', flush=True) training = tf.placeholder(tf.bool, []) noise_prob = tf.placeholder(tf.float32, []) training_and_validation_data_initializer = tf.placeholder( tf.float32, [ dataset['train'].shape[0] + dataset['valid'].shape[0], dataset['train'].shape[1] ]) selection_mask = tf.placeholder( tf.bool, [dataset['train'].shape[0] + dataset['valid'].shape[0]]) pos_weights_initializer = tf.placeholder(tf.float32, [1, dataset['train'].shape[1]]) neg_weights_initializer = tf.placeholder(tf.float32, [1, dataset['train'].shape[1]]) dataset_weights_initializer = tf.placeholder( tf.float32, [1, dataset['train'].shape[1]]) marginal_probabilities_initializer = tf.placeholder( tf.float32, [1, dataset['train'].shape[1]]) marginal_stdvs_initializer = tf.placeholder(tf.float32, [1, dataset['train'].shape[1]]) is_bernoulli_marginal_initializer = tf.placeholder( tf.float32, [1, dataset['train'].shape[1]]) zero_initializer = tf.placeholder(tf.float32, []) half_initializer = tf.placeholder(tf.float32, []) one_initializer = tf.placeholder(tf.float32, []) nan_value_initializer = tf.placeholder(tf.float32, []) # define variables # W contains the weights, bencode contains the biases for encoding, and bdecode contains the biases for decoding print('defining variables...', flush=True) training_and_validation_data = tf.Variable( training_and_validation_data_initializer, trainable=False, collections=[]) pos_weights = tf.Variable(pos_weights_initializer, trainable=False, collections=[]) neg_weights = tf.Variable(neg_weights_initializer, trainable=False, collections=[]) dataset_weights = tf.Variable(dataset_weights_initializer, trainable=False, collections=[]) marginal_probabilities = tf.Variable(marginal_probabilities_initializer, trainable=False, collections=[]) marginal_stdvs = tf.Variable(marginal_stdvs_initializer, trainable=False, collections=[]) is_bernoulli_marginal = tf.Variable(is_bernoulli_marginal_initializer, trainable=False, collections=[]) zero_ = tf.Variable(zero_initializer, trainable=False, collections=[]) half_ = 
tf.Variable(half_initializer, trainable=False, collections=[]) one_ = tf.Variable(one_initializer, trainable=False, collections=[]) nan_value = tf.Variable(nan_value_initializer, trainable=False, collections=[]) if os.path.exists(d['previous_variables_path']): # update variables (if continuing from a previous training run) print('loading previous variables...', flush=True) global_step, W, bencode, bdecode = update_variables( d['current_dimensions'], initialization_distribution, d['initialization_sigma'], d['previous_variables_path'], d['fix_or_init'], d['include_global_step']) elif (d['current_hidden_layer'] == 1 and d['current_finetuning_run'] == 0) or d['skip_layerwise_training']: # create variables global_step, W, bencode, bdecode = create_variables( d['current_dimensions'], initialization_distribution, d['initialization_sigma']) else: raise ValueError('could not find previous variables') # define model # h contains the activations from input layer to bottleneck layer # hhat contains the activations from bottleneck layer to output layer # xhat is a reference to the output layer (i.e. the reconstruction) print('defining model...', flush=True) x = tf.boolean_mask(training_and_validation_data, selection_mask) is_positive = tf.to_float(tf.greater(x, zero_)) is_missing = tf.to_float(tf.equal(x, nan_value)) loss_weights = ( pos_weights * is_positive + neg_weights * (one_ - is_positive) ) * ( one_ - is_missing ) * dataset_weights # missing values won't be included in loss calculation loss_weights = loss_weights / tf.reduce_mean(loss_weights) normal_loss_weights = loss_weights * (one_ - is_bernoulli_marginal) bernoulli_loss_weights = loss_weights * is_bernoulli_marginal normal_noise = tf.truncated_normal(tf.shape(x), mean=zero_, stddev=one_) * marginal_stdvs bernoulli_noise = tf.to_float( tf.random_uniform(tf.shape(x), minval=zero_, maxval=one_) <= marginal_probabilities) noise = bernoulli_noise * is_bernoulli_marginal + normal_noise * ( one_ - is_bernoulli_marginal) random_noise_mask = tf.to_float( tf.random_uniform(tf.shape(x)) <= noise_prob ) # replace missing values and random fraction of known values with noise structured_noise_mask = tf.to_float( tf.random_uniform((tf.shape(x)[tf.to_int32(zero_)], tf.to_int32(one_))) <= noise_prob) * tf.abs( tf.to_float( tf.random_uniform((tf.shape(x)[tf.to_int32(zero_)], tf.to_int32(one_))) <= half_) - is_bernoulli_marginal) noise_mask = random_noise_mask + structured_noise_mask - ( random_noise_mask * structured_noise_mask) x = x + is_missing * (noise - x) xnoisy = x + noise_mask * (noise - x) h, hhat, xhat_preactivation = create_autoencoder( xnoisy, activation_function['tf'], False, d['current_apply_activation_to_embedding'], d['use_batchnorm'], training, W, bencode, bdecode) # normal_loss = tf.squared_difference(x, xhat_preactivation) # bernoulli_loss = tf.nn.sigmoid_cross_entropy_with_logits(labels=x, logits=xhat_preactivation) # loss = tf.reduce_sum(loss_weights*(bernoulli_loss*is_bernoulli_marginal + normal_loss*(one_-is_bernoulli_marginal)))/tf.reduce_sum(loss_weights) normal_loss = tf.reduce_sum(normal_loss_weights * tf.squared_difference( x, xhat_preactivation)) / tf.reduce_sum(normal_loss_weights) bernoulli_loss = tf.reduce_sum( bernoulli_loss_weights * tf.nn.sigmoid_cross_entropy_with_logits( labels=x, logits=xhat_preactivation)) / tf.reduce_sum(bernoulli_loss_weights) loss = normal_loss + bernoulli_loss # define optimizer and training function print('defining optimizer and training function...', flush=True) optimizer = 
tf.train.AdamOptimizer(learning_rate=d['learning_rate'], epsilon=d['epsilon'], beta1=d['beta1'], beta2=d['beta2']) train_ops = optimizer.minimize(loss, global_step=global_step) # define update ops and add to train ops (if using batch norm) if d['use_batchnorm']: update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS) train_ops = [train_ops, update_ops] # collect batch norm variables if d['use_batchnorm']: bn_gammas = tf.global_variables( scope='batch_normalization.{0,2}/gamma:0') print(bn_gammas, flush=True) bn_betas = tf.global_variables( scope='batch_normalization.{0,2}/beta:0') bn_moving_means = tf.global_variables( scope='batch_normalization.{0,2}/moving_mean:0') bn_moving_variances = tf.global_variables( scope='batch_normalization.{0,2}/moving_variance:0') # define bottleneck layer preactivation # bottleneck_preactivation = tf.matmul(h[-2], W[-1]) + bencode[-1] # INITIALIZE TENSORFLOW SESSION print('initializing tensorflow session...', flush=True) init = tf.global_variables_initializer() session_config = configure_session(d['processor'], d['gpu_memory_fraction']) with tf.Session(config=session_config) as sess: sess.run(init) # TRAINING print('training...', flush=True) sess.run(training_and_validation_data.initializer, feed_dict={ training_and_validation_data_initializer: np.append(dataset['train'].matrix, dataset['valid'].matrix, 0) }) sess.run(pos_weights.initializer, feed_dict={pos_weights_initializer: posweights}) sess.run(neg_weights.initializer, feed_dict={neg_weights_initializer: negweights}) sess.run(dataset_weights.initializer, feed_dict={dataset_weights_initializer: datasetweights}) sess.run(marginal_probabilities.initializer, feed_dict={ marginal_probabilities_initializer: marginalprobabilities }) sess.run(marginal_stdvs.initializer, feed_dict={marginal_stdvs_initializer: marginalstdvs}) sess.run( is_bernoulli_marginal.initializer, feed_dict={is_bernoulli_marginal_initializer: isbernoullimarginal}) sess.run(zero_.initializer, feed_dict={zero_initializer: zero}) sess.run(half_.initializer, feed_dict={half_initializer: half}) sess.run(one_.initializer, feed_dict={one_initializer: one}) sess.run(nan_value.initializer, feed_dict={nan_value_initializer: nanvalue}) validation_id = -1 batch_and_validation_ids = np.full(dataset['train'].shape[0] + dataset['valid'].shape[0], validation_id, dtype=batch_ids.dtype) is_train = np.append(np.ones(dataset['train'].shape[0], dtype='bool'), np.zeros(dataset['valid'].shape[0], dtype='bool')) is_valid = ~is_train training_step = 0 i = 0 overfitting_score = 0 stopearly = False starttime = time.time() with open('{0}/log_layer{1!s}_finetuning{2!s}.txt'.format( d['output_path'], d['current_hidden_layer'], d['current_finetuning_run']), mode='wt', buffering=1) as fl: fl.write('\t'.join([ 'step', 'train_loss', 'valid_loss', 'train_noisy_loss', 'valid_noisy_loss', 'train_loss_normal', 'valid_loss_normal', 'train_noisy_loss_normal', 'valid_noisy_loss_normal', 'train_loss_bernoulli', 'valid_loss_bernoulli', 'train_noisy_loss_bernoulli', 'valid_noisy_loss_bernoulli', 'time' ]) + '\n') for epoch in range(d['current_epochs']): if stopearly: break # randomize assignment of training examples to batches np.random.shuffle(batch_ids) batch_and_validation_ids[is_train] = batch_ids for batch in range(d['batches']): training_step += 1 # select mini-batch selected = batch_and_validation_ids == batch # update weights sess.run(train_ops, feed_dict={ training: True, selection_mask: selected, noise_prob: d['noise_probability'] }) # record training and validation errors 
if training_step == reporting_steps[i]: train_losses[i], train_losses_normal[ i], train_losses_bernoulli[i] = sess.run( [loss, normal_loss, bernoulli_loss], feed_dict={ training: False, selection_mask: is_train, noise_prob: 0 }) train_noisy_losses[i], train_noisy_losses_normal[ i], train_noisy_losses_bernoulli[i] = sess.run( [loss, normal_loss, bernoulli_loss], feed_dict={ training: False, selection_mask: is_train, noise_prob: d['noise_probability'] }) valid_losses[i], valid_losses_normal[ i], valid_losses_bernoulli[i] = sess.run( [loss, normal_loss, bernoulli_loss], feed_dict={ training: False, selection_mask: is_valid, noise_prob: 0 }) valid_noisy_losses[i], valid_noisy_losses_normal[ i], valid_noisy_losses_bernoulli[i] = sess.run( [loss, normal_loss, bernoulli_loss], feed_dict={ training: False, selection_mask: is_valid, noise_prob: d['noise_probability'] }) print( 'step:{0:1.6g}, trn:{1:1.3g}, vld:{2:1.3g}, trnn:{3:1.3g}, vldn:{4:1.3g}, trnN:{5:1.3g}, vldN:{6:1.3g}, trnnN:{7:1.3g}, vldnN:{8:1.3g}, trnB:{9:1.3g}, vldB:{10:1.3g}, trnnB:{11:1.3g}, vldnB:{12:1.3g}, time:{13:1.6g}' .format(reporting_steps[i], train_losses[i], valid_losses[i], train_noisy_losses[i], valid_noisy_losses[i], train_losses_normal[i], valid_losses_normal[i], train_noisy_losses_normal[i], valid_noisy_losses_normal[i], train_losses_bernoulli[i], valid_losses_bernoulli[i], train_noisy_losses_bernoulli[i], valid_noisy_losses_bernoulli[i], time.time() - starttime), flush=True) fl.write('\t'.join([ '{0:1.6g}'.format(x) for x in [ reporting_steps[i], train_losses[i], valid_losses[i], train_noisy_losses[i], valid_noisy_losses[i], train_losses_normal[i], valid_losses_normal[i], train_noisy_losses_normal[i], valid_noisy_losses_normal[i], train_losses_bernoulli[i], valid_losses_bernoulli[i], train_noisy_losses_bernoulli[i], valid_noisy_losses_bernoulli[i], time.time() - starttime ] ]) + '\n') # save current weights, reconstructions, and projections if training_step >= d[ 'startsavingstep'] or training_step == reporting_steps[ -1]: with open( '{0}/intermediate_variables_layer{1!s}_finetuning{2!s}_step{3!s}.pickle' .format(d['output_path'], d['current_hidden_layer'], d['current_finetuning_run'], training_step), 'wb') as fw: pickle.dump( (sess.run(global_step), sess.run(W), sess.run(bencode), sess.run(bdecode)), fw) if d['use_batchnorm']: with open( '{0}/intermediate_batchnorm_variables_layer{1!s}_finetuning{2!s}_step{3!s}.pickle' .format(d['output_path'], d['current_hidden_layer'], d['current_finetuning_run'], training_step), 'wb') as fw: pickle.dump( (sess.run(bn_gammas), sess.run(bn_betas), sess.run(bn_moving_means), sess.run(bn_moving_variances)), fw) # stop early if overfitting if valid_losses[i] >= 1.01 * (np.insert( valid_losses[:i], 0, np.inf).min()): overfitting_score += 1 else: overfitting_score = 0 if overfitting_score == d['overfitting_score_max']: stopearly = True print('stopping early!', flush=True) break i += 1 # end tensorflow session print('closing tensorflow session...', flush=True) # ROLL BACK IF OVERFITTING if stopearly: print('rolling back...', flush=True) reporting_steps = reporting_steps[:i + 1] train_losses = train_losses[:i + 1] valid_losses = valid_losses[:i + 1] train_noisy_losses = train_noisy_losses[:i + 1] valid_noisy_losses = valid_noisy_losses[:i + 1] # selected_step = max([reporting_steps[i-d['overfitting_score_max']], d['startsavingstep']]) else: print('completed all training steps...', flush=True) # selected_step = reporting_steps[-1] selected_step = min([ 
max([reporting_steps[np.argmin(valid_losses)], d['startsavingstep']]), reporting_steps[-1] ]) print('selected step:{0}...'.format(selected_step), flush=True) # SAVE RESULTS print('saving results...', flush=True) with open( '{0}/optimization_path_layer{1!s}_finetuning{2!s}.pickle'.format( d['output_path'], d['current_hidden_layer'], d['current_finetuning_run']), 'wb') as fw: pickle.dump( { 'reporting_steps': reporting_steps, 'valid_losses': valid_losses, 'train_losses': train_losses, 'valid_noisy_losses': valid_noisy_losses, 'train_noisy_losses': train_noisy_losses }, fw) if d['current_dimensions'] == d['all_dimensions'] and ( not d['use_finetuning'] or d['current_finetuning_run'] > 0): shutil.copyfile( '{0}/intermediate_variables_layer{1!s}_finetuning{2!s}_step{3!s}.pickle' .format(d['output_path'], d['current_hidden_layer'], d['current_finetuning_run'], selected_step), '{0}/variables_layer{1!s}_finetuning{2!s}.pickle'.format( d['output_path'], d['current_hidden_layer'], d['current_finetuning_run'])) if d['use_batchnorm']: shutil.copyfile( '{0}/intermediate_batchnorm_variables_layer{1!s}_finetuning{2!s}_step{3!s}.pickle' .format(d['output_path'], d['current_hidden_layer'], d['current_finetuning_run'], selected_step), '{0}/batchnorm_variables_layer{1!s}_finetuning{2!s}.pickle'. format(d['output_path'], d['current_hidden_layer'], d['current_finetuning_run'])) else: shutil.move( '{0}/intermediate_variables_layer{1!s}_finetuning{2!s}_step{3!s}.pickle' .format(d['output_path'], d['current_hidden_layer'], d['current_finetuning_run'], selected_step), '{0}/variables_layer{1!s}_finetuning{2!s}.pickle'.format( d['output_path'], d['current_hidden_layer'], d['current_finetuning_run'])) if d['use_batchnorm']: shutil.move( '{0}/intermediate_batchnorm_variables_layer{1!s}_finetuning{2!s}_step{3!s}.pickle' .format(d['output_path'], d['current_hidden_layer'], d['current_finetuning_run'], selected_step), '{0}/batchnorm_variables_layer{1!s}_finetuning{2!s}.pickle'. format(d['output_path'], d['current_hidden_layer'], d['current_finetuning_run'])) with open( '{0}/variables_layer{1!s}_finetuning{2!s}.pickle'.format( d['output_path'], d['current_hidden_layer'], d['current_finetuning_run']), 'rb') as fr: W, Be, Bd = pickle.load(fr)[1:] # global_step, W, bencode, bdecode if d['use_batchnorm']: with open( '{0}/batchnorm_variables_layer{1!s}_finetuning{2!s}.pickle'. 
format(d['output_path'], d['current_hidden_layer'], d['current_finetuning_run']), 'rb') as fr: batchnorm_variables = pickle.load( fr) # gammas, betas, moving_means, moving_variances batchnorm_encode_variables, batchnorm_decode_variables = tsdae_apply_functions.align_batchnorm_variables( batchnorm_variables, d['current_apply_activation_to_embedding'], d['apply_activation_to_output']) recon = {} embed = {} error = {} embed_preactivation = {} for partition in partitions: if d['use_batchnorm']: recon[partition], embed[partition], error[ partition] = tsdae_apply_functions.encode_and_decode( dataset[partition], W, Be, Bd, activation_function['np'], d['current_apply_activation_to_embedding'], d['apply_activation_to_output'], dataset['train'].columnmeta['likelihood'] == 'bernoulli', return_embedding=True, return_reconstruction_error=True, bn_encode_variables=batchnorm_encode_variables, bn_decode_variables=batchnorm_decode_variables) embed_preactivation[partition] = tsdae_apply_functions.encode( dataset[partition], W, Be, activation_function['np'], apply_activation_to_embedding=False, bn_variables=batchnorm_encode_variables) else: recon[partition], embed[partition], error[ partition] = tsdae_apply_functions.encode_and_decode( dataset[partition], W, Be, Bd, activation_function['np'], d['current_apply_activation_to_embedding'], d['apply_activation_to_output'], dataset['train'].columnmeta['likelihood'] == 'bernoulli', return_embedding=True, return_reconstruction_error=True) embed_preactivation[partition] = tsdae_apply_functions.encode( dataset[partition], W, Be, activation_function['np'], apply_activation_to_embedding=False) print('{0} reconstruction error: {1:1.3g}'.format( partition, error[partition]), flush=True) if d['current_dimensions'] == d['all_dimensions'] and ( not d['use_finetuning'] or d['current_finetuning_run'] > 0): datasetIO.save_datamatrix( '{0}/{1}_embedding_layer{2!s}_finetuning{3!s}.pickle'.format( d['output_path'], partition, d['current_hidden_layer'], d['current_finetuning_run']), embed[partition]) datasetIO.save_datamatrix( '{0}/{1}_embedding_layer{2!s}_finetuning{3!s}.txt.gz'.format( d['output_path'], partition, d['current_hidden_layer'], d['current_finetuning_run']), embed[partition]) if d['current_apply_activation_to_embedding']: datasetIO.save_datamatrix( '{0}/{1}_embedding_preactivation_layer{2!s}_finetuning{3!s}.pickle' .format(d['output_path'], partition, d['current_hidden_layer'], d['current_finetuning_run']), embed_preactivation[partition]) datasetIO.save_datamatrix( '{0}/{1}_embedding_preactivation_layer{2!s}_finetuning{3!s}.txt.gz' .format(d['output_path'], partition, d['current_hidden_layer'], d['current_finetuning_run']), embed_preactivation[partition]) # PLOT LOSS print('plotting loss...', flush=True) fg, ax = plt.subplots(1, 1, figsize=(3.25, 2.25)) ax.set_position([0.55 / 3.25, 0.45 / 2.25, 2.6 / 3.25, 1.7 / 2.25]) ax.semilogx(reporting_steps, train_losses, ':r', linewidth=1, label='train') ax.semilogx(reporting_steps, valid_losses, '-g', linewidth=1, label='valid') ax.semilogx(reporting_steps, train_noisy_losses, '--b', linewidth=1, label='train,noisy') ax.semilogx(reporting_steps, valid_noisy_losses, '-.k', linewidth=1, label='valid,noisy') ax.legend(loc='best', fontsize=8) ax.set_ylabel('loss', fontsize=8) ax.set_xlabel('steps (selected step:{0!s})'.format(selected_step), fontsize=8) ax.set_xlim(reporting_steps[0] - 1, reporting_steps[-1] + 1) # ax.set_ylim(0, 1) ax.tick_params(axis='both', which='major', left=True, right=True, bottom=True, top=False, 
labelleft=True, labelright=False, labelbottom=True, labeltop=False, labelsize=8) fg.savefig('{0}/optimization_path_layer{1!s}_finetuning{2!s}.png'.format( d['output_path'], d['current_hidden_layer'], d['current_finetuning_run']), transparent=True, pad_inches=0, dpi=600) plt.close() # PLOT RECONSTRUCTIONS print('plotting reconstructions...', flush=True) num_recons = min([ d['reconstruction_rows'] * d['reconstruction_cols'], dataset['valid'].shape[0] ]) x_valid = dataset['valid'].matrix[:num_recons, dataset['train']. columnmeta['likelihood'] != 'bernoulli'] xr_valid = recon['valid'].matrix[:num_recons, dataset['train']. columnmeta['likelihood'] != 'bernoulli'] if x_valid.shape[1] > 1000: x_valid = x_valid[:, :1000] xr_valid = xr_valid[:, :1000] lb = np.append(x_valid, xr_valid, 1).min(1) ub = np.append(x_valid, xr_valid, 1).max(1) fg, axs = plt.subplots(2 * d['reconstruction_rows'], d['reconstruction_cols'], figsize=(6.5, 6.5)) for i, ax in enumerate( axs.reshape(-1)[:d['reconstruction_rows'] * d['reconstruction_cols']]): if i < num_recons: ax.plot(x_valid[i, :], xr_valid[i, :], 'ok', markersize=0.5, markeredgewidth=0, alpha=0.1) ax.set_ylim(lb[i], ub[i]) ax.set_xlim(lb[i], ub[i]) ax.tick_params(axis='both', which='major', left=False, right=False, bottom=False, top=False, labelleft=False, labelright=False, labelbottom=False, labeltop=False, pad=4) ax.set_frame_on(False) ax.axvline(lb[i], linewidth=1, color='k') ax.axvline(ub[i], linewidth=1, color='k') ax.axhline(lb[i], linewidth=1, color='k') ax.axhline(ub[i], linewidth=1, color='k') else: fg.delaxes(ax) x_valid = dataset['valid'].matrix[:num_recons, dataset['train']. columnmeta['likelihood'] == 'bernoulli'] xr_valid = recon['valid'].matrix[:num_recons, dataset['train']. columnmeta['likelihood'] == 'bernoulli'] if x_valid.shape[1] > 1000: x_valid = x_valid[:, :1000] xr_valid = xr_valid[:, :1000] x_valid = x_valid.astype('bool') lb = -0.05 ub = 1.05 for i, ax in enumerate( axs.reshape(-1)[d['reconstruction_rows'] * d['reconstruction_cols']:]): if i < num_recons: ax.boxplot( [xr_valid[i, ~x_valid[i, :]], xr_valid[i, x_valid[i, :]]], positions=[0.2, 0.8]) ax.set_ylim(lb, ub) ax.set_xlim(lb, ub) ax.tick_params(axis='both', which='major', left=False, right=False, bottom=False, top=False, labelleft=False, labelright=False, labelbottom=False, labeltop=False, pad=4) ax.set_frame_on(False) ax.axvline(lb, linewidth=1, color='k') ax.axvline(ub, linewidth=1, color='k') ax.axhline(lb, linewidth=1, color='k') ax.axhline(ub, linewidth=1, color='k') else: fg.delaxes(ax) fg.savefig('{0}/reconstructions_layer{1!s}_finetuning{2!s}.png'.format( d['output_path'], d['current_hidden_layer'], d['current_finetuning_run']), transparent=True, pad_inches=0, dpi=1200) plt.close() # PLOT 2D EMBEDDING if d['current_dimensions'][-1] == 2 and (not d['use_finetuning'] or d['current_finetuning_run'] > 0): print('plotting 2d embedding...', flush=True) fg, ax = plt.subplots(1, 1, figsize=(6.5, 6.5)) ax.set_position([0.15 / 6.5, 0.15 / 6.5, 6.2 / 6.5, 6.2 / 6.5]) ax.plot(embed['train'].matrix[:, 0], embed['train'].matrix[:, 1], 'ok', markersize=2, markeredgewidth=0, alpha=0.5, zorder=0) ax.plot(embed['valid'].matrix[:, 0], embed['valid'].matrix[:, 1], 'or', markersize=2, markeredgewidth=0, alpha=1.0, zorder=1) ax.tick_params(axis='both', which='major', bottom=False, top=False, labelbottom=False, labeltop=False, left=False, right=False, labelleft=False, labelright=False, pad=4) ax.set_frame_on(False) fg.savefig('{0}/embedding_layer{1!s}_finetuning{2!s}.png'.format( 
d['output_path'], d['current_hidden_layer'], d['current_finetuning_run']), transparent=True, pad_inches=0, dpi=600) plt.close() if d['current_apply_activation_to_embedding']: fg, ax = plt.subplots(1, 1, figsize=(6.5, 6.5)) ax.set_position([0.15 / 6.5, 0.15 / 6.5, 6.2 / 6.5, 6.2 / 6.5]) ax.plot(embed_preactivation['train'].matrix[:, 0], embed_preactivation['train'].matrix[:, 1], 'ok', markersize=2, markeredgewidth=0, alpha=0.5, zorder=0) ax.plot(embed_preactivation['valid'].matrix[:, 0], embed_preactivation['valid'].matrix[:, 1], 'or', markersize=2, markeredgewidth=0, alpha=1.0, zorder=1) ax.tick_params(axis='both', which='major', bottom=False, top=False, labelbottom=False, labeltop=False, left=False, right=False, labelleft=False, labelright=False, pad=4) ax.set_frame_on(False) fg.savefig( '{0}/embedding_preactivation_layer{1!s}_finetuning{2!s}.png'. format(d['output_path'], d['current_hidden_layer'], d['current_finetuning_run']), transparent=True, pad_inches=0, dpi=600) plt.close() print('done training phase.', flush=True) return d['current_hidden_layer'], d['current_finetuning_run'], d[ 'current_epochs']
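# NOTE: a minimal sketch of the feedforward pass assumed by the calls above.
# The real tsdae_apply_functions.encode is defined elsewhere and is not shown
# in this file; encode_sketch is a hypothetical name, it operates on a plain
# array rather than a datamatrix, and it omits the optional batchnorm
# variables for brevity. It only documents the expected roles of W, Be, the
# activation function, and the apply_activation_to_embedding switch.
import numpy as np

def encode_sketch(X, W, Be, activation, apply_activation_to_embedding=True):
    """Map a (samples x features) array through the encoder layers."""
    H = X
    for layer, (w, b) in enumerate(zip(W, Be)):
        H = H.dot(w) + b
        # the embedding layer is the last one; its activation is optional,
        # matching d['current_apply_activation_to_embedding'] above
        if layer < len(W) - 1 or apply_activation_to_embedding:
            H = activation(H)
    return H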
def main(validation_rep=0, validation_fold=0): # load dataset info print('loading dataset info...', flush=True) dataset_info_path = 'datasets/merged_features/rep{0!s}_fold{1!s}/dataset_info.txt'.format( validation_rep, validation_fold) dataset_info = datasetIO.load_datasetinfo(dataset_info_path)[0] # load validation examples print('loading validation examples...', flush=True) validation_examples_path = 'targets/validation_examples/rep{0!s}_fold{1!s}.txt'.format( validation_rep, validation_fold) with open(validation_examples_path, mode='rt', encoding='utf-8', errors='surrogateescape') as fr: validation_examples = fr.read().split('\n') # specify results folder print('specifying results folder...', flush=True) results_folder = 'datasets/useful_features/rep{0!s}_fold{1!s}'.format( validation_rep, validation_fold) results_folder_parts = results_folder.split('/') for i in range(len(results_folder_parts)): results_folder_part = '/'.join(results_folder_parts[:i + 1]) if not os.path.exists(results_folder_part): os.mkdir(results_folder_part) # load dataset print('loading dataset {0}...'.format(dataset_info['abbreviation']), flush=True) gene_atb = datasetIO.load_datamatrix(datasetpath=dataset_info['path']) # specify cross-validation parameters print('specifying cross-validation parameters...', flush=True) reps = 20 folds = 5 rf_trees = 1000 include_logistic_regression = True skf = StratifiedKFold(n_splits=folds, shuffle=True) print(' reps: {0!s}'.format(reps)) print(' folds: {0!s}'.format(folds)) # initialize models print('initializing models...', flush=True) rfmodel = RandomForestClassifier(n_estimators=rf_trees, oob_score=False, n_jobs=-1, class_weight='balanced') print(rfmodel) lrmodel = LogisticRegression(penalty='l2', dual=False, tol=0.0001, C=1e3, fit_intercept=True, intercept_scaling=1e3, class_weight='balanced', random_state=None, solver='liblinear', max_iter=100, multi_class='ovr', verbose=0, warm_start=False, n_jobs=1) print(lrmodel) # initialize data matrices for collecting model feature importances and cross-validation performance stats print( 'initializing data matrices for collecting model feature importances and cross-validation performance stats...', flush=True) classifier_stats = np.array([ 'p', 'n', 'ap', 'an', 'pp', 'pn', 'tp', 'fp', 'tn', 'fn', 'tpr', 'fpr', 'auroc', 'fnr', 'tnr', 'mcr', 'acc', 'fdr', 'ppv', 'auprc', 'fomr', 'npv', 'plr', 'nlr', 'dor', 'drr', 'darr', 'mrr', 'marr', 'f1s', 'mcc', 'fnlp' ], dtype='object') sm = dataclasses.datamatrix( rowname='classifier_performance_stat', rowlabels=classifier_stats.copy(), rowmeta={}, columnname='model', columnlabels=np.array(['M' + str(x) for x in range(gene_atb.shape[1])], dtype='object'), columnmeta={ 'num_features': np.zeros(gene_atb.shape[1], dtype='int64'), 'features': np.full(gene_atb.shape[1], '', dtype='object'), 'oob_score': np.zeros(gene_atb.shape[1], dtype='float64') }, matrixname='crossvalidation_classifier_performance_stats_vs_models', matrix=np.zeros((classifier_stats.size, gene_atb.shape[1]), dtype='float64')) stat_model_rf_mean = copy.deepcopy(sm) stat_model_rf_stdv = copy.deepcopy(sm) stat_model_lr_mean = copy.deepcopy(sm) stat_model_lr_stdv = copy.deepcopy(sm) del sm fm = dataclasses.datamatrix( rowname=gene_atb.columnname, rowlabels=gene_atb.columnlabels.copy(), rowmeta=copy.deepcopy(gene_atb.columnmeta), columnname='model', columnlabels=np.array(['M' + str(x) for x in range(gene_atb.shape[1])], dtype='object'), columnmeta={ 'num_features': np.zeros(gene_atb.shape[1], dtype='int64'), 'features': 
np.full(gene_atb.shape[1], '', dtype='object'), 'oob_score': np.zeros(gene_atb.shape[1], dtype='float64') }, matrixname='model_feature_importances', matrix=np.zeros((gene_atb.shape[1], gene_atb.shape[1]), dtype='float64')) feature_model_rf = copy.deepcopy(fm) feature_model_lr = copy.deepcopy(fm) del fm # exclude validation and unlabeled examples from cross-validation loop print( 'excluding validation and unlabeled examples from cross-validation loop...', flush=True) isvalidation = np.in1d(gene_atb.rowlabels, validation_examples) isunknown = gene_atb.rowmeta['class'] == 'unknown' istraintest = ~np.logical_or(isvalidation, isunknown) Y = (gene_atb.rowmeta['class'][istraintest] == 'positive') #X = gene_atb.matrix[istraintest,:] # perform incremental feature elimination with cross-validation print( 'performing incremental feature elimination with cross-validation...', flush=True) for i in range(gene_atb.shape[1]): print(' features: {0!s}...'.format(gene_atb.shape[1] - i), flush=True) if i == 0: hit_rf = np.ones(gene_atb.shape[1], dtype='bool') hit_lr = np.ones(gene_atb.shape[1], dtype='bool') else: hit_rf = feature_model_rf.matrix[:, i - 1] > feature_model_rf.matrix[ feature_model_rf. matrix[:, i - 1] > 0, i - 1].min() #hit_lr = feature_model_lr.matrix[:,i-1] > feature_model_lr.matrix[feature_model_lr.matrix[:,i-1] > 0,i-1].min() hit_lr = hit_rf X_rf = gene_atb.matrix[istraintest, :][:, hit_rf] X_lr = gene_atb.matrix[istraintest, :][:, hit_lr] stat_rep_rf = np.zeros((classifier_stats.size, reps), dtype='float64') stat_rep_lr = np.zeros((classifier_stats.size, reps), dtype='float64') fi_rep_rf = np.zeros((X_rf.shape[1], reps), dtype='float64') fi_rep_lr = np.zeros((X_lr.shape[1], reps), dtype='float64') for rep in range(reps): print(' rep {0!s} of {1!s}...'.format(rep + 1, reps), flush=True) Ptest_rf = np.zeros(Y.size, dtype='float64') Ptest_lr = np.zeros(Y.size, dtype='float64') fi_fold_rf = np.zeros((X_rf.shape[1], folds), dtype='float64') fi_fold_lr = np.zeros((X_lr.shape[1], folds), dtype='float64') for fold, (train_indices, test_indices) in enumerate(skf.split(X_rf, Y)): print(' fold {0!s} of {1!s}...'.format( fold + 1, folds), flush=True) Y_train = Y[train_indices] X_rf_train = X_rf[train_indices] X_lr_train = X_lr[train_indices] #Y_test = Y[test_indices] X_rf_test = X_rf[test_indices] X_lr_test = X_lr[test_indices] rfmodel.fit(X_rf_train, Y_train) Ptest_rf[test_indices] = rfmodel.predict_proba( X_rf_test)[:, rfmodel.classes_ == 1].reshape(-1) fi_fold_rf[:, fold] = rfmodel.feature_importances_ lrmodel.fit(X_lr_train, Y_train) Ptest_lr[test_indices] = lrmodel.predict_proba( X_lr_test)[:, lrmodel.classes_ == 1].reshape(-1) fi_fold_lr[:, fold] = np.abs(lrmodel.coef_.reshape(-1)) fi_rep_rf[:, rep] = fi_fold_rf.mean(1) stat_cut = modelevaluation.get_classifier_performance_stats( Y=Y, P=Ptest_rf, classifier_stats=classifier_stats, plot_curves=False, get_priority_cutoffs=True) stat_rep_rf[:, rep] = stat_cut.matrix[:, stat_cut.columnmeta[ 'p50_cutoff']].reshape(-1) fi_rep_lr[:, rep] = fi_fold_lr.mean(1) stat_cut = modelevaluation.get_classifier_performance_stats( Y=Y, P=Ptest_lr, classifier_stats=classifier_stats, plot_curves=False, get_priority_cutoffs=True) stat_rep_lr[:, rep] = stat_cut.matrix[:, stat_cut.columnmeta[ 'p50_cutoff']].reshape(-1) feature_model_rf.matrix[hit_rf, i] = fi_rep_rf.mean(1) feature_model_rf.columnmeta['num_features'][i] = gene_atb.shape[1] - i feature_model_rf.columnmeta['features'][i] = '|'.join( gene_atb.columnlabels[hit_rf].tolist()) stat_model_rf_mean.matrix[:, i] = 
stat_rep_rf.mean(1) stat_model_rf_mean.columnmeta['num_features'][ i] = gene_atb.shape[1] - i stat_model_rf_mean.columnmeta['features'][i] = '|'.join( gene_atb.columnlabels[hit_rf].tolist()) stat_model_rf_stdv.matrix[:, i] = stat_rep_rf.std(1) stat_model_rf_stdv.columnmeta['num_features'][ i] = gene_atb.shape[1] - i stat_model_rf_stdv.columnmeta['features'][i] = '|'.join( gene_atb.columnlabels[hit_rf].tolist()) feature_model_lr.matrix[hit_lr, i] = fi_rep_lr.mean(1) feature_model_lr.columnmeta['num_features'][i] = gene_atb.shape[1] - i feature_model_lr.columnmeta['features'][i] = '|'.join( gene_atb.columnlabels[hit_lr].tolist()) stat_model_lr_mean.matrix[:, i] = stat_rep_lr.mean(1) stat_model_lr_mean.columnmeta['num_features'][ i] = gene_atb.shape[1] - i stat_model_lr_mean.columnmeta['features'][i] = '|'.join( gene_atb.columnlabels[hit_lr].tolist()) stat_model_lr_stdv.matrix[:, i] = stat_rep_lr.std(1) stat_model_lr_stdv.columnmeta['num_features'][ i] = gene_atb.shape[1] - i stat_model_lr_stdv.columnmeta['features'][i] = '|'.join( gene_atb.columnlabels[hit_lr].tolist()) # concatenate data matrices with model feature importances print('concatenating data matrices with model feature importances...', flush=True) feature_model_rf.columnlabels += '_rf' feature_model_rf.columnmeta['model_type'] = np.full( feature_model_rf.shape[1], 'random_forest', dtype='object') feature_model_lr.columnlabels += '_lr' feature_model_lr.columnmeta['model_type'] = np.full( feature_model_lr.shape[1], 'logistic_regression', dtype='object') feature_model_rf.append(feature_model_lr, 1) feature_model = feature_model_rf del feature_model_rf, feature_model_lr # concatenate data matrices with model cross-validation performance stats print( 'concatenating data matrices with model cross-validation performance stats...', flush=True) stat_model_rf_mean.rowlabels += '_mean' stat_model_rf_stdv.rowlabels += '_stdv' stat_model_rf_mean.append(stat_model_rf_stdv, 0) stat_model_rf_mean.columnlabels += '_rf' stat_model_rf_mean.columnmeta['model_type'] = np.full( stat_model_rf_mean.shape[1], 'random_forest', dtype='object') stat_model_lr_mean.rowlabels += '_mean' stat_model_lr_stdv.rowlabels += '_stdv' stat_model_lr_mean.append(stat_model_lr_stdv, 0) stat_model_lr_mean.columnlabels += '_lr' stat_model_lr_mean.columnmeta['model_type'] = np.full( stat_model_lr_mean.shape[1], 'logistic_regression', dtype='object') stat_model_rf_mean.append(stat_model_lr_mean, 1) stat_model = stat_model_rf_mean del stat_model_rf_mean # select simplest model (fewest features) with auroc and auprc within 95% of max print( 'selecting simplest model (fewest features) with auroc and auprc within 95% of max...', flush=True) model_scores = 0.5 * (stat_model.select('auroc_mean', []) + stat_model.select('auprc_mean', [])) if include_logistic_regression: selected_model_index = np.where( model_scores >= 0.95 * model_scores.max())[0][-1] else: selected_model_index = np.where( np.logical_and( model_scores >= 0.95 * model_scores[stat_model.columnmeta['model_type'] == 'random_forest'].max(), stat_model.columnmeta['model_type'] == 'random_forest'))[0][-1] selected_model_name = stat_model.columnlabels[selected_model_index] selected_model_features = feature_model.rowlabels[ feature_model.matrix[:, selected_model_index] != 0] selected_model_type = stat_model.columnmeta['model_type'][ selected_model_index] selected_model = rfmodel if selected_model_type == 'random_forest' else lrmodel gene_atb = gene_atb.tolabels(columnlabels=selected_model_features) feature_model_selected = 
feature_model.tolabels( columnlabels=selected_model_name) stat_model_selected = stat_model.tolabels(columnlabels=selected_model_name) print(' selected_model_name: {0}'.format(selected_model_name), flush=True) print(' selected_model_features: {0}'.format( '|'.join(selected_model_features)), flush=True) # iterate over selected features to rebuild design matrix print('iterating over selected features to rebuild design matrix...', flush=True) for i, (selected_feature, dataset_abbreviation) in enumerate( zip(gene_atb.columnlabels, gene_atb.columnmeta['dataset_abbreviation'])): # load dataset print(' loading dataset {0}...'.format(dataset_abbreviation), flush=True) dataset_path = 'datasets/generalizable_features/rep{0!s}_fold{1!s}/{2}.txt.gz'.format( validation_rep, validation_fold, dataset_abbreviation) gene_atb_i = datasetIO.load_datamatrix(dataset_path) gene_atb_i.columnmeta[ 'generalizability_pvalues_corrected'] = gene_atb_i.columnmeta[ 'generalizability_pvalues_corrected'].astype('float64') gene_atb_i.columnmeta['dataset_abbreviation'] = np.full( gene_atb_i.shape[1], dataset_abbreviation, dtype='object') gene_atb_i.columnmeta[ 'dataset_feature'] = gene_atb_i.columnlabels.copy() gene_atb_i.columnlabels += '_' + dataset_abbreviation gene_atb_i.rowname = 'GeneSym' gene_atb_i.columnname = 'Feature' if dataset_abbreviation == 'gtextissue_cleaned': gene_atb_i.discard(gene_atb_i.rowlabels == 'C12ORF55', 0) # pesky duplicate row print(gene_atb_i) # select feature print(' selecting feature {0}...'.format(selected_feature), flush=True) gene_atb_i.discard(gene_atb_i.columnlabels != selected_feature, 1) # merge dataset print(' merging dataset...', flush=True) if i == 0: gene_atb_selected = copy.deepcopy(gene_atb_i) gene_atb_selected.matrixname = 'merged_target_features' print(' first dataset, no merge...', flush=True) else: common_genes = np.intersect1d(gene_atb_selected.rowlabels, gene_atb_i.rowlabels) gene_atb_selected = gene_atb_selected.tolabels( rowlabels=common_genes) gene_atb_i = gene_atb_i.tolabels(rowlabels=common_genes) gene_atb_selected.append(gene_atb_i, 1) print(' common_genes: {0!s}...'.format(common_genes.size), flush=True) # normalize features print('normalizing features...', flush=True) gene_atb_selected.columnmeta['min'] = gene_atb_selected.matrix.min(0) gene_atb_selected.columnmeta['max'] = gene_atb_selected.matrix.max(0) gene_atb_selected.matrix = ( gene_atb_selected.matrix - gene_atb_selected.columnmeta['min'].reshape( 1, -1)) / (gene_atb_selected.columnmeta['max'].reshape(1, -1) - gene_atb_selected.columnmeta['min'].reshape(1, -1)) # update metadata print('updating metadata...', flush=True) assert (gene_atb.columnlabels == gene_atb_selected.columnlabels).all() for field, values in gene_atb.columnmeta.items(): if field not in gene_atb_selected.columnmeta: gene_atb_selected.columnmeta[field] = values print('old_num_genes:{0!s}\tnew_num_genes:{1!s}'.format( gene_atb.shape[0], gene_atb_selected.shape[0]), flush=True) del gene_atb # refit selected model print('refitting selected model...', flush=True) isvalidation = np.in1d(gene_atb_selected.rowlabels, validation_examples) isunknown = gene_atb_selected.rowmeta['class'] == 'unknown' istraintest = ~np.logical_or(isvalidation, isunknown) selected_model.fit( gene_atb_selected.matrix[istraintest, :], gene_atb_selected.rowmeta['class'][istraintest] == 'positive') # get predictions for validation and unlabelled examples print('getting predictions for validation and unlabelled examples...', flush=True) gene_model_selected = 
dataclasses.datamatrix( rowname=gene_atb_selected.rowname, rowlabels=gene_atb_selected.rowlabels.copy(), rowmeta=copy.deepcopy(gene_atb_selected.rowmeta), columnname=stat_model_selected.columnname, columnlabels=stat_model_selected.columnlabels.copy(), columnmeta=copy.deepcopy(stat_model_selected.columnmeta), matrixname= 'success_probabilities_for_validation_and_unlabelled_examples', matrix=selected_model.predict_proba( gene_atb_selected.matrix)[:, selected_model.classes_ == 1]) gene_model_selected.discard(istraintest, 0) # save results print('saving {0!s} useful features and model results...'.format( gene_atb_selected.shape[1]), flush=True) dataset_info['path'] = '{0}/{1}.txt.gz'.format( results_folder, dataset_info['abbreviation']) dataset_info['selected_model_name'] = selected_model_name dataset_info['selected_model_features'] = '|'.join(selected_model_features) dataset_info['selected_model_type'] = selected_model_type dataset_info['crossvalidation_reps'] = reps dataset_info['crossvalidation_folds'] = folds dataset_info['rf_trees'] = rf_trees dataset_info['include_logistic_regression'] = include_logistic_regression for stat_name, stat_values in zip(stat_model_selected.rowlabels, stat_model_selected.matrix): dataset_info[stat_name] = stat_values.item() datasetIO.save_datamatrix(dataset_info['path'], gene_atb_selected) datasetIO.save_datamatrix('{0}/stat_model.txt.gz'.format(results_folder), stat_model) datasetIO.save_datamatrix( '{0}/feature_model.txt.gz'.format(results_folder), feature_model) datasetIO.save_datamatrix( '{0}/stat_model_selected.txt.gz'.format(results_folder), stat_model_selected) datasetIO.save_datamatrix( '{0}/feature_model_selected.txt.gz'.format(results_folder), feature_model_selected) datasetIO.save_datamatrix( '{0}/gene_model_selected.txt.gz'.format(results_folder), gene_model_selected) datasetIO.append_datasetinfo('{0}/dataset_info.txt'.format(results_folder), dataset_info) print('done.', flush=True)
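# NOTE: a small self-contained illustration of the model-selection rule used
# above. Models are ordered from most features (column 0) to fewest (last
# column), and np.where(...)[0][-1] keeps the last, i.e. simplest, model whose
# score is within 95% of the maximum. The scores below are made up for the
# example; they are not from any run.
import numpy as np

example_scores = np.array([0.90, 0.91, 0.92, 0.89, 0.88, 0.70])
selected_index = np.where(example_scores >= 0.95 * example_scores.max())[0][-1]
print(selected_index)  # 4: the fewest-feature model still within 95% of 0.92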
def main(adjustments_path): # read adjustments print('reading adjustments...', flush=True) designpath_selectedstep = {} with open(adjustments_path, mode='rt', encoding='utf-8', errors='surrogateescape') as fr: for line in fr: design_path, selected_step = [x.strip() for x in line.split('\t')] designpath_selectedstep[design_path] = int(selected_step) print('found {0!s} adjustments...'.format(len(designpath_selectedstep)), flush=True) # make adjustments print('making adjustments...', flush=True) for didx, (design_path, selected_step) in enumerate(designpath_selectedstep.items()): print('working on {0}...'.format(design_path), flush=True) print('selected step:{0!s}...'.format(selected_step), flush=True) # load design print('loading design...', flush=True) with open(design_path, mode='rt', encoding='utf-8', errors='surrogateescape') as fr: d = json.load(fr) if 'apply_activation_to_embedding' not in d: # for legacy code d['apply_activation_to_embedding'] = True if 'use_batchnorm' not in d: # for legacy code d['use_batchnorm'] = False if 'skip_layerwise_training' not in d: # for legacy code d['skip_layerwise_training'] = False phase = d['training_schedule'][-1] d['current_hidden_layer'] = phase['hidden_layer'] d['current_finetuning_run'] = phase['finetuning_run'] d['current_epochs'] = phase['epochs'] # load data if didx == 0: print('loading data...', flush=True) partitions = ['train', 'valid', 'test'] dataset = {} for partition in partitions: dataset[partition] = datasetIO.load_datamatrix( '{0}/{1}.pickle'.format(d['input_path'], partition)) if 'all' not in dataset: dataset['all'] = copy.deepcopy(dataset[partition]) else: dataset['all'].append(dataset[partition], 0) # get parameters for marginal distributions # will sample from marginal distributions to impute missing values # for binary features, model as bernoulli (columnmeta['likelihood'] == 'bernoulli') # for other features, model as gaussian marginalprobabilities = ( 1 + np.nansum(dataset['train'].matrix, 0, keepdims=True)) / ( 2 + np.sum( ~np.isnan(dataset['train'].matrix), 0, keepdims=True) ) # posterior mean of beta-bernoulli with prior a=b=1 marginalstdvs = np.nanstd(dataset['train'].matrix, 0, keepdims=True) isbernoullimarginal = (dataset['train'].columnmeta['likelihood'] == 'bernoulli').astype('float64').reshape( 1, -1) # finish configuration print('finishing configuration...', flush=True) # specify activation function if d['activation_function'] == 'tanh': activation_function = {'np': tsdae_apply_functions.tanh} elif d['activation_function'] == 'relu': activation_function = {'np': tsdae_apply_functions.relu} elif d['activation_function'] == 'elu': activation_function = {'np': tsdae_apply_functions.elu} elif d['activation_function'] == 'sigmoid': activation_function = {'np': tsdae_apply_functions.sigmoid} # initialize model architecture (number of layers and dimension of each layer) d['current_dimensions'] = d[ 'all_dimensions'][:d['current_hidden_layer'] + 1] # dimensions of model up to current depth # specify embedding function for current training phase # we want the option of skipping the embedding activation function to apply only to the full model if not d['apply_activation_to_embedding'] and d[ 'current_dimensions'] == d['all_dimensions']: d['current_apply_activation_to_embedding'] = False else: d['current_apply_activation_to_embedding'] = True print('current_apply_activation_to_embedding: {0!s}'.format( d['current_apply_activation_to_embedding']), flush=True) # specify rows and columns of figure showing data reconstructions 
d['reconstruction_rows'] = int( np.round(np.sqrt(np.min([100, dataset['valid'].shape[0]]) / 2))) d['reconstruction_cols'] = 2 * d['reconstruction_rows'] # move files print('moving files...', flush=True) if os.path.exists( '{0}/intermediate_variables_layer{1!s}_finetuning{2!s}_step{3!s}.pickle' .format(d['output_path'], d['current_hidden_layer'], d['current_finetuning_run'], selected_step)): if os.path.exists( '{0}/variables_layer{1!s}_finetuning{2!s}.pickle'.format( d['output_path'], d['current_hidden_layer'], d['current_finetuning_run'])): shutil.move( '{0}/variables_layer{1!s}_finetuning{2!s}.pickle'.format( d['output_path'], d['current_hidden_layer'], d['current_finetuning_run']), '{0}/variables_layer{1!s}_finetuning{2!s}_old.pickle'. format(d['output_path'], d['current_hidden_layer'], d['current_finetuning_run'])) shutil.copyfile( '{0}/intermediate_variables_layer{1!s}_finetuning{2!s}_step{3!s}.pickle' .format(d['output_path'], d['current_hidden_layer'], d['current_finetuning_run'], selected_step), '{0}/variables_layer{1!s}_finetuning{2!s}.pickle'.format( d['output_path'], d['current_hidden_layer'], d['current_finetuning_run'])) else: print('variables do not exist for selected step! skipping...', flush=True) continue if d['use_batchnorm']: if os.path.exists( '{0}/intermediate_batchnorm_variables_layer{1!s}_finetuning{2!s}_step{3!s}.pickle' .format(d['output_path'], d['current_hidden_layer'], d['current_finetuning_run'], selected_step)): if os.path.exists( '{0}/batchnorm_variables_layer{1!s}_finetuning{2!s}.pickle' .format(d['output_path'], d['current_hidden_layer'], d['current_finetuning_run'])): shutil.move( '{0}/batchnorm_variables_layer{1!s}_finetuning{2!s}.pickle' .format(d['output_path'], d['current_hidden_layer'], d['current_finetuning_run']), '{0}/batchnorm_variables_layer{1!s}_finetuning{2!s}_old.pickle' .format(d['output_path'], d['current_hidden_layer'], d['current_finetuning_run'])) shutil.copyfile( '{0}/intermediate_batchnorm_variables_layer{1!s}_finetuning{2!s}_step{3!s}.pickle' .format(d['output_path'], d['current_hidden_layer'], d['current_finetuning_run'], selected_step), '{0}/batchnorm_variables_layer{1!s}_finetuning{2!s}.pickle' .format(d['output_path'], d['current_hidden_layer'], d['current_finetuning_run'])) else: print( 'batchnorm variables do not exist for selected step! skipping...', flush=True) continue # load model variables print('loading model variables...', flush=True) with open( '{0}/variables_layer{1!s}_finetuning{2!s}.pickle'.format( d['output_path'], d['current_hidden_layer'], d['current_finetuning_run']), 'rb') as fr: W, Be, Bd = pickle.load(fr)[1:] # global_step, W, bencode, bdecode if d['use_batchnorm']: with open( '{0}/batchnorm_variables_layer{1!s}_finetuning{2!s}.pickle' .format(d['output_path'], d['current_hidden_layer'], d['current_finetuning_run']), 'rb') as fr: batchnorm_variables = pickle.load( fr) # gammas, betas, moving_means, moving_variances batchnorm_encode_variables, batchnorm_decode_variables = tsdae_apply_functions.align_batchnorm_variables( batchnorm_variables, d['current_apply_activation_to_embedding'], d['apply_activation_to_output']) # load reporting variables print('loading reporting variables...', flush=True) if os.path.exists( '{0}/optimization_path_layer{1!s}_finetuning{2!s}.pickle'. format(d['output_path'], d['current_hidden_layer'], d['current_finetuning_run'])): with open( '{0}/optimization_path_layer{1!s}_finetuning{2!s}.pickle'.
format(d['output_path'], d['current_hidden_layer'], d['current_finetuning_run']), 'rb') as fr: optimization_path = pickle.load(fr) reporting_steps = optimization_path['reporting_steps'] valid_losses = optimization_path['valid_losses'] train_losses = optimization_path['train_losses'] valid_noisy_losses = optimization_path['valid_noisy_losses'] train_noisy_losses = optimization_path['train_noisy_losses'] else: reporting_steps = np.zeros(0, dtype='int32') valid_losses = np.zeros(0, dtype='float32') train_losses = np.zeros(0, dtype='float32') valid_noisy_losses = np.zeros(0, dtype='float32') train_noisy_losses = np.zeros(0, dtype='float32') with open( '{0}/log_layer{1!s}_finetuning{2!s}.txt'.format( d['output_path'], d['current_hidden_layer'], d['current_finetuning_run']), 'rt') as fr: fr.readline() for line in fr: step, train_loss, valid_loss, train_noisy_loss, valid_noisy_loss, time = [ float(x.strip()) for x in line.split('\t') ] reporting_steps = np.insert(reporting_steps, reporting_steps.size, step) valid_losses = np.insert(valid_losses, valid_losses.size, valid_loss) train_losses = np.insert(train_losses, train_losses.size, train_loss) valid_noisy_losses = np.insert(valid_noisy_losses, valid_noisy_losses.size, valid_noisy_loss) train_noisy_losses = np.insert(train_noisy_losses, train_noisy_losses.size, train_noisy_loss) with open( '{0}/optimization_path_layer{1!s}_finetuning{2!s}.pickle'. format(d['output_path'], d['current_hidden_layer'], d['current_finetuning_run']), 'wb') as fw: pickle.dump( { 'reporting_steps': reporting_steps, 'valid_losses': valid_losses, 'train_losses': train_losses, 'valid_noisy_losses': valid_noisy_losses, 'train_noisy_losses': train_noisy_losses }, fw) # compute embedding and reconstruction print('computing embedding and reconstruction...', flush=True) recon = {} embed = {} error = {} embed_preactivation = {} for partition in ['all']: if np.isnan(dataset[partition].matrix).any(): print('datamatrix has missing values. 
random imputation...', flush=True) dp = copy.deepcopy(dataset[partition]) is_missing = np.isnan(dp.matrix) for i in range(5): print('impute iteration {0!s}'.format(i), flush=True) normal_noise = np.random.randn(dp.shape[0], dp.shape[1]) * marginalstdvs bernoulli_noise = (np.random.rand(dp.shape[0], dp.shape[1]) <= marginalprobabilities).astype('float64') noise = bernoulli_noise * isbernoullimarginal + normal_noise * ( 1 - isbernoullimarginal) dp.matrix[is_missing] = noise[is_missing] if i == 0: if d['use_batchnorm']: recon[partition], embed[partition], error[ partition] = tsdae_apply_functions.encode_and_decode( dp, W, Be, Bd, activation_function['np'], d['current_apply_activation_to_embedding'], d['apply_activation_to_output'], dataset['train'].columnmeta['likelihood'] == 'bernoulli', return_embedding=True, return_reconstruction_error=True, bn_encode_variables= batchnorm_encode_variables, bn_decode_variables= batchnorm_decode_variables) if d['current_apply_activation_to_embedding']: embed_preactivation[ partition] = tsdae_apply_functions.encode( dp, W, Be, activation_function['np'], apply_activation_to_embedding=False, bn_variables=batchnorm_encode_variables ) else: recon[partition], embed[partition], error[ partition] = tsdae_apply_functions.encode_and_decode( dp, W, Be, Bd, activation_function['np'], d['current_apply_activation_to_embedding'], d['apply_activation_to_output'], dataset['train'].columnmeta['likelihood'] == 'bernoulli', return_embedding=True, return_reconstruction_error=True) if d['current_apply_activation_to_embedding']: embed_preactivation[ partition] = tsdae_apply_functions.encode( dp, W, Be, activation_function['np'], apply_activation_to_embedding=False) else: if d['use_batchnorm']: reconi, embedi, errori = tsdae_apply_functions.encode_and_decode( dp, W, Be, Bd, activation_function['np'], d['current_apply_activation_to_embedding'], d['apply_activation_to_output'], dataset['train'].columnmeta['likelihood'] == 'bernoulli', return_embedding=True, return_reconstruction_error=True, bn_encode_variables=batchnorm_encode_variables, bn_decode_variables=batchnorm_decode_variables) if d['current_apply_activation_to_embedding']: embed_preactivationi = tsdae_apply_functions.encode( dp, W, Be, activation_function['np'], apply_activation_to_embedding=False, bn_variables=batchnorm_encode_variables) else: reconi, embedi, errori = tsdae_apply_functions.encode_and_decode( dp, W, Be, Bd, activation_function['np'], d['current_apply_activation_to_embedding'], d['apply_activation_to_output'], dataset['train'].columnmeta['likelihood'] == 'bernoulli', return_embedding=True, return_reconstruction_error=True) if d['current_apply_activation_to_embedding']: embed_preactivationi = tsdae_apply_functions.encode( dp, W, Be, activation_function['np'], apply_activation_to_embedding=False) recon[partition].matrix += reconi.matrix embed[partition].matrix += embedi.matrix error[partition] += errori if d['current_apply_activation_to_embedding']: embed_preactivation[ partition].matrix += embed_preactivationi.matrix recon[partition].matrix /= 5 embed[partition].matrix /= 5 error[partition] /= 5 if d['current_apply_activation_to_embedding']: embed_preactivation[partition].matrix /= 5 else: if d['use_batchnorm']: recon[partition], embed[partition], error[ partition] = tsdae_apply_functions.encode_and_decode( dataset[partition], W, Be, Bd, activation_function['np'], d['current_apply_activation_to_embedding'], d['apply_activation_to_output'], dataset['train'].columnmeta['likelihood'] == 'bernoulli', 
return_embedding=True, return_reconstruction_error=True, bn_encode_variables=batchnorm_encode_variables, bn_decode_variables=batchnorm_decode_variables) if d['current_apply_activation_to_embedding']: embed_preactivation[ partition] = tsdae_apply_functions.encode( dataset[partition], W, Be, activation_function['np'], apply_activation_to_embedding=False, bn_variables=batchnorm_encode_variables) else: recon[partition], embed[partition], error[ partition] = tsdae_apply_functions.encode_and_decode( dataset[partition], W, Be, Bd, activation_function['np'], d['current_apply_activation_to_embedding'], d['apply_activation_to_output'], dataset['train'].columnmeta['likelihood'] == 'bernoulli', return_embedding=True, return_reconstruction_error=True) if d['current_apply_activation_to_embedding']: embed_preactivation[ partition] = tsdae_apply_functions.encode( dataset[partition], W, Be, activation_function['np'], apply_activation_to_embedding=False) print('{0} reconstruction error: {1:1.3g}'.format( partition, error[partition]), flush=True) for partition in partitions: recon[partition] = recon['all'].tolabels( rowlabels=dataset[partition].rowlabels.copy()) embed[partition] = embed['all'].tolabels( rowlabels=dataset[partition].rowlabels.copy()) if d['current_apply_activation_to_embedding']: embed_preactivation[partition] = embed_preactivation[ 'all'].tolabels( rowlabels=dataset[partition].rowlabels.copy()) datasetIO.save_datamatrix( '{0}/{1}_embedding_layer{2!s}_finetuning{3!s}.pickle'.format( d['output_path'], partition, d['current_hidden_layer'], d['current_finetuning_run']), embed[partition]) datasetIO.save_datamatrix( '{0}/{1}_embedding_layer{2!s}_finetuning{3!s}.txt.gz'.format( d['output_path'], partition, d['current_hidden_layer'], d['current_finetuning_run']), embed[partition]) if d['current_apply_activation_to_embedding']: datasetIO.save_datamatrix( '{0}/{1}_embedding_preactivation_layer{2!s}_finetuning{3!s}.pickle' .format(d['output_path'], partition, d['current_hidden_layer'], d['current_finetuning_run']), embed_preactivation[partition]) datasetIO.save_datamatrix( '{0}/{1}_embedding_preactivation_layer{2!s}_finetuning{3!s}.txt.gz' .format(d['output_path'], partition, d['current_hidden_layer'], d['current_finetuning_run']), embed_preactivation[partition]) # plot loss print('plotting loss...', flush=True) fg, ax = plt.subplots(1, 1, figsize=(3.25, 2.25)) ax.set_position([0.55 / 3.25, 0.45 / 2.25, 2.6 / 3.25, 1.7 / 2.25]) ax.semilogx(reporting_steps, train_losses, ':r', linewidth=1, label='train') ax.semilogx(reporting_steps, valid_losses, '-g', linewidth=1, label='valid') ax.semilogx(reporting_steps, train_noisy_losses, '--b', linewidth=1, label='train,noisy') ax.semilogx(reporting_steps, valid_noisy_losses, '-.k', linewidth=1, label='valid,noisy') ax.legend(loc='best', fontsize=8) ax.set_ylabel('loss', fontsize=8) ax.set_xlabel('steps (selected step:{0!s})'.format(selected_step), fontsize=8) ax.set_xlim(reporting_steps[0] - 1, reporting_steps[-1] + 1) ax.set_ylim(0, 10) ax.tick_params(axis='both', which='major', left=True, right=True, bottom=True, top=False, labelleft=True, labelright=False, labelbottom=True, labeltop=False, labelsize=8) fg.savefig( '{0}/optimization_path_layer{1!s}_finetuning{2!s}.png'.format( d['output_path'], d['current_hidden_layer'], d['current_finetuning_run']), transparent=True, pad_inches=0, dpi=600) plt.close() # plot reconstructions print('plotting reconstructions...', flush=True) num_recons = min([ d['reconstruction_rows'] * d['reconstruction_cols'], 
dataset['valid'].shape[0] ]) x_valid = dataset[ 'valid'].matrix[:num_recons, dataset['train']. columnmeta['likelihood'] != 'bernoulli'] xr_valid = recon[ 'valid'].matrix[:num_recons, dataset['train']. columnmeta['likelihood'] != 'bernoulli'] if x_valid.shape[1] > 1000: x_valid = x_valid[:, :1000] xr_valid = xr_valid[:, :1000] lb = np.nanmin(np.append(x_valid, xr_valid, 1), 1) ub = np.nanmax(np.append(x_valid, xr_valid, 1), 1) fg, axs = plt.subplots(2 * d['reconstruction_rows'], d['reconstruction_cols'], figsize=(6.5, 6.5)) for i, ax in enumerate( axs.reshape(-1)[:d['reconstruction_rows'] * d['reconstruction_cols']]): hit = np.logical_and(np.isfinite(x_valid[i, :]), np.isfinite(xr_valid[i, :])) if i < num_recons and hit.any(): ax.plot(x_valid[i, hit], xr_valid[i, hit], 'ok', markersize=0.5, markeredgewidth=0, alpha=0.1) ax.set_ylim(lb[i], ub[i]) ax.set_xlim(lb[i], ub[i]) ax.tick_params(axis='both', which='major', left=False, right=False, bottom=False, top=False, labelleft=False, labelright=False, labelbottom=False, labeltop=False, pad=4) ax.set_frame_on(False) ax.axvline(lb[i], linewidth=1, color='k') ax.axvline(ub[i], linewidth=1, color='k') ax.axhline(lb[i], linewidth=1, color='k') ax.axhline(ub[i], linewidth=1, color='k') else: fg.delaxes(ax) x_valid = dataset['valid'].matrix[:num_recons, dataset['train']. columnmeta['likelihood'] == 'bernoulli'] xr_valid = recon['valid'].matrix[:num_recons, dataset['train']. columnmeta['likelihood'] == 'bernoulli'] if x_valid.shape[1] > 1000: x_valid = x_valid[:, :1000] xr_valid = xr_valid[:, :1000] lb = -0.1 ub = 1.1 for i, ax in enumerate( axs.reshape(-1)[d['reconstruction_rows'] * d['reconstruction_cols']:]): hit = np.logical_and(np.isfinite(x_valid[i, :]), np.isfinite(xr_valid[i, :])) if i < num_recons and hit.any(): ax.boxplot([ xr_valid[i, x_valid[i, :] == 0], xr_valid[i, x_valid[i, :] == 1] ], positions=[0.2, 0.8], flierprops={ 'markersize': 0.5, 'markeredgewidth': 0, 'alpha': 0.1 }, boxprops={'linewidth': 0.5}, whiskerprops={'linewidth': 0.5}, medianprops={'linewidth': 0.5}) ax.set_ylim(lb, ub) ax.set_xlim(lb, ub) ax.tick_params(axis='both', which='major', left=False, right=False, bottom=False, top=False, labelleft=False, labelright=False, labelbottom=False, labeltop=False, pad=4) ax.set_frame_on(False) ax.axvline(lb, linewidth=1, color='k') ax.axvline(ub, linewidth=1, color='k') ax.axhline(lb, linewidth=1, color='k') ax.axhline(ub, linewidth=1, color='k') else: fg.delaxes(ax) fg.savefig('{0}/reconstructions_layer{1!s}_finetuning{2!s}.png'.format( d['output_path'], d['current_hidden_layer'], d['current_finetuning_run']), transparent=True, pad_inches=0, dpi=1200) plt.close() # plot 2d embedding if d['current_dimensions'][-1] == 2 and ( not d['use_finetuning'] or d['current_finetuning_run'] > 0): print('plotting 2d embedding...', flush=True) fg, ax = plt.subplots(1, 1, figsize=(6.5, 6.5)) ax.set_position([0.15 / 6.5, 0.15 / 6.5, 6.2 / 6.5, 6.2 / 6.5]) ax.plot(embed['train'].matrix[:, 0], embed['train'].matrix[:, 1], 'ok', markersize=2, markeredgewidth=0, alpha=0.5, zorder=0) ax.plot(embed['valid'].matrix[:, 0], embed['valid'].matrix[:, 1], 'or', markersize=2, markeredgewidth=0, alpha=1.0, zorder=1) ax.tick_params(axis='both', which='major', bottom=False, top=False, labelbottom=False, labeltop=False, left=False, right=False, labelleft=False, labelright=False, pad=4) ax.set_frame_on(False) fg.savefig('{0}/embedding_layer{1!s}_finetuning{2!s}.png'.format( d['output_path'], d['current_hidden_layer'], d['current_finetuning_run']), transparent=True, 
pad_inches=0, dpi=600) plt.close() if d['current_apply_activation_to_embedding']: fg, ax = plt.subplots(1, 1, figsize=(6.5, 6.5)) ax.set_position([0.15 / 6.5, 0.15 / 6.5, 6.2 / 6.5, 6.2 / 6.5]) ax.plot(embed_preactivation['train'].matrix[:, 0], embed_preactivation['train'].matrix[:, 1], 'ok', markersize=2, markeredgewidth=0, alpha=0.5, zorder=0) ax.plot(embed_preactivation['valid'].matrix[:, 0], embed_preactivation['valid'].matrix[:, 1], 'or', markersize=2, markeredgewidth=0, alpha=1.0, zorder=1) ax.tick_params(axis='both', which='major', bottom=False, top=False, labelbottom=False, labeltop=False, left=False, right=False, labelleft=False, labelright=False, pad=4) ax.set_frame_on(False) fg.savefig( '{0}/embedding_preactivation_layer{1!s}_finetuning{2!s}.png' .format(d['output_path'], d['current_hidden_layer'], d['current_finetuning_run']), transparent=True, pad_inches=0, dpi=600) plt.close() # plot heatmap else: print('plotting embedding heatmap...', flush=True) embed['valid'].cluster('all', 'cosine', 'average') embed['valid'].heatmap( rowmetalabels=[], columnmetalabels=[], normalize=False, standardize=False, normalizebeforestandardize=True, cmap_name='bwr', ub=None, lb=None, savefilename= '{0}/embedding_heatmap_layer{1!s}_finetuning{2!s}.png'.format( d['output_path'], d['current_hidden_layer'], d['current_finetuning_run']), closefigure=True, dpi=300) if d['current_apply_activation_to_embedding']: embed_preactivation['valid'].cluster('all', 'cosine', 'average') embed_preactivation['valid'].heatmap( rowmetalabels=[], columnmetalabels=[], normalize=False, standardize=False, normalizebeforestandardize=True, cmap_name='bwr', ub=None, lb=None, savefilename= '{0}/embedding_preactivation_heatmap_layer{1!s}_finetuning{2!s}.png' .format(d['output_path'], d['current_hidden_layer'], d['current_finetuning_run']), closefigure=True, dpi=300) # log selected step with open('{0}/log.txt'.format(d['output_path']), mode='at', buffering=1) as fl: fl.write('\nadjusted selected step:{0}\n'.format(selected_step)) print('done adjust_early_stopping.', flush=True)
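# NOTE: a minimal sketch of the random-imputation step used above, pulled out
# as a standalone function for clarity. As in the loop above, binary columns
# are filled with bernoulli draws at the beta-bernoulli posterior mean rate
# (prior a=b=1) and the remaining columns with zero-mean gaussian noise scaled
# by the training-column stdv; impute_from_marginals is an illustrative name,
# not part of the pipeline. The loop above repeats this draw five times and
# averages the resulting embeddings and reconstructions.
import numpy as np

def impute_from_marginals(X_train, X, isbernoulli):
    """Fill NaNs in X with draws from per-column marginals fit on X_train."""
    p = (1 + np.nansum(X_train, 0, keepdims=True)) \
        / (2 + np.sum(~np.isnan(X_train), 0, keepdims=True))
    sd = np.nanstd(X_train, 0, keepdims=True)
    is_missing = np.isnan(X)
    bernoulli_draw = (np.random.rand(*X.shape) <= p).astype('float64')
    gaussian_draw = np.random.randn(*X.shape) * sd
    draw = np.where(isbernoulli.reshape(1, -1), bernoulli_draw, gaussian_draw)
    X_imputed = X.copy()
    X_imputed[is_missing] = draw[is_missing]
    return X_imputed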
def main(search_config_path, job_config_path, model_config_path, search_id='max'): # load search configuration print('loading search configuration...', flush=False) print('search_config_path: {0}'.format(search_config_path), flush=False) sc = load_config(search_config_path) # parse some search configurations if type(sc['bopt_config']['acquisition_type']) == str: acquisition_types = [sc['bopt_config']['acquisition_type']] else: acquisition_types = sc['bopt_config']['acquisition_type'] del sc['bopt_config']['acquisition_type'] if type(sc['bopt_config']['model_type']) == str: model_types = [sc['bopt_config']['model_type']] else: model_types = sc['bopt_config']['model_type'] del sc['bopt_config']['model_type'] if type(sc['bopt_config']['model_estimation_method']) == str: model_estimation_methods = [sc['bopt_config']['model_estimation_method'].lower()] else: model_estimation_methods = [mem.lower() for mem in sc['bopt_config']['model_estimation_method']] del sc['bopt_config']['model_estimation_method'] if type(sc['bopt_config']['kernel']) == str: kernels = [sc['bopt_config']['kernel']] else: kernels = sc['bopt_config']['kernel'] del sc['bopt_config']['kernel'] if sc['y_transformation'].lower() == 'neglog': transform_y = lambda y: -np.log10(y) elif sc['y_transformation'].lower() == 'log': transform_y = lambda y: np.log10(y) elif sc['y_transformation'].lower() == 'neg': transform_y = lambda y: -y else: transform_y = lambda y: y # load job configuration print('loading job configuration...', flush=False) print('job_config_path: {0}'.format(job_config_path), flush=False) jc = load_config(job_config_path) # load job runner print('loading job runner...', flush=False) jc['script_folder'], jc['script_file'] = jc['script_path'].rsplit('/', maxsplit=1) sys.path.append(jc['script_folder']) job_runner = __import__(jc['script_file'].replace('.py','')) # load model configuration print('loading model configuration...', flush=False) print('model_config_path: {0}'.format(model_config_path), flush=False) mc = load_config(model_config_path) # load datamatrix print('loading datamatrix...', flush=False) print('datamatrix_path: {0}'.format(mc['datamatrix_path']), flush=False) datamatrix = datasetIO.load_datamatrix(mc['datamatrix_path']) # set folders print('setting folders...', flush=False) project_folder = '/'.join(job_config_path.split('/')[:-1]) print('project_folder: {0}'.format(project_folder), flush=False) if search_id == 'max': search_folders = [x for x in os.listdir(project_folder) if x[:10] == 'hp_search_'] search_ids = [int(x.split('_')[-1]) for x in search_folders] search_id = max(search_ids) else: search_id = int(search_id) search_folder = '{0}/hp_search_{1!s}'.format(project_folder, search_id) if not os.path.exists(search_folder): os.makedirs(search_folder) print('current search_folder: {0}'.format(search_folder), flush=False) print('current search_id: {0!s}'.format(search_id), flush=False) # create base model_config and collect hyperparameter search specs print('creating base model_config and collecting hyperparameter search specs...', flush=False) hyperparameter_specs = {} hyperparameters = [] domains = [] base_model_config = {'search_domain':{}, 'model_space':{}} for parameter, specs in mc.items(): if type(specs) == dict: if specs['type'] == 'categorical': model_space_values = specs['domain'].copy() specs['domain'] = list(range(len(model_space_values))) specs['grid'] = specs['domain'].copy() specs['search_domain_to_model_space'] = lambda search_domain_value, sdvs=specs['domain'].copy(), 
msvs=model_space_values.copy(): {sdv:msv for sdv, msv in zip(sdvs, msvs)}[search_domain_value] specs['model_space_to_search_domain'] = lambda model_space_value, sdvs=specs['domain'].copy(), msvs=model_space_values.copy(): {msv:sdv for sdv, msv in zip(sdvs, msvs)}[model_space_value] base_model_config['search_domain'][parameter] = specs['domain'][0] elif specs['type'] == 'discrete' or specs['type'] == 'continuous': model_space_values = sorted(specs['domain']) model_space_min = model_space_values[0] model_space_max = model_space_values[-1] model_space_mid = (model_space_min + model_space_max)/2 if specs['transformation'] == 'linear': specs['search_domain_to_model_space'] = lambda search_domain_value, msmax=model_space_max, msmin=model_space_min: (msmax - msmin)*search_domain_value + msmin specs['model_space_to_search_domain'] = lambda model_space_value, msmax=model_space_max, msmin=model_space_min: (model_space_value - msmin)/(msmax - msmin) elif specs['transformation'] == 'log': specs['search_domain_to_model_space'] = lambda search_domain_value, msmax=model_space_max, msmin=model_space_min: msmin*(msmax/msmin)**search_domain_value specs['model_space_to_search_domain'] = lambda model_space_value, msmax=model_space_max, msmin=model_space_min: log(model_space_value/msmin, msmax/msmin) else: raise ValueError('invalid hyperparameter transformation. must be linear or log.') specs['domain'] = [specs['model_space_to_search_domain'](msv) for msv in model_space_values] if specs['type'] == 'discrete': if sc['num_grid_points_per_hyperparameter'] < 2: specs['grid'] = [specs['model_space_to_search_domain'](model_space_values[np.argmin([(msv - model_space_mid)**2 for msv in model_space_values])])] elif len(specs['domain']) <= sc['num_grid_points_per_hyperparameter']: specs['grid'] = specs['domain'].copy() else: interval = round(len(specs['domain'])/sc['num_grid_points_per_hyperparameter']) specs['grid'] = specs['domain'][::interval] if len(specs['grid']) < sc['num_grid_points_per_hyperparameter']: specs['grid'].append(specs['domain'][-1]) elif len(specs['grid']) == sc['num_grid_points_per_hyperparameter']: specs['grid'][-1] = specs['domain'][-1] else: while len(specs['grid']) > sc['num_grid_points_per_hyperparameter']: del(specs['grid'][-2]) else: if sc['num_grid_points_per_hyperparameter'] < 2: specs['grid'] = [specs['model_space_to_search_domain'](model_space_mid)] else: specs['grid'] = np.linspace(specs['domain'][0], specs['domain'][-1], sc['num_grid_points_per_hyperparameter']) if len(specs['grid']) < 3: base_model_config['search_domain'][parameter] = specs['grid'][0] else: base_model_config['search_domain'][parameter] = specs['grid'][round(len(specs['grid'])/2)-1] else: raise ValueError('invalid hyperparameter type. 
must be categorical, discrete, or continuous.') base_model_config['model_space'][parameter] = specs['search_domain_to_model_space'](base_model_config['search_domain'][parameter]) hyperparameter_specs[parameter] = copy.deepcopy(specs) hyperparameters.append(parameter) domains.append({'name':parameter, 'type':specs['type'], 'domain':specs['domain'].copy()}) else: base_model_config['model_space'][parameter] = specs base_model_config['search_domain'][parameter] = specs # create hyperparameter variations X = {'search_domain':np.zeros((0, len(hyperparameters)), dtype='float64'), 'model_space':np.zeros((0, len(hyperparameters)), dtype='object')} Y = np.zeros(0, dtype='float64') is_queued = np.ones(0, dtype='bool') is_completed = np.zeros(0, dtype='bool') is_success = np.zeros(0, dtype='bool') search_types = np.zeros(0, dtype='object') combination_ids = np.zeros(0, dtype='int64') job_ids = np.zeros(0, dtype='object') job_run_times = np.zeros(0, dtype='float64') job_start_times = np.zeros(0, dtype='float64') num_completed_combinations_at_job_start = np.zeros(0, dtype='int64') model_config_paths = [] model_configs = [] # create initial hyperparameter combinations print('creating initial hyperparameter combinations...', flush=False) hyperparameter_combinations, hyperparameter_search_types = create_initial_hyperparameter_combinations(sc['search_type'], sc['grid_suggestion_probability'], base_model_config, hyperparameters, domains, hyperparameter_specs) # iterate over hyperparameter combinations for combination_id, (combination_values, search_type) in enumerate(zip(hyperparameter_combinations, hyperparameter_search_types)): print('working on hyperparameter combination {0!s}...'.format(combination_id), flush=False) # create model_config print('creating model_config...', flush=False) model_config = create_model_config(combination_values, hyperparameters, hyperparameter_specs, base_model_config) # create save_folder and save model_config print('creating save_folder and saving model_config...', flush=False) model_config, model_config_path = save_model_config(model_config, combination_id, search_folder) # update search data arrays print('updating search data arrays...', flush=False) X, Y, is_queued, is_completed, is_success, search_types, combination_ids, job_ids, job_run_times, job_start_times, num_completed_combinations_at_job_start, model_config_paths, model_configs = append_search_data_arrays(model_config, model_config_path, search_type, combination_id, hyperparameters, X, Y, is_queued, is_completed, is_success, search_types, combination_ids, job_ids, job_run_times, job_start_times, num_completed_combinations_at_job_start, model_config_paths, model_configs) # save initial search data print('saving initial search data...', flush=False) search_data_path = '{0}/hp_search_data.pickle'.format(search_folder) save_search_data(search_data_path, X, Y, is_queued, is_completed, is_success, search_types, combination_ids, job_ids, job_run_times, job_start_times, num_completed_combinations_at_job_start, model_config_paths, model_configs, base_model_config, hyperparameter_specs, hyperparameters, domains) # run search print('running search...', flush=False) start_time = time.time() elapsed_time = (time.time() - start_time)/3600.0 num_suggestions = 0 num_iterations = 0 num_completed_combinations = 0 num_successful_combinations = 0 prev_num_successful_combinations = 0 max_consecutive_bopt_failures = 5 num_consecutive_bopt_failures = 0 z_headers = ['iterations', 'suggestions', 'time', 'completions', 'minimum', 'index', 
'job_run_time', 'search_type'] + hyperparameters Z = [] while (elapsed_time < sc['search_time'] and num_suggestions < sc['max_suggestions'] and num_consecutive_bopt_failures < max_consecutive_bopt_failures and (sc['search_type'] == 'bopt' or sc['search_type'] == 'random')) or is_queued.any(): num_active_combinations = count_active_jobs(job_ids) print('ACTIVE COMBINATIONS: {0!s}'.format(num_active_combinations), flush=False) Y, is_completed, is_success, job_run_times = update_y(Y, transform_y, sc['y_failure_value'], is_queued, is_completed, is_success, job_ids, job_run_times, model_configs) num_completed_combinations = is_completed.sum() print('COMPLETED COMBINATIONS: {0!s}'.format(num_completed_combinations), flush=False) num_successful_combinations = is_success.sum() print('SUCCESSFUL COMBINATIONS: {0!s}'.format(num_successful_combinations), flush=False) if num_successful_combinations > prev_num_successful_combinations: prev_num_successful_combinations = num_successful_combinations i_min = np.nanargmin(Y) print('MINIMUM LOSS: {0!s}'.format(Y[i_min]), flush=False) Z.append([num_iterations, num_suggestions, elapsed_time, num_completed_combinations, Y[i_min], i_min, job_run_times[i_min], search_types[i_min]] + X['model_space'][i_min,:].tolist()) print({k:v for k,v in zip(z_headers, Z[-1])}, flush=False) suggest_grid_point = (np.random.rand() < sc['grid_suggestion_probability']) or (sc['grid_suggestion_probability'] == 1) or sc['search_type'] == 'line' or sc['search_type'] == 'grid' if num_active_combinations >= sc['max_active_points']: print('REACHED MAX ACTIVE COMBINATIONS: {0!s}. WAITING...'.format(sc['max_active_points']), flush=False) time.sleep(59) elif is_queued.any() and (suggest_grid_point or elapsed_time > sc['search_time'] or num_suggestions > sc['max_suggestions']): combination_id = is_queued.nonzero()[0][0] # print('SUBMITTING QUEUED GRID OR LINE COMBINATION {0!s}...'.format(combination_id), flush=False) # job_id = submit_job(jc, model_config_paths[combination_id]) print('RUNNING QUEUED GRID OR LINE COMBINATION {0!s}...'.format(combination_id), flush=False) job_id = run_job(job_runner, model_config_paths[combination_id], copy.deepcopy(datamatrix), combination_id) is_queued, job_ids, job_run_times, job_start_times, num_completed_combinations_at_job_start = update_job_info(job_id, combination_id, num_completed_combinations, is_queued, job_ids, job_run_times, job_start_times, num_completed_combinations_at_job_start) num_suggestions += 1 else: suggest_random_point = (np.random.rand() < sc['random_suggestion_probability']) or (sc['random_suggestion_probability'] == 1) or sc['search_type'] == 'random' if suggest_random_point or num_successful_combinations < sc['min_initial_points']: print('CREATING NEW RANDOM COMBINATION...', flush=False) hyperparameter_combinations, hyperparameter_search_types = create_random_search_combinations(1, domains) else: print('CREATING NEW BOPT COMBINATION...', flush=False) try: hyperparameter_combinations, hyperparameter_search_types = create_bopt_search_combinations(X, Y, domains, sc['bopt_config'], acquisition_types, model_types, model_estimation_methods, kernels) num_consecutive_bopt_failures = 0 except: bopt_error = sys.exc_info() num_consecutive_bopt_failures += 1 print('BOPT ERROR: {0} {1}'.format(bopt_error[0], bopt_error[1]), flush=False) print('num_consecutive_bopt_failures: {0!s}'.format(num_consecutive_bopt_failures), flush=False) hyperparameter_combinations = np.zeros((0, len(hyperparameters)), dtype='float64') hyperparameter_search_types = 
np.zeros(0, dtype='object') # iterate over hyperparameter combinations for combination_values, search_type in zip(hyperparameter_combinations, hyperparameter_search_types): combination_id = combination_ids.size print('working on hyperparameter combination {0!s}...'.format(combination_id), flush=False) # create model_config print('creating model_config...', flush=False) model_config = create_model_config(combination_values, hyperparameters, hyperparameter_specs, base_model_config) # create save_folder and save model_config print('creating save_folder and saving model_config...', flush=False) model_config, model_config_path = save_model_config(model_config, combination_id, search_folder) # update search data arrays print('updating search data arrays...', flush=False) X, Y, is_queued, is_completed, is_success, search_types, combination_ids, job_ids, job_run_times, job_start_times, num_completed_combinations_at_job_start, model_config_paths, model_configs = append_search_data_arrays(model_config, model_config_path, search_type, combination_id, hyperparameters, X, Y, is_queued, is_completed, is_success, search_types, combination_ids, job_ids, job_run_times, job_start_times, num_completed_combinations_at_job_start, model_config_paths, model_configs) # print('SUBMITTING NEW COMBINATION {0!s}...'.format(combination_id), flush=False) # job_id = submit_job(jc, model_config_paths[combination_id]) print('RUNNING NEW COMBINATION {0!s}...'.format(combination_id), flush=False) job_id = run_job(job_runner, model_config_paths[combination_id], copy.deepcopy(datamatrix), combination_id) is_queued, job_ids, job_run_times, job_start_times, num_completed_combinations_at_job_start = update_job_info(job_id, combination_id, num_completed_combinations, is_queued, job_ids, job_run_times, job_start_times, num_completed_combinations_at_job_start) num_suggestions += 1 # save search data print('saving search data...', flush=False) save_search_data(search_data_path, X, Y, is_queued, is_completed, is_success, search_types, combination_ids, job_ids, job_run_times, job_start_times, num_completed_combinations_at_job_start, model_config_paths, model_configs, base_model_config, hyperparameter_specs, hyperparameters, domains) time.sleep(1) elapsed_time = (time.time() - start_time)/3600.0 num_iterations += 1 # wait for jobs to finish num_active_combinations = count_active_jobs(job_ids) while num_active_combinations > 0 and elapsed_time < (sc['search_time'] + 1 - 600/3600): print('ACTIVE COMBINATIONS: {0!s}. WAITING FOR JOBS TO FINISH...'.format(num_active_combinations), flush=False) time.sleep(300) num_active_combinations = count_active_jobs(job_ids) elapsed_time = (time.time() - start_time)/3600.0 # final search data collection print('collecting final results...', flush=False) Y, is_completed, is_success, job_run_times = update_y(Y, transform_y, sc['y_failure_value'], is_queued, is_completed, is_success, job_ids, job_run_times, model_configs) # save search data print('saving search data...', flush=False) save_search_data(search_data_path, X, Y, is_queued, is_completed, is_success, search_types, combination_ids, job_ids, job_run_times, job_start_times, num_completed_combinations_at_job_start, model_config_paths, model_configs, base_model_config, hyperparameter_specs, hyperparameters, domains) print('done search_hp.py', flush=False)
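# NOTE: a compact illustration of the search-domain <-> model-space mapping
# used above for 'log'-transformed hyperparameters: the optimizer works on
# [0, 1] and values map geometrically onto [msmin, msmax]. The learning-rate
# range below is an example only, not a value from any config file.
from math import log

msmin, msmax = 1e-5, 1e-1
search_domain_to_model_space = lambda s: msmin * (msmax / msmin) ** s
model_space_to_search_domain = lambda m: log(m / msmin, msmax / msmin)
print(search_domain_to_model_space(0.5))   # ~1e-3, the geometric midpoint
print(model_space_to_search_domain(1e-3))  # ~0.5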
def main(adjustments_path): # read adjustments print('reading adjustments...', flush=True) designpath_selectedstep = {} with open(adjustments_path, mode='rt', encoding='utf-8', errors='surrogateescape') as fr: for line in fr: design_path, selected_step = [x.strip() for x in line.split('\t')] designpath_selectedstep[design_path] = int(selected_step) print('found {0!s} adjustments...'.format(len(designpath_selectedstep)), flush=True) # make adjustments print('making adjustments...', flush=True) for didx, (design_path, selected_step) in enumerate(designpath_selectedstep.items()): print('working on {0}...'.format(design_path), flush=True) print('selected step:{0!s}...'.format(selected_step), flush=True) # load design print('loading design...', flush=True) with open(design_path, mode='rt', encoding='utf-8', errors='surrogateescape') as fr: d = json.load(fr) if 'apply_activation_to_embedding' not in d: # for legacy code d['apply_activation_to_embedding'] = True if 'use_batchnorm' not in d: # for legacy code d['use_batchnorm'] = False if 'skip_layerwise_training' not in d: # for legacy code d['skip_layerwise_training'] = False phase = d['training_schedule'][-1] d['current_hidden_layer'] = phase['hidden_layer'] d['current_finetuning_run'] = phase['finetuning_run'] d['current_epochs'] = phase['epochs'] # load data if didx == 0: print('loading data...', flush=True) partitions = ['train', 'valid', 'test'] dataset = {} for partition in partitions: if partition == 'train': dataset[partition] = datasetIO.load_datamatrix('{0}/{1}.pickle'.format(d['input_path'], 'valid')) dataset[partition].append(datasetIO.load_datamatrix('{0}/{1}.pickle'.format(d['input_path'], 'test')), 0) elif partition == 'valid': dataset[partition] = datasetIO.load_datamatrix('{0}/{1}.pickle'.format(d['input_path'], 'train')) else: dataset[partition] = datasetIO.load_datamatrix('{0}/{1}.pickle'.format(d['input_path'], partition)) # finish configuration print('finishing configuration...', flush=True) # specify activation function if d['activation_function'] == 'tanh': activation_function = {'np':sdae_apply_functions.tanh} elif d['activation_function'] == 'relu': activation_function = {'np':sdae_apply_functions.relu} elif d['activation_function'] == 'elu': activation_function = {'np':sdae_apply_functions.elu} elif d['activation_function'] == 'sigmoid': activation_function = {'np':sdae_apply_functions.sigmoid} # initialize model architecture (number of layers and dimension of each layer) d['current_dimensions'] = d['all_dimensions'][:d['current_hidden_layer']+1] # dimensions of model up to current depth # specify embedding function for current training phase # we want the option of skipping the embedding activation function to apply only to the full model if not d['apply_activation_to_embedding'] and d['current_dimensions'] == d['all_dimensions']: d['current_apply_activation_to_embedding'] = False else: d['current_apply_activation_to_embedding'] = True # specify rows and columns of figure showing data reconstructions d['reconstruction_rows'] = int(np.round(np.sqrt(np.min([100, dataset['valid'].shape[0]])/2))) d['reconstruction_cols'] = 2*d['reconstruction_rows'] # move files print('moving files...', flush=True) if os.path.exists('{0}/intermediate_variables_layer{1!s}_finetuning{2!s}_step{3!s}.pickle'.format(d['output_path'], d['current_hidden_layer'], d['current_finetuning_run'], selected_step)): if os.path.exists('{0}/variables_layer{1!s}_finetuning{2!s}.pickle'.format(d['output_path'], d['current_hidden_layer'], 
d['current_finetuning_run'])): shutil.move('{0}/variables_layer{1!s}_finetuning{2!s}.pickle'.format(d['output_path'], d['current_hidden_layer'], d['current_finetuning_run']), '{0}/variables_layer{1!s}_finetuning{2!s}_old.pickle'.format(d['output_path'], d['current_hidden_layer'], d['current_finetuning_run'])) shutil.copyfile('{0}/intermediate_variables_layer{1!s}_finetuning{2!s}_step{3!s}.pickle'.format(d['output_path'], d['current_hidden_layer'], d['current_finetuning_run'], selected_step), '{0}/variables_layer{1!s}_finetuning{2!s}.pickle'.format(d['output_path'], d['current_hidden_layer'], d['current_finetuning_run'])) else: print('variables do not exist for selected step! skipping...', flush=True) continue if d['use_batchnorm']: if os.path.exists('{0}/intermediate_batchnorm_variables_layer{1!s}_finetuning{2!s}_step{3!s}.pickle'.format(d['output_path'], d['current_hidden_layer'], d['current_finetuning_run'], selected_step)): if os.path.exists('{0}/batchnorm_variables_layer{1!s}_finetuning{2!s}.pickle'.format(d['output_path'], d['current_hidden_layer'], d['current_finetuning_run'])): shutil.move('{0}/batchnorm_variables_layer{1!s}_finetuning{2!s}.pickle'.format(d['output_path'], d['current_hidden_layer'], d['current_finetuning_run']), '{0}/batchnorm_variables_layer{1!s}_finetuning{2!s}_old.pickle'.format(d['output_path'], d['current_hidden_layer'], d['current_finetuning_run'])) shutil.copyfile('{0}/intermediate_batchnorm_variables_layer{1!s}_finetuning{2!s}_step{3!s}.pickle'.format(d['output_path'], d['current_hidden_layer'], d['current_finetuning_run'], selected_step), '{0}/batchnorm_variables_layer{1!s}_finetuning{2!s}.pickle'.format(d['output_path'], d['current_hidden_layer'], d['current_finetuning_run'])) else: print('batchnorm variables do not exist for selected step! 
skipping...', flush=True) continue # load model variables print('loading model variables...', flush=True) with open('{0}/variables_layer{1!s}_finetuning{2!s}.pickle'.format(d['output_path'], d['current_hidden_layer'], d['current_finetuning_run']), 'rb') as fr: W, Be, Bd = pickle.load(fr)[1:] # global_step, W, bencode, bdecode if d['use_batchnorm']: with open('{0}/batchnorm_variables_layer{1!s}_finetuning{2!s}.pickle'.format(d['output_path'], d['current_hidden_layer'], d['current_finetuning_run']), 'rb') as fr: batchnorm_variables = pickle.load(fr) # gammas, betas, moving_means, moving_variances batchnorm_encode_variables, batchnorm_decode_variables = sdae_apply_functions.align_batchnorm_variables(batchnorm_variables, d['current_apply_activation_to_embedding'], d['apply_activation_to_output']) # load reporting variables print('loading reporting variables...', flush=True) if os.path.exists('{0}/optimization_path_layer{1!s}_finetuning{2!s}.pickle'.format(d['output_path'], d['current_hidden_layer'], d['current_finetuning_run'])): with open('{0}/optimization_path_layer{1!s}_finetuning{2!s}.pickle'.format(d['output_path'], d['current_hidden_layer'], d['current_finetuning_run']), 'rb') as fr: optimization_path = pickle.load(fr) reporting_steps = optimization_path['reporting_steps'] valid_losses = optimization_path['valid_losses'] train_losses = optimization_path['train_losses'] valid_noisy_losses = optimization_path['valid_noisy_losses'] train_noisy_losses = optimization_path['train_noisy_losses'] else: reporting_steps = np.zeros(0, dtype='int32') valid_losses = np.zeros(0, dtype='float32') train_losses = np.zeros(0, dtype='float32') valid_noisy_losses = np.zeros(0, dtype='float32') train_noisy_losses = np.zeros(0, dtype='float32') with open('{0}/log_layer{1!s}_finetuning{2!s}.txt'.format(d['output_path'], d['current_hidden_layer'], d['current_finetuning_run']), 'rt') as fr: fr.readline() for line in fr: step, train_loss, valid_loss, train_noisy_loss, valid_noisy_loss, time = [float(x.strip()) for x in line.split('\t')] reporting_steps = np.insert(reporting_steps, reporting_steps.size, step) valid_losses = np.insert(valid_losses, valid_losses.size, valid_loss) train_losses = np.insert(train_losses, train_losses.size, train_loss) valid_noisy_losses = np.insert(valid_noisy_losses, valid_noisy_losses.size, valid_noisy_loss) train_noisy_losses = np.insert(train_noisy_losses, train_noisy_losses.size, train_noisy_loss) with open('{0}/optimization_path_layer{1!s}_finetuning{2!s}.pickle'.format(d['output_path'], d['current_hidden_layer'], d['current_finetuning_run']), 'wb') as fw: pickle.dump({'reporting_steps':reporting_steps, 'valid_losses':valid_losses, 'train_losses':train_losses, 'valid_noisy_losses':valid_noisy_losses, 'train_noisy_losses':train_noisy_losses}, fw) # compute embedding and reconstruction print('computing embedding and reconstruction...', flush=True) recon = {} embed = {} error = {} embed_preactivation = {} for partition in partitions: if d['use_batchnorm']: recon[partition], embed[partition], error[partition] = sdae_apply_functions.encode_and_decode(dataset[partition], W, Be, Bd, activation_function['np'], d['current_apply_activation_to_embedding'], d['apply_activation_to_output'], return_embedding=True, return_reconstruction_error=True, bn_encode_variables=batchnorm_encode_variables, bn_decode_variables=batchnorm_decode_variables) embed_preactivation[partition] = sdae_apply_functions.encode(dataset[partition], W, Be, activation_function['np'], apply_activation_to_embedding=False, 
bn_variables=batchnorm_encode_variables) else: recon[partition], embed[partition], error[partition] = sdae_apply_functions.encode_and_decode(dataset[partition], W, Be, Bd, activation_function['np'], d['current_apply_activation_to_embedding'], d['apply_activation_to_output'], return_embedding=True, return_reconstruction_error=True) embed_preactivation[partition] = sdae_apply_functions.encode(dataset[partition], W, Be, activation_function['np'], apply_activation_to_embedding=False) print('{0} reconstruction error: {1:1.3g}'.format(partition, error[partition]), flush=True) datasetIO.save_datamatrix('{0}/{1}_embedding_layer{2!s}_finetuning{3!s}.pickle'.format(d['output_path'], partition, d['current_hidden_layer'], d['current_finetuning_run']), embed[partition]) datasetIO.save_datamatrix('{0}/{1}_embedding_layer{2!s}_finetuning{3!s}.txt.gz'.format(d['output_path'], partition, d['current_hidden_layer'], d['current_finetuning_run']), embed[partition]) if d['current_apply_activation_to_embedding']: datasetIO.save_datamatrix('{0}/{1}_embedding_preactivation_layer{2!s}_finetuning{3!s}.pickle'.format(d['output_path'], partition, d['current_hidden_layer'], d['current_finetuning_run']), embed_preactivation[partition]) datasetIO.save_datamatrix('{0}/{1}_embedding_preactivation_layer{2!s}_finetuning{3!s}.txt.gz'.format(d['output_path'], partition, d['current_hidden_layer'], d['current_finetuning_run']), embed_preactivation[partition]) # plot loss print('plotting loss...', flush=True) fg, ax = plt.subplots(1, 1, figsize=(3.25,2.25)) ax.set_position([0.55/3.25, 0.45/2.25, 2.6/3.25, 1.7/2.25]) ax.semilogx(reporting_steps, train_losses, ':r', linewidth=1, label='train') ax.semilogx(reporting_steps, valid_losses, '-g', linewidth=1, label='valid') ax.semilogx(reporting_steps, train_noisy_losses, '--b', linewidth=1, label='train,noisy') ax.semilogx(reporting_steps, valid_noisy_losses, '-.k', linewidth=1, label='valid,noisy') ax.legend(loc='best', fontsize=8) ax.set_ylabel('loss', fontsize=8) ax.set_xlabel('steps (selected step:{0!s})'.format(selected_step), fontsize=8) ax.set_xlim(reporting_steps[0]-1, reporting_steps[-1]+1) # ax.set_ylim(0, 1) ax.tick_params(axis='both', which='major', left=True, right=True, bottom=True, top=False, labelleft=True, labelright=False, labelbottom=True, labeltop=False, labelsize=8) fg.savefig('{0}/optimization_path_layer{1!s}_finetuning{2!s}.png'.format(d['output_path'], d['current_hidden_layer'], d['current_finetuning_run']), transparent=True, pad_inches=0, dpi=600) plt.close() # plot reconstructions print('plotting reconstructions...', flush=True) num_recons = min([d['reconstruction_rows']*d['reconstruction_cols'], dataset['valid'].shape[0]]) x_valid = dataset['valid'].matrix[:num_recons,:] xr_valid = recon['valid'].matrix[:num_recons,:] if x_valid.shape[1] > 1000: x_valid = x_valid[:,:1000] xr_valid = xr_valid[:,:1000] lb = np.append(x_valid, xr_valid, 1).min(1) ub = np.append(x_valid, xr_valid, 1).max(1) fg, axs = plt.subplots(d['reconstruction_rows'], d['reconstruction_cols'], figsize=(6.5,3.25)) for i, ax in enumerate(axs.reshape(-1)): if i < num_recons: ax.plot(x_valid[i,:], xr_valid[i,:], 'ok', markersize=0.5, markeredgewidth=0) ax.set_ylim(lb[i], ub[i]) ax.set_xlim(lb[i], ub[i]) ax.tick_params(axis='both', which='major', left=False, right=False, bottom=False, top=False, labelleft=False, labelright=False, labelbottom=False, labeltop=False, pad=4) ax.set_frame_on(False) ax.axvline(lb[i], linewidth=1, color='k') ax.axvline(ub[i], linewidth=1, color='k') ax.axhline(lb[i], 
linewidth=1, color='k') ax.axhline(ub[i], linewidth=1, color='k') else: fg.delaxes(ax) fg.savefig('{0}/reconstructions_layer{1!s}_finetuning{2!s}.png'.format(d['output_path'], d['current_hidden_layer'], d['current_finetuning_run']), transparent=True, pad_inches=0, dpi=1200) plt.close() # plot 2d embedding if d['current_dimensions'][-1] == 2 and (not d['use_finetuning'] or d['current_finetuning_run'] > 0): print('plotting 2d embedding...', flush=True) fg, ax = plt.subplots(1, 1, figsize=(6.5,6.5)) ax.set_position([0.15/6.5, 0.15/6.5, 6.2/6.5, 6.2/6.5]) ax.plot(embed['train'].matrix[:,0], embed['train'].matrix[:,1], 'ok', markersize=2, markeredgewidth=0, alpha=0.5, zorder=0) ax.plot(embed['valid'].matrix[:,0], embed['valid'].matrix[:,1], 'or', markersize=2, markeredgewidth=0, alpha=1.0, zorder=1) ax.tick_params(axis='both', which='major', bottom=False, top=False, labelbottom=False, labeltop=False, left=False, right=False, labelleft=False, labelright=False, pad=4) ax.set_frame_on(False) fg.savefig('{0}/embedding_layer{1!s}_finetuning{2!s}.png'.format(d['output_path'], d['current_hidden_layer'], d['current_finetuning_run']), transparent=True, pad_inches=0, dpi=600) plt.close() if d['current_apply_activation_to_embedding']: fg, ax = plt.subplots(1, 1, figsize=(6.5,6.5)) ax.set_position([0.15/6.5, 0.15/6.5, 6.2/6.5, 6.2/6.5]) ax.plot(embed_preactivation['train'].matrix[:,0], embed_preactivation['train'].matrix[:,1], 'ok', markersize=2, markeredgewidth=0, alpha=0.5, zorder=0) ax.plot(embed_preactivation['valid'].matrix[:,0], embed_preactivation['valid'].matrix[:,1], 'or', markersize=2, markeredgewidth=0, alpha=1.0, zorder=1) ax.tick_params(axis='both', which='major', bottom=False, top=False, labelbottom=False, labeltop=False, left=False, right=False, labelleft=False, labelright=False, pad=4) ax.set_frame_on(False) fg.savefig('{0}/embedding_preactivation_layer{1!s}_finetuning{2!s}.png'.format(d['output_path'], d['current_hidden_layer'], d['current_finetuning_run']), transparent=True, pad_inches=0, dpi=600) plt.close() # log selected step with open('{0}/log.txt'.format(d['output_path']), mode='at', buffering=1) as fl: fl.write('\nadjusted selected step:{0}\n'.format(selected_step)) print('done adjust_early_stopping.', flush=True)
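# --- Illustrative sketch (not part of adjust_early_stopping): the checkpoint promotion
# performed above, isolated as a function. The real script formats layer, finetuning run,
# and step into the file names; promote_checkpoint and its two path arguments are
# hypothetical simplifications of that logic.
import os
import shutil

def promote_checkpoint(intermediate_path, final_path):
    if not os.path.exists(intermediate_path):
        return False  # no variables were saved at the selected step
    if os.path.exists(final_path):
        root, ext = os.path.splitext(final_path)
        shutil.move(final_path, root + '_old' + ext)  # back up the current variables
    shutil.copyfile(intermediate_path, final_path)  # promote the selected step's variables
    return True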
"""@author: ar988996""" import sys sys.path.append('../../utilities') import numpy as np import datasetIO import mapper import os import shutil # load the data print('loading data...', flush=True) gene_atb = datasetIO.load_datamatrix( '../../original_data/phenodigm/geneid_meshid_datamatrix_trimmed.csv.gz', delimiter=',', getmetadata=False) gene_atb.rowname = 'entrez_id' gene_atb.columnname = 'mesh_id' gene_atb.matrixname = 'gene_disease_associations_from_phenodigm-qtq' # THRESHOLD the data # what do the values mean? # values have a strange distribution. 50% are less than 0.2, 97% are less than 0.5. min value is 0.08. max value is 1.15. print('thresholding data...', flush=True) gene_atb.matrix = np.float64(gene_atb.matrix > 0) gene_atb.matrixname += '_thresholded' print('saving thresholded data...', flush=True) datasetIO.save_datamatrix( '../../original_data/phenodigm/gene_disease_phenodigm-qtq_trimmed_thresholded.pickle', gene_atb)
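# --- Toy demonstration (made-up values, not phenodigm data) of the thresholding idiom
# above: the comparison produces booleans and np.float64 casts them to a 0/1 matrix, so
# any stored positive score becomes an association indicator.
import numpy as np

scores = np.array([[0.0, 0.08], [0.5, 1.15]])
print(np.float64(scores > 0))  # [[0. 1.] [1. 1.]]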
import sys sys.path.append('../../utilities') import numpy as np import copy import pickle import datasetIO import mapper import os import shutil from matplotlib import pyplot as plt # load the data gene_atb = datasetIO.load_datamatrix( '../../original_data/GTEXv6plus/gene_tissue_recount2gtexv6_chosen_samples_counts.pickle' ) # scale counts gene_atb.matrix = np.exp( np.log(gene_atb.matrix) - np.log(gene_atb.columnmeta['auc'].reshape(1, -1)) + (np.log(4) + 7 * np.log(10))) gene_atb.matrixname += '_scaledcounts' datasetIO.save_datamatrix( '../../original_data/GTEXv6plus/gene_tissue_recount2gtexv6_chosen_samples_scaledcounts.pickle', gene_atb) datasetIO.save_datamatrix( '../../original_data/GTEXv6plus/gene_tissue_recount2gtexv6_chosen_samples_scaledcounts.txt.gz', gene_atb)
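# --- Equivalence check (toy values): the log-space scaling above equals counts/auc * 4e7,
# i.e. each sample (column) is rescaled by its area-under-coverage to a common library
# size of 4x10^7; working in log space just avoids overflow on large matrices.
import numpy as np

counts = np.array([[10.0, 20.0], [30.0, 40.0]])
auc = np.array([100.0, 200.0])
scaled_log = np.exp(np.log(counts) - np.log(auc.reshape(1, -1)) + (np.log(4) + 7*np.log(10)))
assert np.allclose(scaled_log, counts/auc.reshape(1, -1)*4e7)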
def main(datamatrix_path, test_index, response_variable_name, valid_index, valid_fraction, feature_fraction, regularization_type, inverse_regularization_strength, intercept_scaling, pos_neg_weight_ratio, evaluation_statistic, save_weights, save_folder, datamatrix): print('loading datamatrix...', flush=False) if datamatrix == None or type(datamatrix) == str: dm = datasetIO.load_datamatrix(datamatrix_path) else: dm = datamatrix print('setting random seed with test_index {0!s}...'.format(test_index), flush=False) np.random.seed(test_index) print('getting bootstrap sample...', flush=False) all_indices = np.arange(dm.shape[0]) boot_indices = np.random.choice(dm.shape[0], dm.shape[0], replace=True) test_indices = all_indices[~np.in1d(all_indices, boot_indices)] print('reserving out-of-bag samples as test set...', flush=False) Y = { 'test': dm.rowmeta[response_variable_name][test_indices].astype('bool') } X = {'test': dm.matrix[test_indices, :]} print('setting random seed with valid_index {0!s}...'.format(valid_index), flush=False) np.random.seed(valid_index) print('splitting bootstrap sample into training and validation sets...', flush=False) if type(valid_fraction) == str and (valid_fraction.lower() == 'loo' or valid_fraction.lower() == 'loocv'): valid_fraction = 'loo' valid_indices = all_indices train_indices = all_indices else: valid_indices = np.random.choice(dm.shape[0], round(valid_fraction * dm.shape[0]), replace=False) train_indices = all_indices[~np.in1d(all_indices, valid_indices)] Y['train'] = dm.rowmeta[response_variable_name][boot_indices][ train_indices].astype('bool') Y['valid'] = dm.rowmeta[response_variable_name][boot_indices][ valid_indices].astype('bool') X['train'] = dm.matrix[boot_indices, :][train_indices, :] X['valid'] = dm.matrix[boot_indices, :][valid_indices, :] print('fitting and evaluating models...', flush=False) stages = ['validation', 'testing'] data_subsets = ['fit', 'predict'] performance_stats = [ 'auroc', 'auprc', 'brier', 'nll', 'tp', 'fn', 'tn', 'fp', 'ap', 'an', 'pp', 'pn', 'n', 'tpr', 'fnr', 'tnr', 'fpr', 'ppv', 'fdr', 'npv', 'fomr', 'acc', 'mcr', 'prev', 'plr', 'nlr', 'dor', 'drr', 'darr', 'mrr', 'marr', 'mcc', 'fnlp', 'f1', 'f1_100', 'f1_50', 'f1_25', 'f1_10', 'f1_5', 'f1_3', 'f1_2', 'f2', 'f3', 'f5', 'f10', 'f25', 'f50', 'f100' ] if valid_fraction == 'loo': X.update({ 'validation': { 'fit': X['train'], 'predict': X['valid'] }, 'testing': { 'fit': X['train'], 'predict': X['test'] } }) Y.update({ 'validation': { 'fit': Y['train'], 'predict': Y['valid'] }, 'testing': { 'fit': Y['train'], 'predict': Y['test'] } }) else: X.update({ 'validation': { 'fit': X['train'], 'predict': X['valid'] }, 'testing': { 'fit': np.append(X['train'], X['valid'], 0), 'predict': X['test'] } }) Y.update({ 'validation': { 'fit': Y['train'], 'predict': Y['valid'] }, 'testing': { 'fit': np.append(Y['train'], Y['valid']), 'predict': Y['test'] } }) stat_subset = {} for stage in stages: print('working on {0} stage...'.format(stage), flush=False) if feature_fraction < 1: print('performing univariate feature selection...', flush=False) num_features = round(feature_fraction * dm.shape[1]) test_stats, p_values = ttest_ind( X[stage]['fit'][Y[stage]['fit'], :], X[stage]['fit'][~Y[stage]['fit'], :], axis=0, equal_var=False, nan_policy='propagate') ranks = np.argsort(p_values) selected_indices = ranks[:num_features] selected_features = dm.columnlabels[selected_indices] if stage == 'testing': print('plotting univariate test statistics...', flush=False) plt.figure() plt.hist(test_stats, 50) 
plt.savefig( '{0}/univariate_test_statistics.png'.format(save_folder), transparent=True, pad_inches=0, dpi=100) plt.figure() plt.hist(p_values, 50) plt.savefig('{0}/univariate_pvalues.png'.format(save_folder), transparent=True, pad_inches=0, dpi=100) plt.figure() plt.hist(-np.log10(p_values), 50) plt.savefig('{0}/univariate_nlps.png'.format(save_folder), transparent=True, pad_inches=0, dpi=100) else: print('skipping univariate feature selection...', flush=False) selected_indices = np.arange(dm.shape[1], dtype='int64') selected_features = dm.columnlabels.copy() print('selected {0!s} features...'.format(selected_features.size), flush=False) print('calculating class weights...', flush=False) pos_weight = np.sqrt(pos_neg_weight_ratio) * ( (Y[stage]['fit'].size) / 2 / (Y[stage]['fit'].sum()) ) # (assign weight to class)*(adjust for unbalanced classes) neg_weight = (1 / pos_weight) * ( (Y[stage]['fit'].size) / 2 / ((~Y[stage]['fit']).sum()) ) # (assign weight to class)*(adjust for unbalanced classes) class_weight = {True: pos_weight, False: neg_weight} print('fitting model...', flush=False) logistic_regression_model = LogisticRegression( penalty=regularization_type, C=inverse_regularization_strength, intercept_scaling=intercept_scaling, class_weight=class_weight).fit( X[stage]['fit'][:, selected_indices], Y[stage]['fit']) if stage == 'testing': print('plotting feature weights...', flush=False) iter_feature = DataMatrix( rowname='iteration', rowlabels=np.array( ['test{0!s}_valid{1!s}'.format(test_index, valid_index)], dtype='object'), rowmeta={ 'intercept': logistic_regression_model.intercept_, 'test_index': np.array([test_index], dtype='int64'), 'valid_index': np.array([valid_index], dtype='int64') }, columnname=dm.columnname, columnlabels=dm.columnlabels.copy(), columnmeta=copy.deepcopy(dm.columnmeta), matrixname='feature_weights', matrix=np.zeros((1, dm.shape[1]), dtype='float64')) feature_idx = {f: i for i, f in enumerate(dm.columnlabels)} for feature, weight in zip(selected_features, logistic_regression_model.coef_[0, :]): iter_feature.matrix[0, feature_idx[feature]] = weight plt.figure() plt.hist(iter_feature.matrix[0, :], 50) plt.savefig('{0}/feature_weights.png'.format(save_folder), transparent=True, pad_inches=0, dpi=100) if feature_fraction < 1: plt.figure() plt.hist(iter_feature.matrix[0, selected_indices], 50) plt.savefig( '{0}/feature_weights_selected.png'.format(save_folder), transparent=True, pad_inches=0, dpi=100) if save_weights: print('saving feature weights...', flush=False) datasetIO.save_datamatrix( '{0}/iter_feature_datamatrix.txt.gz'.format(save_folder), iter_feature) print('creating datamatrix for performance statistics...', flush=False) stat_subset[stage] = DataMatrix( rowname='performance_statistic', rowlabels=np.array(performance_stats, dtype='object'), rowmeta={}, columnname='data_subset', columnlabels=np.array(data_subsets, dtype='object'), columnmeta={}, matrixname='classifier_performance_on_data_subsets', matrix=np.zeros((len(performance_stats), len(data_subsets)), dtype='float64')) for j, subset in enumerate(stat_subset[stage].columnlabels): print('evaluating performance on {0} subset...'.format(subset), flush=False) if valid_fraction == 'loo' and stage == 'validation' and subset == 'predict': P_pred = np.zeros(X[stage][subset].shape[0], dtype='float64') for train_index, test_index in LeaveOneOut().split( X[stage][subset]): logistic_regression_model = LogisticRegression( penalty=regularization_type, C=inverse_regularization_strength, 
intercept_scaling=intercept_scaling, class_weight=class_weight).fit( X[stage]['fit'][train_index, :][:, selected_indices], Y[stage]['fit'][train_index]) P_pred[ test_index] = logistic_regression_model.predict_proba( X[stage][subset][test_index, :][:, selected_indices] )[:, logistic_regression_model.classes_ == 1][0][0] else: P_pred = logistic_regression_model.predict_proba( X[stage][subset][:, selected_indices] )[:, logistic_regression_model.classes_ == 1] Y_pred = P_pred > 0.5 auroc = roc_auc_score(Y[stage][subset], P_pred) auprc = average_precision_score(Y[stage][subset], P_pred) brier = brier_score_loss(Y[stage][subset], P_pred) nll = log_loss(Y[stage][subset], P_pred) tn, fp, fn, tp = confusion_matrix(Y[stage][subset], Y_pred).ravel() # incorporate a prior with effective sample size = n_eff, where prior represents random predictions n_eff = 1 prevalence = (tp + fn) / (tn + fp + fn + tp) tp += n_eff * prevalence / 2 fn += n_eff * prevalence / 2 tn += n_eff * (1 - prevalence) / 2 fp += n_eff * (1 - prevalence) / 2 ap = tp + fn an = fp + tn pp = tp + fp pn = tn + fn n = tn + fp + fn + tp tpr = tp / ap # sensitivity, recall fnr = fn / ap # 1-tpr, 1-sensitivity, 1-recall tnr = tn / an # specificity fpr = fp / an # 1-tnr, 1-specificity ppv = tp / pp # precision fdr = fp / pp # 1-ppv, 1-precision npv = tn / pn fomr = fn / pn # 1-npv acc = (tp + tn) / n mcr = (fp + fn) / n # 1-acc prev = ap / n plr = (tp / fp) / ( ap / an ) # tpr/fpr, sensitivity/(1-specificity), ratio of positives to negatives in positive predictions relative to ratio in whole sample, higher is better nlr = (fn / tn) / ( ap / an ) # fnr/tnr, (1-sensitivity)/specificity, ratio of positives to negatives in negative predictions relative to ratio in whole sample, lower is better dor = (tp / fp) / ( fn / tn ) # plr/nlr, ratio of positives to negatives in positive predictions, divided by ratio of positives to negatives in negative predictions drr = (tp / pp) / ( fn / pn ) # ppv/fomr, relative risk or risk ratio, fraction of positives in positive predictions divided by fraction of positives in negative predictions darr = (tp / pp) - ( fn / pn ) # ppv - fomr, absolute risk reduction, fraction of positives in positive predictions minus fraction of positives in negative predictions mrr = (tp / pp) / ( ap / n ) # ppv/prev, modified (by me) relative risk or risk ratio, fraction of positives in positive predictions divided by fraction of positives in whole sample marr = (tp / pp) - ( ap / n ) # ppv - prev, modified (by me) absolute risk reduction, fraction of positives in positive predictions minus fraction of positives in whole sample mcc = (tp * tn - fp * fn) / np.sqrt( (tp + fp) * (tp + fn) * (tn + fp) * (tn + fn)) fnlp = -hypergeom.logsf(tp, n, ap, pp, loc=1) / np.log(10) precision = ppv recall = tpr f1 = (1 + (1**2)) * precision * recall / ((1**2) * precision + recall) f1_100 = (1 + (1 / 100**2)) * precision * recall / ( (1 / 100**2) * precision + recall) f1_50 = (1 + (1 / 50**2)) * precision * recall / ( (1 / 50**2) * precision + recall) f1_25 = (1 + (1 / 25**2)) * precision * recall / ( (1 / 25**2) * precision + recall) f1_10 = (1 + (1 / 10**2)) * precision * recall / ( (1 / 10**2) * precision + recall) f1_5 = (1 + (1 / 5**2)) * precision * recall / ( (1 / 5**2) * precision + recall) f1_3 = (1 + (1 / 3**2)) * precision * recall / ( (1 / 3**2) * precision + recall) f1_2 = (1 + (1 / 2**2)) * precision * recall / ( (1 / 2**2) * precision + recall) f2 = (1 + (2**2)) * precision * recall / ((2**2) * precision + recall) f3 = (1 + (3**2)) 
* precision * recall / ((3**2) * precision + recall) f5 = (1 + (5**2)) * precision * recall / ((5**2) * precision + recall) f10 = (1 + (10**2)) * precision * recall / ( (10**2) * precision + recall) f25 = (1 + (25**2)) * precision * recall / ( (25**2) * precision + recall) f50 = (1 + (50**2)) * precision * recall / ( (50**2) * precision + recall) f100 = (1 + (100**2)) * precision * recall / ( (100**2) * precision + recall) stat_subset[stage].matrix[:, j] = [ auroc, auprc, brier, nll, tp, fn, tn, fp, ap, an, pp, pn, n, tpr, fnr, tnr, fpr, ppv, fdr, npv, fomr, acc, mcr, prev, plr, nlr, dor, drr, darr, mrr, marr, mcc, fnlp, f1, f1_100, f1_50, f1_25, f1_10, f1_5, f1_3, f1_2, f2, f3, f5, f10, f25, f50, f100 ] print('saving performance statistics...', flush=False) datasetIO.save_datamatrix( '{0}/stat_subset_datamatrix_{1}.txt.gz'.format(save_folder, stage), stat_subset[stage]) print('printing performance statistics...', flush=False) print('\t'.join(['stage', stat_subset[stage].rowname] + stat_subset[stage].columnlabels.tolist()), flush=False) for stat, vals in zip(stat_subset[stage].rowlabels, stat_subset[stage].matrix): print('\t'.join([stage, stat] + ['{0:1.3g}'.format(v) for v in vals]), flush=False) print('saving evaluation statistic...', flush=False) objective = stat_subset['validation'].select(evaluation_statistic, 'predict') with open('{0}/output.json'.format(save_folder), mode='wt', encoding='utf-8', errors='surrogateescape') as fw: json.dump(objective, fw, indent=2) print('done logistic_regression.py', flush=False)
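# --- Illustrative sketch (not part of logistic_regression.py): the confusion-matrix
# smoothing used above, isolated as a function. A prior with effective sample size n_eff,
# representing random predictions and split according to the observed prevalence, adds
# pseudo-counts to each cell so that derived ratios such as plr, dor, and drr stay finite
# when a cell would otherwise be zero.
def smooth_confusion(tp, fn, tn, fp, n_eff=1.0):
    prevalence = (tp + fn)/(tn + fp + fn + tp)
    tp += n_eff*prevalence/2
    fn += n_eff*prevalence/2
    tn += n_eff*(1 - prevalence)/2
    fp += n_eff*(1 - prevalence)/2
    return tp, fn, tn, fp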
def main(study_name='your_study'): # load the data orientation = 'fat' partitions = ['train', 'valid', 'test'] dataset = {} for partition in partitions: dataset[partition] = datasetIO.load_datamatrix('data/prepared_data/{0}/{1}.pickle'.format(orientation, partition)) if 'all' not in dataset: dataset['all'] = copy.deepcopy(dataset[partition]) else: dataset['all'].append(dataset[partition], 0) dataset[study_name] = {} for partition in partitions: dataset[study_name][partition] = datasetIO.load_datamatrix('data/prepared_data/{0}/{1}/{2}.pickle'.format(study_name, orientation, partition)) if 'all' not in dataset[study_name]: dataset[study_name]['all'] = copy.deepcopy(dataset[study_name][partition]) else: dataset[study_name]['all'].append(dataset[study_name][partition], 0) partitions.append('all') # create output directories if not os.path.exists('results'): os.mkdir('results') if not os.path.exists('results/sdae_features'): os.mkdir('results/sdae_features') if not os.path.exists('results/sdae_features/{0}'.format(study_name)): os.mkdir('results/sdae_features/{0}'.format(study_name)) if not os.path.exists('results/sdae_features/{0}/{1}'.format(study_name, orientation)): os.mkdir('results/sdae_features/{0}/{1}'.format(study_name, orientation)) # load the model activation_function, activation_function_name = (relu, 'relu') with open('results/autoencoder/fat/ns5_last2_first0.05_5layers_relu_variables.pickle', 'rb') as fr: W, Be, Bd = pickle.load(fr)[1:] # global_step, W, bencode, bdecode # get embeddings and reconstructions sdae = {} for partition in partitions: sdae[partition] = {} sdae[partition]['recon'], sdae[partition]['embed'], sdae[partition]['error'] = sdae_reconstruction(dataset[partition], W, Be, Bd, activation=activation_function, apply_activation_to_output=False, return_embedding=True, return_reconstruction_error=True) print('{0} error: {1:1.3g}'.format(partition, sdae[partition]['error'])) sdae[study_name] = {} for partition in partitions: sdae[study_name][partition] = {} sdae[study_name][partition]['recon'], sdae[study_name][partition]['embed'], sdae[study_name][partition]['error'] = sdae_reconstruction(dataset[study_name][partition], W, Be, Bd, activation=activation_function, apply_activation_to_output=False, return_embedding=True, return_reconstruction_error=True) print('{0} {1} error: {2:1.3g}'.format(study_name, partition, sdae[study_name][partition]['error'])) # visualize embedding if sdae['all']['embed'].shape[1] < 5: for nx in range(sdae['all']['embed'].shape[1]-1): for ny in range(nx+1, sdae['all']['embed'].shape[1]): #tissues = np.unique(dataset['all'].rowmeta['general_tissue']) tissues = ['Adipose Tissue', 'Adrenal Gland', 'Blood', 'Blood Vessel', 'Brain', 'Breast', 'Colon', 'Esophagus', 'Heart', 'Kidney', 'Liver', 'Lung', 'Muscle', 'Nerve', 'Ovary', 'Pancreas', 'Pituitary', 'Prostate', 'Salivary Gland', 'Skin', 'Small Intestine', 'Spleen', 'Stomach', 'Testis', 'Thyroid', 'Uterus', 'Vagina'] tissue_abbrevs = ['AT', 'AG', 'B', 'BV', 'Bn', 'Bt', 'C', 'E', 'H', 'K', 'Lr', 'Lg', 'M', 'N', 'O', 'Ps', 'Py', 'Pe', 'SG', 'Sk', 'SI', 'Sp', 'St', 'Ts', 'Td', 'U', 'V'] cmap = plt.get_cmap('gist_rainbow') colors = [cmap(float((i+0.5)/len(tissues))) for i in range(len(tissues))] fg, ax = plt.subplots(1, 1, figsize=(6.5,4.3)) ax.set_position([0.15/6.5, 0.15/4.3, 4.0/6.5, 4.0/4.3]) for tissue, tissue_abbrev, color in zip(tissues, tissue_abbrevs, colors): if tissue == '-666': continue # zorder = 0 # alpha = 0.05 # color = 'k' else: zorder = 1 alpha = 0.5 hit = 
dataset['all'].rowmeta['general_tissue'] == tissue hidxs = hit.nonzero()[0] # ax.plot(sdae['all']['embed'].matrix[hit,nx], sdae['all']['embed'].matrix[hit,ny], linestyle='None', linewidth=0, marker='o', markerfacecolor=color, markeredgecolor=color, markersize=2, markeredgewidth=0, alpha=alpha, zorder=zorder, label='{0}, {1}'.format(tissue_abbrev, tissue)) ax.plot(sdae['all']['embed'].matrix[hit,nx], sdae['all']['embed'].matrix[hit,ny], linestyle='None', linewidth=0, marker='o', markerfacecolor=color, markeredgecolor=color, markersize=0.2, markeredgewidth=0, alpha=alpha, zorder=zorder, label='{0}, {1}'.format(tissue_abbrev, tissue)) for hidx in hidxs: ax.text(sdae['all']['embed'].matrix[hidx,nx], sdae['all']['embed'].matrix[hidx,ny], tissue_abbrev, horizontalalignment='center', verticalalignment='center', fontsize=4, color=color, alpha=alpha, zorder=zorder, label='{0}, {1}'.format(tissue_abbrev, tissue)) ax.plot(sdae[study_name]['all']['embed'].matrix[:,nx], sdae[study_name]['all']['embed'].matrix[:,ny], linestyle='None', linewidth=0, marker='x', markerfacecolor='k', markeredgecolor='k', markersize=0.2, markeredgewidth=0, alpha=1, zorder=1, label=study_name) for hidx in range(sdae[study_name]['all']['embed'].shape[0]): ax.text(sdae[study_name]['all']['embed'].matrix[hidx,nx], sdae[study_name]['all']['embed'].matrix[hidx,ny], 'X', horizontalalignment='center', verticalalignment='center', fontsize=4, color='k', alpha=1, zorder=1, label=study_name) ax.set_xlim(sdae['all']['embed'].matrix[:,nx].min(), sdae['all']['embed'].matrix[:,nx].max()) ax.set_ylim(sdae['all']['embed'].matrix[:,ny].min(), sdae['all']['embed'].matrix[:,ny].max()) ax.legend(loc='center left', bbox_to_anchor=(1, 0.5), borderaxespad=0, frameon=False, ncol=1, numpoints=1, markerscale=40, fontsize=8, labelspacing=0.25) ax.tick_params(axis='both', which='major', bottom='off', top='off', labelbottom='off', labeltop='off', left='off', right='off', labelleft='off', labelright='off', pad=4) ax.set_frame_on(False) fg.savefig('results/sdae_features/{0}/{1}/sdae2d_{2}_coloredby_general_tissue_x{3!s}_y{4!s}.png'.format(study_name, orientation, activation_function_name, nx, ny), transparent=True, pad_inches=0, dpi=600) ax.set_xlim(sdae[study_name]['all']['embed'].matrix[:,nx].min(), sdae[study_name]['all']['embed'].matrix[:,nx].max()) ax.set_ylim(sdae[study_name]['all']['embed'].matrix[:,ny].min(), sdae[study_name]['all']['embed'].matrix[:,ny].max()) fg.savefig('results/sdae_features/{0}/{1}/sdae2d_{2}_coloredby_general_tissue_x{3!s}_y{4!s}_zoom.png'.format(study_name, orientation, activation_function_name, nx, ny), transparent=True, pad_inches=0, dpi=600) plt.close() # save embedding datasetIO.save_datamatrix('results/sdae_features/{0}/{1}/sdae2d_{2}_datamatrix.txt.gz'.format(study_name, orientation, activation_function_name), sdae[study_name]['all']['embed']) datasetIO.save_datamatrix('results/sdae_features/{0}/{1}/sdae2d_{2}_datamatrix.pickle'.format(study_name, orientation, activation_function_name), sdae[study_name]['all']['embed']) datasetIO.save_datamatrix('results/sdae_features/{0}/{1}/sdae_reconstructions_{2}_datamatrix.txt.gz'.format(study_name, orientation, activation_function_name), sdae[study_name]['all']['recon']) datasetIO.save_datamatrix('results/sdae_features/{0}/{1}/sdae_reconstructions_{2}_datamatrix.pickle'.format(study_name, orientation, activation_function_name), sdae[study_name]['all']['recon'])
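# --- Illustrative sketch (random stand-in data): the save-then-zoom pattern above, where
# one scatter is rendered once, saved with the full axis range of the reference embedding,
# and saved again after tightening the limits to the query points, without replotting.
import numpy as np
from matplotlib import pyplot as plt

background = np.random.randn(500, 2)  # stand-in for the reference embedding
query = 0.2*np.random.randn(20, 2)    # stand-in for the new study's embedding
fg, ax = plt.subplots()
ax.plot(background[:, 0], background[:, 1], 'ok', markersize=2, markeredgewidth=0, alpha=0.5)
ax.plot(query[:, 0], query[:, 1], 'xk')
fg.savefig('embedding_full.png', transparent=True, pad_inches=0, dpi=300)
ax.set_xlim(query[:, 0].min(), query[:, 0].max())  # zoom to the query points
ax.set_ylim(query[:, 1].min(), query[:, 1].max())
fg.savefig('embedding_zoom.png', transparent=True, pad_inches=0, dpi=300)
plt.close()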
""" import sys sys.path.append('../../utilities') import numpy as np import datasetIO import mapper import os import shutil import pickle # load the data print('loading data...', flush=True) print('values are p-values with non-significant associations (pvalue > 1e-4) imputed with pvalue=1', flush=True) gene_atb = datasetIO.load_datamatrix('../../original_data/impc/mousegeneid_mousephenotypeid_datamatrix_trimmed.csv.gz', delimiter=',', getmetadata=False) # (3455, 295) gene_atb.rowname = 'mgd_id' gene_atb.columnname = 'mp_id' gene_atb.matrixname = 'gene_phenotype_associations_from_impc' # threshold the data print('thresholding data...', flush=True) print('because significant associations have p-value 1e-4 or less, perhaps relative p-values are not informative and better to threshold', flush=True) gene_atb.matrix = np.float64(gene_atb.matrix < 1) gene_atb.matrixname += '_thresholded' print('matrix sparsity: {0!s}, row median sparsity: {1!s}, column median sparsity: {2!s}'.format(gene_atb.matrix.sum()/gene_atb.size, np.median(gene_atb.matrix.sum(1)/gene_atb.shape[1]), np.median(gene_atb.matrix.sum(0)/gene_atb.shape[0])), flush=True) # save thresholded data print('saving thresholded data...', flush=True) datasetIO.save_datamatrix('../../original_data/impc/mousegeneid_mousephenotypeid_datamatrix_trimmed_thresholded.pickle', gene_atb) datasetIO.save_datamatrix('../../original_data/impc/mousegeneid_mousephenotypeid_datamatrix_trimmed_thresholded.txt.gz', gene_atb)
def main(study_name='your_study'): # load your data and create datamatrix object with open('data/original_data/{0}/ensembl_gene_ids.txt'.format(study_name), mode='rt', encoding='utf-8', errors='surrogateescape') as fr: ensembl_gene_ids = np.array([x.strip() for x in fr.read().split('\n')], dtype='object') with open('data/original_data/{0}/sample_ids.txt'.format(study_name), mode='rt', encoding='utf-8', errors='surrogateescape') as fr: sample_ids = np.array([x.strip() for x in fr.read().split('\n')], dtype='object') counts_matrix = np.loadtxt( 'data/original_data/{0}/expression_matrix.txt.gz'.format(study_name), dtype='float64', delimiter='\t', ndmin=2) total_counts_per_sample = counts_matrix.sum(0) gene_sample = dataclasses.datamatrix( rowname='ensembl_gene_id', rowlabels=ensembl_gene_ids, rowmeta={}, columnname='sample_id', columnlabels=sample_ids, columnmeta={'total_counts': total_counts_per_sample}, matrixname='rnaseq_gene_counts_from_{0}'.format(study_name), matrix=counts_matrix) del ensembl_gene_ids, sample_ids, counts_matrix, total_counts_per_sample # scale counts gene_sample.matrix = np.exp( np.log(gene_sample.matrix) - np.log(gene_sample.columnmeta['total_counts'].reshape(1, -1)) + (np.log(4) + 7 * np.log(10))) gene_sample.matrixname = 'rnaseq_scaled_counts_from_{0}'.format(study_name) # shuffle the data gene_sample.reorder(np.random.permutation(gene_sample.shape[0]), 0) gene_sample.reorder(np.random.permutation(gene_sample.shape[1]), 1) print(gene_sample) # load the reference data gene_sample_ref = datasetIO.load_datamatrix( 'data/prepared_data/fat/train.pickle').totranspose() print(gene_sample_ref) # align genes tobediscarded = ~np.in1d(gene_sample.rowlabels, gene_sample_ref.rowmeta['ensembl_gene_id']) gene_sample.discard(tobediscarded, 0) missing_ensembl_ids = gene_sample_ref.rowmeta['ensembl_gene_id'][~np.in1d( gene_sample_ref.rowmeta['ensembl_gene_id'], gene_sample.rowlabels)] gene_sample = gene_sample.tolabels( rowlabels=gene_sample_ref.rowmeta['ensembl_gene_id'].copy(), columnlabels=[]) gene_sample.rowlabels = gene_sample_ref.rowlabels.copy() gene_sample.rowname = gene_sample_ref.rowname for k, v in gene_sample_ref.rowmeta.items(): gene_sample.rowmeta[k] = v.copy() gene_sample.rowmeta['is_missing'] = np.in1d( gene_sample.rowmeta['ensembl_gene_id'], missing_ensembl_ids) gene_sample.rowmeta['all_zero'] = (gene_sample.matrix == 0).all(1) print('missing data for {0!s} genes'.format( gene_sample.rowmeta['is_missing'].sum())) print('no counts for {0!s} genes'.format( gene_sample.rowmeta['all_zero'].sum())) print(gene_sample) # handle zeros nonzeromins = np.zeros(gene_sample.shape[1], dtype='float64') for j in range(gene_sample.shape[1]): nonzeromins[j] = gene_sample.matrix[gene_sample.matrix[:, j] > 0, j].min() gene_sample.matrix[gene_sample.matrix[:, j] == 0, j] = nonzeromins[j] / 2.0 # distributions # plt.figure(); plt.hist(gene_sample.matrix[:,:5], 50) # plt.figure(); plt.hist(((gene_sample.matrix[:5,:] - gene_sample.matrix[:5,:].mean(1, keepdims=True))/gene_sample.matrix[:5,:].std(1, ddof=1, keepdims=True)).T, 10) # log2 gene_sample.matrix = np.log2(gene_sample.matrix) # distributions # plt.figure(); plt.hist(gene_sample.matrix[:,:5], 50) # plt.figure(); plt.hist(((gene_sample.matrix[:5,:] - gene_sample.matrix[:5,:].mean(1, keepdims=True))/gene_sample.matrix[:5,:].std(1, ddof=1, keepdims=True)).T, 10) # normalize samples median_shift_from_median = np.median( gene_sample.matrix - gene_sample.rowmeta['median_sample_ref'].reshape(-1, 1), 0) gene_sample.matrix -= 
median_shift_from_median.reshape(1, -1) # distributions # plt.figure(); plt.hist(gene_sample.matrix[:,:5], 50) # plt.figure(); plt.hist(((gene_sample.matrix[:5,:] - gene_sample.matrix[:5,:].mean(1, keepdims=True))/gene_sample.matrix[:5,:].std(1, ddof=1, keepdims=True)).T, 10) # standardize the data gene_sample.matrix = ( gene_sample.matrix - gene_sample.rowmeta['row_mean_ref'].reshape( -1, 1)) / gene_sample.rowmeta['row_stdv_ref'].reshape(-1, 1) # handle missing genes gene_sample.matrix[gene_sample.rowmeta['is_missing'], :] = 0 # gene_sample.matrix[gene_sample.rowmeta['is_missing'],:] = gene_sample_ref.matrix[gene_sample.rowmeta['is_missing'],:].min(1, keepdims=True)/2.0 # distributions # plt.figure(); plt.hist(gene_sample.matrix[:,:5], 50) # plt.figure(); plt.hist(gene_sample.matrix[:5,:].T, 10) # plt.figure(); plt.hist(gene_sample.matrix.reshape(-1), 1000) # transpose the data atb_gene = gene_sample.totranspose() # split the data test_fraction = 0.1 tobepopped = np.random.permutation(gene_sample.shape[0]) < round( max([test_fraction * gene_sample.shape[0], 2.0])) gene_sample_test = gene_sample.pop(tobepopped, 0) valid_fraction = 0.1 tobepopped = np.random.permutation(gene_sample.shape[0]) < round( max([valid_fraction * gene_sample.shape[0], 2.0])) gene_sample_valid = gene_sample.pop(tobepopped, 0) gene_sample_train = gene_sample del gene_sample, tobepopped # save the data if not os.path.exists('data/prepared_data'): os.mkdir('data/prepared_data') if not os.path.exists('data/prepared_data/{0}'.format(study_name)): os.mkdir('data/prepared_data/{0}'.format(study_name)) if not os.path.exists('data/prepared_data/{0}/skinny'.format(study_name)): os.mkdir('data/prepared_data/{0}/skinny'.format(study_name)) datasetIO.save_datamatrix( 'data/prepared_data/{0}/skinny/test.pickle'.format(study_name), gene_sample_test) datasetIO.save_datamatrix( 'data/prepared_data/{0}/skinny/valid.pickle'.format(study_name), gene_sample_valid) datasetIO.save_datamatrix( 'data/prepared_data/{0}/skinny/train.pickle'.format(study_name), gene_sample_train) del gene_sample_test, gene_sample_valid, gene_sample_train # split the data test_fraction = 0.1 tobepopped = np.random.permutation(atb_gene.shape[0]) < round( max([test_fraction * atb_gene.shape[0], 2.0])) atb_gene_test = atb_gene.pop(tobepopped, 0) valid_fraction = 0.1 tobepopped = np.random.permutation(atb_gene.shape[0]) < round( max([valid_fraction * atb_gene.shape[0], 2.0])) atb_gene_valid = atb_gene.pop(tobepopped, 0) atb_gene_train = atb_gene del atb_gene, tobepopped # save the data if not os.path.exists('data/prepared_data'): os.mkdir('data/prepared_data') if not os.path.exists('data/prepared_data/{0}'.format(study_name)): os.mkdir('data/prepared_data/{0}'.format(study_name)) if not os.path.exists('data/prepared_data/{0}/fat'.format(study_name)): os.mkdir('data/prepared_data/{0}/fat'.format(study_name)) datasetIO.save_datamatrix( 'data/prepared_data/{0}/fat/test.pickle'.format(study_name), atb_gene_test) datasetIO.save_datamatrix( 'data/prepared_data/{0}/fat/valid.pickle'.format(study_name), atb_gene_valid) datasetIO.save_datamatrix( 'data/prepared_data/{0}/fat/train.pickle'.format(study_name), atb_gene_train)
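# --- Illustrative sketch (plain array, not a datamatrix) of the partitioning idiom above:
# comparing a random permutation to a count yields a boolean mask selecting roughly
# test_fraction of the rows (never fewer than 2), which pop() then removes in place.
import numpy as np

n_rows, test_fraction = 30, 0.1
tobepopped = np.random.permutation(n_rows) < round(max([test_fraction*n_rows, 2.0]))
print(tobepopped.sum())  # rows destined for the test split (3 of 30)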
def main(d): # d is a dictionary containing the auto-encoder design specifications and training phase specifications # RESET DEFAULT GRAPH print('resetting default graph...', flush=True) tf.reset_default_graph() # FINISH CONFIGURATION print('finishing configuration...', flush=True) # specify noise distribution if d['noise_distribution'] == 'truncnorm': noise_distribution = tf.truncated_normal elif d['noise_distribution'] == 'uniform': noise_distribution = tf.random_uniform # specify distribution of initial weights if d['initialization_distribution'] == 'truncnorm': initialization_distribution = tf.truncated_normal # specify activation function if d['activation_function'] == 'tanh': activation_function = {'tf': tf.tanh, 'np': sdae_apply_functions.tanh} elif d['activation_function'] == 'relu': activation_function = { 'tf': tf.nn.relu, 'np': sdae_apply_functions.relu } elif d['activation_function'] == 'elu': activation_function = {'tf': tf.nn.elu, 'np': sdae_apply_functions.elu} elif d['activation_function'] == 'sigmoid': activation_function = { 'tf': tf.sigmoid, 'np': sdae_apply_functions.sigmoid } # load data partitions = ['train', 'valid', 'test'] dataset = {} for partition in partitions: dataset[partition] = datasetIO.load_datamatrix('{0}/{1}.pickle'.format( d['input_path'], partition)) d['{0}_examples'.format(partition)] = dataset[partition].shape[0] # create output directory if not os.path.exists(d['output_path']): os.makedirs(d['output_path']) # initialize model architecture (number of layers and dimension of each layer) d['current_dimensions'] = d[ 'all_dimensions'][:d['current_hidden_layer'] + 1] # dimensions of model up to current depth # specify embedding function for current training phase # we want the option of skipping the embedding activation function to apply only to the full model if not d['apply_activation_to_embedding'] and d['current_dimensions'] == d[ 'all_dimensions']: d['current_apply_activation_to_embedding'] = False else: d['current_apply_activation_to_embedding'] = True # initialize assignments of training examples to mini-batches and number of training steps for stochastic gradient descent d['batch_size'] = d['batch_fraction'] * d['train_examples'] batch_ids = create_batch_ids(d['train_examples'], d['batch_size']) d['batches'] = np.unique(batch_ids).size d['steps'] = d['current_epochs'] * d['batches'] # specify path to weights from previous training run d['previous_variables_path'] = '{0}/variables_layer{1!s}_finetuning{2!s}.pickle'.format( d['output_path'], d['previous_hidden_layer'], d['previous_finetuning_run']) d['fix_or_init'] = 'fix' if d[ 'current_finetuning_run'] == 0 else 'init' # fix for pretraining, init for finetuning # specify rows and columns of figure showing data reconstructions d['reconstruction_rows'] = int( np.round(np.sqrt(np.min([100, d['valid_examples']]) / 2))) d['reconstruction_cols'] = 2 * d['reconstruction_rows'] # print some design information print('input path: {0}'.format(d['input_path']), flush=True) print('output path: {0}'.format(d['output_path']), flush=True) print('previous variables path: {0}'.format(d['previous_variables_path']), flush=True) print('previous variables fix or init: {0}'.format(d['fix_or_init']), flush=True) # SAVE CURRENT DESIGN print('saving current design...', flush=True) with open('{0}/design_layer{1!s}_finetuning{2!s}.json'.format( d['output_path'], d['current_hidden_layer'], d['current_finetuning_run']), mode='wt', encoding='utf-8', errors='surrogateescape') as fw: json.dump(d, fw, indent=2) # DEFINE REPORTING 
VARIABLES print('defining reporting variables...', flush=True) reporting_steps = sdae_design_functions.create_reporting_steps( d['steps'], d['firstcheckpoint'], d['maxstepspercheckpoint']) valid_losses = np.zeros(reporting_steps.size, dtype='float32') train_losses = np.zeros(reporting_steps.size, dtype='float32') valid_noisy_losses = np.zeros(reporting_steps.size, dtype='float32') train_noisy_losses = np.zeros(reporting_steps.size, dtype='float32') print('reporting steps:', reporting_steps, flush=True) # DEFINE COMPUTATIONAL GRAPH # define placeholders for input data, use None to allow feeding different numbers of examples print('defining placeholders...', flush=True) noise_stdv = tf.placeholder(tf.float32, []) noise_prob = tf.placeholder(tf.float32, []) training_and_validation_data_initializer = tf.placeholder( tf.float32, [ dataset['train'].shape[0] + dataset['valid'].shape[0], dataset['train'].shape[1] ]) selection_mask = tf.placeholder( tf.bool, [dataset['train'].shape[0] + dataset['valid'].shape[0]]) # define variables # W contains the weights, bencode contains the biases for encoding, and bdecode contains the biases for decoding print('defining variables...', flush=True) training_and_validation_data = tf.Variable( training_and_validation_data_initializer, trainable=False, collections=[]) if os.path.exists(d['previous_variables_path']): # update variables (if continuing from a previous training run) print('loading previous variables...', flush=True) global_step, W, bencode, bdecode = update_variables( d['current_dimensions'], initialization_distribution, d['initialization_sigma'], d['previous_variables_path'], d['fix_or_init'], d['include_global_step']) elif d['current_hidden_layer'] == 1 and d['current_finetuning_run'] == 0: # create variables global_step, W, bencode, bdecode = create_variables( d['current_dimensions'], initialization_distribution, d['initialization_sigma']) else: raise ValueError('could not find previous variables') # define model # h contains the activations from input layer to bottleneck layer # hhat contains the activations from bottleneck layer to output layer # xhat is a reference to the output layer (i.e. 
the reconstruction) print('defining model...', flush=True) x = tf.boolean_mask(training_and_validation_data, selection_mask) if d['noise_distribution'] == 'truncnorm': noise = noise_distribution(tf.shape(x), stddev=noise_stdv) else: noise = noise_distribution(tf.shape(x), minval=-noise_stdv, maxval=noise_stdv) noise_mask = tf.to_float(tf.random_uniform(tf.shape(x)) <= noise_prob) xnoisy = apply_noise(x, noise, noise_mask, d['noise_operation']) h, hhat, xhat = create_autoencoder( xnoisy, activation_function['tf'], d['apply_activation_to_output'], d['current_apply_activation_to_embedding'], W, bencode, bdecode) # define loss print('defining loss...', flush=True) loss = tf.reduce_mean(tf.squared_difference(x, xhat)) # squared error loss # define optimizer and training function print('defining optimizer and training function...', flush=True) optimizer = tf.train.AdamOptimizer(learning_rate=d['learning_rate'], epsilon=d['epsilon'], beta1=d['beta1'], beta2=d['beta2']) train_fn = optimizer.minimize(loss, global_step=global_step) # define bottleneck layer preactivation # bottleneck_preactivation = tf.matmul(h[-2], W[-1]) + bencode[-1] # INITIALIZE TENSORFLOW SESSION print('initializing tensorflow session...', flush=True) init = tf.global_variables_initializer() session_config = configure_session(d['processor'], d['gpu_memory_fraction']) with tf.Session(config=session_config) as sess: sess.run(init) # TRAINING print('training...', flush=True) sess.run(training_and_validation_data.initializer, feed_dict={ training_and_validation_data_initializer: np.append(dataset['train'].matrix, dataset['valid'].matrix, 0) }) validation_id = -1 batch_and_validation_ids = np.full(dataset['train'].shape[0] + dataset['valid'].shape[0], validation_id, dtype=batch_ids.dtype) is_train = np.append(np.ones(dataset['train'].shape[0], dtype='bool'), np.zeros(dataset['valid'].shape[0], dtype='bool')) is_valid = ~is_train training_step = 0 i = 0 overfitting_score = 0 stopearly = False starttime = time.time() with open('{0}/log_layer{1!s}_finetuning{2!s}.txt'.format( d['output_path'], d['current_hidden_layer'], d['current_finetuning_run']), mode='wt', buffering=1) as fl: fl.write('\t'.join([ 'step', 'train_loss', 'valid_loss', 'train_noisy_loss', 'valid_noisy_loss', 'time' ]) + '\n') for epoch in range(d['current_epochs']): if stopearly: break # randomize assignment of training examples to batches np.random.shuffle(batch_ids) batch_and_validation_ids[is_train] = batch_ids for batch in range(d['batches']): training_step += 1 # select mini-batch selected = batch_and_validation_ids == batch # update weights sess.run(train_fn, feed_dict={ selection_mask: selected, noise_prob: d['noise_probability'], noise_stdv: d['noise_sigma'] }) # record training and validation errors if training_step == reporting_steps[i]: train_losses[i] = sess.run(loss, feed_dict={ selection_mask: is_train, noise_prob: 0, noise_stdv: 0 }) train_noisy_losses[i] = sess.run( loss, feed_dict={ selection_mask: is_train, noise_prob: d['noise_probability'], noise_stdv: d['noise_sigma'] }) valid_losses[i] = sess.run(loss, feed_dict={ selection_mask: is_valid, noise_prob: 0, noise_stdv: 0 }) valid_noisy_losses[i] = sess.run( loss, feed_dict={ selection_mask: is_valid, noise_prob: d['noise_probability'], noise_stdv: d['noise_sigma'] }) print( 'step:{0:1.6g}, train loss:{1:1.3g}, valid loss:{2:1.3g}, train noisy loss:{3:1.3g},valid noisy loss:{4:1.3g}, time:{5:1.6g}' .format(reporting_steps[i], train_losses[i], valid_losses[i], train_noisy_losses[i], 
valid_noisy_losses[i], time.time() - starttime), flush=True) fl.write('\t'.join([ '{0:1.6g}'.format(x) for x in [ reporting_steps[i], train_losses[i], valid_losses[i], train_noisy_losses[i], valid_noisy_losses[i], time.time() - starttime ] ]) + '\n') # save current weights, reconstructions, and projections if training_step >= d[ 'startsavingstep'] or training_step == reporting_steps[ -1]: with open( '{0}/intermediate_variables_layer{1!s}_finetuning{2!s}_step{3!s}.pickle' .format(d['output_path'], d['current_hidden_layer'], d['current_finetuning_run'], training_step), 'wb') as fw: pickle.dump( (sess.run(global_step), sess.run(W), sess.run(bencode), sess.run(bdecode)), fw) # stop early if overfitting if valid_losses[i] >= 1.01 * (np.insert( valid_losses[:i], 0, np.inf).min()): overfitting_score += 1 else: overfitting_score = 0 if overfitting_score == d['overfitting_score_max']: stopearly = True print('stopping early!', flush=True) break i += 1 # end tensorflow session print('closing tensorflow session...', flush=True) # ROLL BACK IF OVERFITTING if stopearly: print('rolling back...', flush=True) reporting_steps = reporting_steps[:i + 1] train_losses = train_losses[:i + 1] valid_losses = valid_losses[:i + 1] train_noisy_losses = train_noisy_losses[:i + 1] valid_noisy_losses = valid_noisy_losses[:i + 1] # selected_step = max([reporting_steps[i-d['overfitting_score_max']], d['startsavingstep']]) else: print('completed all training steps...', flush=True) # selected_step = reporting_steps[-1] selected_step = min([ max([reporting_steps[np.argmin(valid_losses)], d['startsavingstep']]), reporting_steps[-1] ]) print('selected step:{0}...'.format(selected_step), flush=True) # SAVE RESULTS print('saving results...', flush=True) with open( '{0}/optimization_path_layer{1!s}_finetuning{2!s}.pickle'.format( d['output_path'], d['current_hidden_layer'], d['current_finetuning_run']), 'wb') as fw: pickle.dump( { 'reporting_steps': reporting_steps, 'valid_losses': valid_losses, 'train_losses': train_losses, 'valid_noisy_losses': valid_noisy_losses, 'train_noisy_losses': train_noisy_losses }, fw) if d['current_dimensions'] == d['all_dimensions'] and ( not d['use_finetuning'] or d['current_finetuning_run'] > 0): shutil.copyfile( '{0}/intermediate_variables_layer{1!s}_finetuning{2!s}_step{3!s}.pickle' .format(d['output_path'], d['current_hidden_layer'], d['current_finetuning_run'], selected_step), '{0}/variables_layer{1!s}_finetuning{2!s}.pickle'.format( d['output_path'], d['current_hidden_layer'], d['current_finetuning_run'])) else: shutil.move( '{0}/intermediate_variables_layer{1!s}_finetuning{2!s}_step{3!s}.pickle' .format(d['output_path'], d['current_hidden_layer'], d['current_finetuning_run'], selected_step), '{0}/variables_layer{1!s}_finetuning{2!s}.pickle'.format( d['output_path'], d['current_hidden_layer'], d['current_finetuning_run'])) with open( '{0}/variables_layer{1!s}_finetuning{2!s}.pickle'.format( d['output_path'], d['current_hidden_layer'], d['current_finetuning_run']), 'rb') as fr: W, Be, Bd = pickle.load(fr)[1:] # global_step, W, bencode, bdecode recon = {} embed = {} error = {} embed_preactivation = {} for partition in partitions: recon[partition], embed[partition], error[ partition] = sdae_apply_functions.encode_and_decode( dataset[partition], W, Be, Bd, activation_function['np'], d['current_apply_activation_to_embedding'], d['apply_activation_to_output'], return_embedding=True, return_reconstruction_error=True) embed_preactivation[partition] = sdae_apply_functions.encode( dataset[partition], W, 
Be, activation_function['np'], apply_activation_to_embedding=False) print('{0} reconstruction error: {1:1.3g}'.format( partition, error[partition]), flush=True) if d['current_dimensions'] == d['all_dimensions'] and ( not d['use_finetuning'] or d['current_finetuning_run'] > 0): datasetIO.save_datamatrix( '{0}/{1}_embedding_layer{2!s}_finetuning{3!s}.pickle'.format( d['output_path'], partition, d['current_hidden_layer'], d['current_finetuning_run']), embed[partition]) datasetIO.save_datamatrix( '{0}/{1}_embedding_layer{2!s}_finetuning{3!s}.txt.gz'.format( d['output_path'], partition, d['current_hidden_layer'], d['current_finetuning_run']), embed[partition]) if d['current_apply_activation_to_embedding']: datasetIO.save_datamatrix( '{0}/{1}_embedding_preactivation_layer{2!s}_finetuning{3!s}.pickle' .format(d['output_path'], partition, d['current_hidden_layer'], d['current_finetuning_run']), embed_preactivation[partition]) datasetIO.save_datamatrix( '{0}/{1}_embedding_preactivation_layer{2!s}_finetuning{3!s}.txt.gz' .format(d['output_path'], partition, d['current_hidden_layer'], d['current_finetuning_run']), embed_preactivation[partition]) # PLOT LOSS print('plotting loss...', flush=True) fg, ax = plt.subplots(1, 1, figsize=(3.25, 2.25)) ax.set_position([0.55 / 3.25, 0.45 / 2.25, 2.6 / 3.25, 1.7 / 2.25]) ax.semilogx(reporting_steps, train_losses, ':r', linewidth=1, label='train') ax.semilogx(reporting_steps, valid_losses, '-g', linewidth=1, label='valid') ax.semilogx(reporting_steps, train_noisy_losses, '--b', linewidth=1, label='train,noisy') ax.semilogx(reporting_steps, valid_noisy_losses, '-.k', linewidth=1, label='valid,noisy') ax.legend(loc='best', fontsize=8) ax.set_ylabel('loss', fontsize=8) ax.set_xlabel('steps (selected step:{0!s})'.format(selected_step), fontsize=8) ax.set_xlim(reporting_steps[0] - 1, reporting_steps[-1] + 1) # ax.set_ylim(0, 1) ax.tick_params(axis='both', which='major', left='on', right='on', bottom='on', top='off', labelleft='on', labelright='off', labelbottom='on', labeltop='off', labelsize=8) fg.savefig('{0}/optimization_path_layer{1!s}_finetuning{2!s}.png'.format( d['output_path'], d['current_hidden_layer'], d['current_finetuning_run']), transparent=True, pad_inches=0, dpi=600) plt.close() # PLOT RECONSTRUCTIONS print('plotting reconstructions...', flush=True) x_valid = dataset['valid'].matrix[:d['reconstruction_rows'] * d['reconstruction_cols'], :] xr_valid = recon['valid'].matrix[:d['reconstruction_rows'] * d['reconstruction_cols'], :] if x_valid.shape[1] > 1000: x_valid = x_valid[:, :1000] xr_valid = xr_valid[:, :1000] lb = np.append(x_valid, xr_valid, 1).min(1) ub = np.append(x_valid, xr_valid, 1).max(1) fg, axs = plt.subplots(d['reconstruction_rows'], d['reconstruction_cols'], figsize=(6.5, 3.25)) for i, ax in enumerate(axs.reshape(-1)): ax.plot(x_valid[i, :], xr_valid[i, :], 'ok', markersize=0.5, markeredgewidth=0) ax.set_ylim(lb[i], ub[i]) ax.set_xlim(lb[i], ub[i]) ax.tick_params(axis='both', which='major', left='off', right='off', bottom='off', top='off', labelleft='off', labelright='off', labelbottom='off', labeltop='off', pad=4) ax.set_frame_on(False) ax.axvline(lb[i], linewidth=1, color='k') ax.axvline(ub[i], linewidth=1, color='k') ax.axhline(lb[i], linewidth=1, color='k') ax.axhline(ub[i], linewidth=1, color='k') fg.savefig('{0}/reconstructions_layer{1!s}_finetuning{2!s}.png'.format( d['output_path'], d['current_hidden_layer'], d['current_finetuning_run']), transparent=True, pad_inches=0, dpi=1200) plt.close() # PLOT 2D EMBEDDING if 
d['current_dimensions'][-1] == 2 and (not d['use_finetuning'] or d['current_finetuning_run'] > 0): print('plotting 2d embedding...', flush=True) fg, ax = plt.subplots(1, 1, figsize=(6.5, 6.5)) ax.set_position([0.15 / 6.5, 0.15 / 6.5, 6.2 / 6.5, 6.2 / 6.5]) ax.plot(embed['train'].matrix[:, 0], embed['train'].matrix[:, 1], 'ok', markersize=2, markeredgewidth=0, alpha=0.5, zorder=0) ax.plot(embed['valid'].matrix[:, 0], embed['valid'].matrix[:, 1], 'or', markersize=2, markeredgewidth=0, alpha=1.0, zorder=1) ax.tick_params(axis='both', which='major', bottom='off', top='off', labelbottom='off', labeltop='off', left='off', right='off', labelleft='off', labelright='off', pad=4) ax.set_frame_on(False) fg.savefig('{0}/embedding_layer{1!s}_finetuning{2!s}.png'.format( d['output_path'], d['current_hidden_layer'], d['current_finetuning_run']), transparent=True, pad_inches=0, dpi=600) plt.close() if d['current_apply_activation_to_embedding']: fg, ax = plt.subplots(1, 1, figsize=(6.5, 6.5)) ax.set_position([0.15 / 6.5, 0.15 / 6.5, 6.2 / 6.5, 6.2 / 6.5]) ax.plot(embed_preactivation['train'].matrix[:, 0], embed_preactivation['train'].matrix[:, 1], 'ok', markersize=2, markeredgewidth=0, alpha=0.5, zorder=0) ax.plot(embed_preactivation['valid'].matrix[:, 0], embed_preactivation['valid'].matrix[:, 1], 'or', markersize=2, markeredgewidth=0, alpha=1.0, zorder=1) ax.tick_params(axis='both', which='major', bottom='off', top='off', labelbottom='off', labeltop='off', left='off', right='off', labelleft='off', labelright='off', pad=4) ax.set_frame_on(False) fg.savefig( '{0}/embedding_preactivation_layer{1!s}_finetuning{2!s}.png'. format(d['output_path'], d['current_hidden_layer'], d['current_finetuning_run']), transparent=True, pad_inches=0, dpi=600) plt.close() print('done training phase.', flush=True) return d['current_hidden_layer'], d['current_finetuning_run'], d[ 'current_epochs']
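# --- Illustrative sketch (not part of the training script): the early stopping rule used
# above, isolated as a function. The overfitting score increments whenever the current
# validation loss is at least 1% worse than the best loss seen so far (np.inf is inserted
# so the first checkpoint never trips the rule), resets otherwise, and training stops when
# the score reaches the configured maximum.
import numpy as np

def overfitting_update(valid_losses, i, score, score_max):
    best_so_far = np.insert(valid_losses[:i], 0, np.inf).min()
    score = score + 1 if valid_losses[i] >= 1.01*best_so_far else 0
    return score, score == score_max  # new score, stop-early flag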
def main(reconstructions_path):

    # read reconstructions
    print('reading reconstructions...', flush=True)
    designpath_selectedstep = []
    with open(reconstructions_path, mode='rt', encoding='utf-8', errors='surrogateescape') as fr:
        for line in fr:
            if line.strip():  # skip blank lines so a trailing newline can't produce an empty entry
                designpath_selectedstep.append([x.strip() for x in line.split('\t')])
    print('found {0!s} reconstructions...'.format(len(designpath_selectedstep)), flush=True)

    # evaluate reconstructions
    print('evaluating reconstructions...', flush=True)
    for didx, (design_path, selected_step) in enumerate(designpath_selectedstep):
        print('working on {0}...'.format(design_path), flush=True)
        print('selected step:{0!s}...'.format(selected_step), flush=True)

        # load design
        print('loading design...', flush=True)
        with open(design_path, mode='rt', encoding='utf-8', errors='surrogateescape') as fr:
            d = json.load(fr)
        if 'apply_activation_to_embedding' not in d:  # for legacy code
            d['apply_activation_to_embedding'] = True
        if 'use_batchnorm' not in d:  # for legacy code
            d['use_batchnorm'] = False
        if 'skip_layerwise_training' not in d:  # for legacy code
            d['skip_layerwise_training'] = False
        phase = d['training_schedule'][-1]
        d['current_hidden_layer'] = phase['hidden_layer']
        d['current_finetuning_run'] = phase['finetuning_run']
        d['current_epochs'] = phase['epochs']

        # load data
        if didx == 0:
            print('loading data...', flush=True)
            partitions = ['train', 'valid', 'test']
            dataset = {}
            for partition in partitions:
                dataset[partition] = datasetIO.load_datamatrix('{0}/{1}.pickle'.format(d['input_path'], partition))
                # if 'all' not in dataset:
                #     dataset['all'] = copy.deepcopy(dataset[partition])
                # else:
                #     dataset['all'].append(dataset[partition], 0)

            # get parameters for marginal distributions
            # will sample from marginal distributions to impute missing values
            # for binary features, model as bernoulli (columnmeta['likelihood'] == 'bernoulli')
            # for other features, model as gaussian
            marginalprobabilities = (1 + np.nansum(dataset['train'].matrix, 0, keepdims=True)) \
                / (2 + np.sum(~np.isnan(dataset['train'].matrix), 0, keepdims=True))  # posterior mean of beta-bernoulli with prior a=b=1
            marginalstdvs = np.nanstd(dataset['train'].matrix, 0, keepdims=True)
            isbernoullimarginal = (dataset['train'].columnmeta['likelihood'] == 'bernoulli').astype('float64').reshape(1, -1)

        # load reconstructions and evaluate performance
        recon = {}
        stat_cut = {}
        for partition in ['valid', 'test']:  # partitions:
            print('working on partition {0}...'.format(partition), flush=True)

            print('loading reconstructions...', flush=True)
            recon[partition] = datasetIO.load_datamatrix('{0}/{1}_intermediate_reconstructions_layer{2!s}_finetuning{3!s}_step{4!s}.pickle'.format(d['output_path'], partition, d['current_hidden_layer'], d['current_finetuning_run'], selected_step))
            hitmat = np.logical_and(isbernoullimarginal.astype('bool'), np.isfinite(dataset[partition].matrix))

            print('evaluating performance...', flush=True)
            # stat_cut[partition] = modelevaluation.get_classifier_performance_stats(dataset[partition].matrix[hitmat].astype('bool'), recon[partition].matrix[hitmat], uP=1000, classifier_stats='all', plot_curves=False, get_priority_cutoffs=True, pp_min_frac=0.1, xx_min_frac=0.01)
            # print('saving performance...', flush=True)
            # datasetIO.save_datamatrix('{0}/{1}_intermediate_performance_layer{2!s}_finetuning{3!s}_step{4!s}.pickle'.format(d['output_path'], partition, d['current_hidden_layer'], d['current_finetuning_run'], selected_step), stat_cut[partition])
            # datasetIO.save_datamatrix('{0}/{1}_intermediate_performance_layer{2!s}_finetuning{3!s}_step{4!s}.txt.gz'.format(d['output_path'], partition, d['current_hidden_layer'], d['current_finetuning_run'], selected_step), stat_cut[partition])

            # per-row performance: one AUROC/AUPRC/AP per row, over that row's observed bernoulli entries
            with gzip.open('{0}/{1}_intermediate_performance_per_row_layer{2!s}_finetuning{3!s}_step{4!s}.txt.gz'.format(d['output_path'], partition, d['current_hidden_layer'], d['current_finetuning_run'], selected_step), 'wt') as fw:
                fw.write('\t'.join(['label', 'auroc', 'auprc', 'ap']) + '\n')
                for hm, dm, rm, label in zip(hitmat, dataset[partition].matrix, recon[partition].matrix, dataset[partition].rowlabels):
                    if hm.any():
                        sc = modelevaluation.get_classifier_performance_stats(dm[hm].astype('bool'), rm[hm], uP=5000, classifier_stats='all', plot_curves=False, get_priority_cutoffs=True, pp_min_frac=0.1, xx_min_frac=0.01)
                        fw.write('\t'.join([label] + ['{0:1.5g}'.format(x) for x in [sc.select('auroc', [])[0], sc.select('auprc', [])[0], sc.select('ap', [])[0]]]) + '\n')

            # per-column performance: same statistics computed over the transposed matrices
            with gzip.open('{0}/{1}_intermediate_performance_per_col_layer{2!s}_finetuning{3!s}_step{4!s}.txt.gz'.format(d['output_path'], partition, d['current_hidden_layer'], d['current_finetuning_run'], selected_step), 'wt') as fw:
                fw.write('\t'.join(['label', 'auroc', 'auprc', 'ap']) + '\n')
                for hm, dm, rm, label in zip(hitmat.T, dataset[partition].matrix.T, recon[partition].matrix.T, dataset[partition].columnlabels):
                    if hm.any():
                        sc = modelevaluation.get_classifier_performance_stats(dm[hm].astype('bool'), rm[hm], uP=5000, classifier_stats='all', plot_curves=False, get_priority_cutoffs=True, pp_min_frac=0.1, xx_min_frac=0.01)
                        fw.write('\t'.join([label] + ['{0:1.5g}'.format(x) for x in [sc.select('auroc', [])[0], sc.select('auprc', [])[0], sc.select('ap', [])[0]]]) + '\n')

            # auroc = []
            # auprc = []
            # for h, dcol, rcol in zip(hitmat.T, dataset[partition].matrix.T, recon[partition].matrix.T):
            #     if h.any():
            #         sc = modelevaluation.get_classifier_performance_stats(dcol[h].astype('bool'), rcol[h], uP=1000, classifier_stats='all', plot_curves=False, get_priority_cutoffs=True, pp_min_frac=0.1, xx_min_frac=0.01)
            #         auroc.append(sc.select('auroc', [])[0])
            #         auprc.append(sc.select('auprc', [])[0])

    print('done evaluate_tsdae_reconstructions.', flush=True)
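# The per-row and per-column statistics above come from the in-house
# modelevaluation.get_classifier_performance_stats. A small sanity-check sketch
# cross-checking one row with scikit-learn, assuming 'auroc' and 'ap' follow the
# standard definitions (an assumption about the in-house module, not something
# these scripts state):

import numpy as np
from sklearn.metrics import roc_auc_score, average_precision_score

def crosscheck_row(targets_bool, scores):
    # targets_bool: 1d boolean array of observed binary labels for one row
    # scores: 1d float array of reconstructed values for the same entries
    return roc_auc_score(targets_bool, scores), average_precision_score(targets_bool, scores)

# usage sketch, mirroring the per-row loop above:
# auroc, ap = crosscheck_row(dm[hm].astype('bool'), rm[hm])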
def main(visualizations_path):

    # read visualizations
    print('reading visualizations...', flush=True)
    designpath_selectedstep = {}
    with open(visualizations_path, mode='rt', encoding='utf-8', errors='surrogateescape') as fr:
        for line in fr:
            design_path, selected_step = [x.strip() for x in line.split('\t')]
            designpath_selectedstep[design_path] = int(selected_step)
    print('found {0!s} visualizations...'.format(len(designpath_selectedstep)), flush=True)

    # make visualizations
    print('making visualizations...', flush=True)
    for didx, (design_path, selected_step) in enumerate(designpath_selectedstep.items()):
        print('working on {0}...'.format(design_path), flush=True)
        print('selected step:{0!s}...'.format(selected_step), flush=True)

        # load design
        print('loading design...', flush=True)
        with open(design_path, mode='rt', encoding='utf-8', errors='surrogateescape') as fr:
            d = json.load(fr)
        if 'apply_activation_to_embedding' not in d:  # for legacy code
            d['apply_activation_to_embedding'] = True
        if 'use_batchnorm' not in d:  # for legacy code
            d['use_batchnorm'] = False
        if 'skip_layerwise_training' not in d:  # for legacy code
            d['skip_layerwise_training'] = False
        phase = d['training_schedule'][-1]
        d['current_hidden_layer'] = phase['hidden_layer']
        d['current_finetuning_run'] = phase['finetuning_run']
        d['current_epochs'] = phase['epochs']

        # load data
        if didx == 0:
            print('loading data...', flush=True)
            partitions = ['train', 'valid', 'test']
            dataset = {}
            for partition in partitions:
                dataset[partition] = datasetIO.load_datamatrix('{0}/{1}.pickle'.format(d['input_path'], partition))

        # finish configuration
        print('finishing configuration...', flush=True)

        # specify activation function
        if d['activation_function'] == 'tanh':
            activation_function = {'np': sdae_apply_functions.tanh}
        elif d['activation_function'] == 'relu':
            activation_function = {'np': sdae_apply_functions.relu}
        elif d['activation_function'] == 'elu':
            activation_function = {'np': sdae_apply_functions.elu}
        elif d['activation_function'] == 'sigmoid':
            activation_function = {'np': sdae_apply_functions.sigmoid}
        else:
            # fail fast on an unrecognized design instead of a NameError further down
            raise ValueError('unrecognized activation_function: {0}'.format(d['activation_function']))

        # initialize model architecture (number of layers and dimension of each layer)
        d['current_dimensions'] = d['all_dimensions'][:d['current_hidden_layer'] + 1]  # dimensions of model up to current depth

        # specify embedding function for current training phase
        # we want the option of skipping the embedding activation function to apply only to the full model
        # if not d['apply_activation_to_embedding'] and d['current_dimensions'] == d['all_dimensions']:
        #     d['current_apply_activation_to_embedding'] = False
        # else:
        #     d['current_apply_activation_to_embedding'] = True
        if d['current_dimensions'] == d['all_dimensions']:
            if d['apply_activation_to_embedding']:
                d['current_apply_activation_to_embedding'] = True
                use_softmax = True
            else:
                d['current_apply_activation_to_embedding'] = False
                use_softmax = False
        else:
            d['current_apply_activation_to_embedding'] = True
            use_softmax = False
        print('current_apply_activation_to_embedding: {0!s}'.format(d['current_apply_activation_to_embedding']), flush=True)
        print('use_softmax: {0!s}'.format(use_softmax), flush=True)

        # specify rows and columns of figure showing data reconstructions
        d['reconstruction_rows'] = int(np.round(np.sqrt(np.min([100, dataset['valid'].shape[0]]) / 2)))
        d['reconstruction_cols'] = 2 * d['reconstruction_rows']

        # load model variables
        print('loading model variables...', flush=True)
        with open('{0}/intermediate_variables_layer{1!s}_finetuning{2!s}_step{3!s}.pickle'.format(d['output_path'], d['current_hidden_layer'], d['current_finetuning_run'], selected_step), 'rb') as fr:
            W, Be, Bd = pickle.load(fr)[1:]  # global_step, W, bencode, bdecode
        if d['use_batchnorm']:
            with open('{0}/intermediate_batchnorm_variables_layer{1!s}_finetuning{2!s}_step{3!s}.pickle'.format(d['output_path'], d['current_hidden_layer'], d['current_finetuning_run'], selected_step), 'rb') as fr:
                batchnorm_variables = pickle.load(fr)  # gammas, betas, moving_means, moving_variances
            batchnorm_encode_variables, batchnorm_decode_variables = sdae_apply_functions.align_batchnorm_variables(batchnorm_variables, d['current_apply_activation_to_embedding'], d['apply_activation_to_output'])

        # compute embedding and reconstruction
        print('computing embedding and reconstruction...', flush=True)
        recon = {}
        embed = {}
        error = {}
        embed_preactivation = {}
        for partition in partitions:
            if d['use_batchnorm']:
                recon[partition], embed[partition], error[partition] = sdae_apply_functions.encode_and_decode(dataset[partition], W, Be, Bd, activation_function['np'], d['current_apply_activation_to_embedding'], use_softmax, d['apply_activation_to_output'], return_embedding=True, return_reconstruction_error=True, bn_encode_variables=batchnorm_encode_variables, bn_decode_variables=batchnorm_decode_variables)
                embed_preactivation[partition] = sdae_apply_functions.encode(dataset[partition], W, Be, activation_function['np'], apply_activation_to_embedding=False, use_softmax=use_softmax, bn_variables=batchnorm_encode_variables)
            else:
                recon[partition], embed[partition], error[partition] = sdae_apply_functions.encode_and_decode(dataset[partition], W, Be, Bd, activation_function['np'], d['current_apply_activation_to_embedding'], use_softmax, d['apply_activation_to_output'], return_embedding=True, return_reconstruction_error=True)
                embed_preactivation[partition] = sdae_apply_functions.encode(dataset[partition], W, Be, activation_function['np'], apply_activation_to_embedding=False, use_softmax=use_softmax)

            print('{0} reconstruction error: {1:1.3g}'.format(partition, error[partition]), flush=True)

            datasetIO.save_datamatrix('{0}/{1}_intermediate_embedding_layer{2!s}_finetuning{3!s}_step{4!s}.pickle'.format(d['output_path'], partition, d['current_hidden_layer'], d['current_finetuning_run'], selected_step), embed[partition])
            datasetIO.save_datamatrix('{0}/{1}_intermediate_embedding_layer{2!s}_finetuning{3!s}_step{4!s}.txt.gz'.format(d['output_path'], partition, d['current_hidden_layer'], d['current_finetuning_run'], selected_step), embed[partition])
            if d['current_apply_activation_to_embedding']:
                datasetIO.save_datamatrix('{0}/{1}_intermediate_embedding_preactivation_layer{2!s}_finetuning{3!s}_step{4!s}.pickle'.format(d['output_path'], partition, d['current_hidden_layer'], d['current_finetuning_run'], selected_step), embed_preactivation[partition])
                datasetIO.save_datamatrix('{0}/{1}_intermediate_embedding_preactivation_layer{2!s}_finetuning{3!s}_step{4!s}.txt.gz'.format(d['output_path'], partition, d['current_hidden_layer'], d['current_finetuning_run'], selected_step), embed_preactivation[partition])

        # plot reconstructions
        print('plotting reconstructions...', flush=True)
        num_recons = min([d['reconstruction_rows'] * d['reconstruction_cols'], dataset['valid'].shape[0]])
        x_valid = dataset['valid'].matrix[:num_recons, :]
        xr_valid = recon['valid'].matrix[:num_recons, :]
        if x_valid.shape[1] > 1000:
            x_valid = x_valid[:, :1000]
            xr_valid = xr_valid[:, :1000]
        lb = np.append(x_valid, xr_valid, 1).min(1)
        ub = np.append(x_valid, xr_valid, 1).max(1)
        fg, axs = plt.subplots(d['reconstruction_rows'], d['reconstruction_cols'], figsize=(6.5, 3.25))
        for i, ax in enumerate(axs.reshape(-1)):
            if i < num_recons:
                ax.plot(x_valid[i, :], xr_valid[i, :], 'ok', markersize=0.5, markeredgewidth=0)
                ax.set_ylim(lb[i], ub[i])
                ax.set_xlim(lb[i], ub[i])
                ax.tick_params(axis='both', which='major', left=False, right=False, bottom=False, top=False,
                               labelleft=False, labelright=False, labelbottom=False, labeltop=False, pad=4)
                ax.set_frame_on(False)
                ax.axvline(lb[i], linewidth=1, color='k')
                ax.axvline(ub[i], linewidth=1, color='k')
                ax.axhline(lb[i], linewidth=1, color='k')
                ax.axhline(ub[i], linewidth=1, color='k')
            else:
                fg.delaxes(ax)
        fg.savefig('{0}/intermediate_reconstructions_layer{1!s}_finetuning{2!s}_step{3!s}.png'.format(d['output_path'], d['current_hidden_layer'], d['current_finetuning_run'], selected_step), transparent=True, pad_inches=0, dpi=1200)
        plt.close()

        # plot 2d embedding
        if d['current_dimensions'][-1] == 2:
            print('plotting 2d embedding...', flush=True)
            fg, ax = plt.subplots(1, 1, figsize=(6.5, 6.5))
            ax.set_position([0.15/6.5, 0.15/6.5, 6.2/6.5, 6.2/6.5])
            ax.plot(embed['train'].matrix[:, 0], embed['train'].matrix[:, 1], 'ok', markersize=2, markeredgewidth=0, alpha=0.5, zorder=0)
            ax.plot(embed['valid'].matrix[:, 0], embed['valid'].matrix[:, 1], 'or', markersize=2, markeredgewidth=0, alpha=1.0, zorder=1)
            ax.tick_params(axis='both', which='major', bottom=False, top=False, labelbottom=False, labeltop=False,
                           left=False, right=False, labelleft=False, labelright=False, pad=4)
            ax.set_frame_on(False)
            fg.savefig('{0}/intermediate_embedding_layer{1!s}_finetuning{2!s}_step{3!s}.png'.format(d['output_path'], d['current_hidden_layer'], d['current_finetuning_run'], selected_step), transparent=True, pad_inches=0, dpi=600)
            plt.close()

            if d['current_apply_activation_to_embedding']:
                fg, ax = plt.subplots(1, 1, figsize=(6.5, 6.5))
                ax.set_position([0.15/6.5, 0.15/6.5, 6.2/6.5, 6.2/6.5])
                ax.plot(embed_preactivation['train'].matrix[:, 0], embed_preactivation['train'].matrix[:, 1], 'ok', markersize=2, markeredgewidth=0, alpha=0.5, zorder=0)
                ax.plot(embed_preactivation['valid'].matrix[:, 0], embed_preactivation['valid'].matrix[:, 1], 'or', markersize=2, markeredgewidth=0, alpha=1.0, zorder=1)
                ax.tick_params(axis='both', which='major', bottom=False, top=False, labelbottom=False, labeltop=False,
                               left=False, right=False, labelleft=False, labelright=False, pad=4)
                ax.set_frame_on(False)
                fg.savefig('{0}/intermediate_embedding_preactivation_layer{1!s}_finetuning{2!s}_step{3!s}.png'.format(d['output_path'], d['current_hidden_layer'], d['current_finetuning_run'], selected_step), transparent=True, pad_inches=0, dpi=600)
                plt.close()

        # plot heatmap
        else:
            print('plotting embedding heatmap...', flush=True)
            for partition in partitions:
                if 'all' not in embed:
                    embed['all'] = copy.deepcopy(embed[partition])
                else:
                    embed['all'].append(embed[partition], 0)
            embed['all'].cluster('all', 'cosine', 'average')
            embed['all'].heatmap(rowmetalabels=[], columnmetalabels=[], normalize=False, standardize=False, normalizebeforestandardize=True, cmap_name='bwr', ub=None, lb=None, savefilename='{0}/intermediate_embedding_heatmap_layer{1!s}_finetuning{2!s}_step{3!s}.png'.format(d['output_path'], d['current_hidden_layer'], d['current_finetuning_run'], selected_step), closefigure=True, dpi=300)

            if d['current_apply_activation_to_embedding']:
                for partition in partitions:
                    if 'all' not in embed_preactivation:
                        embed_preactivation['all'] = copy.deepcopy(embed_preactivation[partition])
                    else:
                        embed_preactivation['all'].append(embed_preactivation[partition], 0)
                embed_preactivation['all'].cluster('all', 'cosine', 'average')
                embed_preactivation['all'].heatmap(rowmetalabels=[], columnmetalabels=[], normalize=False, standardize=False, normalizebeforestandardize=True, cmap_name='bwr', ub=None, lb=None, savefilename='{0}/intermediate_embedding_preactivation_heatmap_layer{1!s}_finetuning{2!s}_step{3!s}.png'.format(d['output_path'], d['current_hidden_layer'], d['current_finetuning_run'], selected_step), closefigure=True, dpi=300)

    print('done get_sdae_features.', flush=True)
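# sdae_apply_functions.encode_and_decode consumes the (W, Be, Bd) triple loaded
# from the intermediate-variables pickle above. A self-contained numpy sketch of
# a tied-weight stacked-autoencoder forward pass with that parameterization; the
# tied decoder weights (W[k].T) and the mean-squared reconstruction error are
# assumptions for illustration, not a statement of what the project's function
# actually does:

import numpy as np

def sketch_encode_and_decode(X, W, Be, Bd, activation=np.tanh, apply_activation_to_embedding=True):
    # encode: affine map plus activation at each layer, optionally skipping the
    # activation at the embedding layer (as 'current_apply_activation_to_embedding' controls above)
    H = X
    for k, (Wk, bk) in enumerate(zip(W, Be)):
        H = H @ Wk + bk
        if k < len(W) - 1 or apply_activation_to_embedding:
            H = activation(H)
    embedding = H
    # decode: mirror the encoder in reverse, assuming tied weights
    for k in range(len(W) - 1, -1, -1):
        H = H @ W[k].T + Bd[k]
        if k > 0:
            H = activation(H)
    reconstruction = H
    error = np.mean((X - reconstruction)**2)
    return reconstruction, embedding, error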
def main(model_folders_path):

    print('reading list of model folders...', flush=True)
    with open(model_folders_path, mode='rt', encoding='utf-8', errors='surrogateescape') as fr:
        # drop blank lines so a trailing newline can't yield an empty folder path
        model_folders = [x for x in fr.read().split('\n') if x.strip()]
    # if '_v' in model_folders_path:
    #     version = model_folders_path.replace('.txt', '').split('_')[-1]

    print('loading input datamatrix...', flush=True)
    model_folder_parts = model_folders[0].split('/')
    dataset_name = model_folder_parts[model_folder_parts.index('hp_search') + 1]
    observed_ = datasetIO.load_datamatrix('../../input_data/{0}/datamatrix.pickle'.format(dataset_name))
    print(observed_, flush=True)

    print('attaching hla types...', flush=True)
    columnlabel_idx = {l: i for i, l in enumerate(observed_.columnlabels)}
    hla_types_df = pd.read_csv('../../original_data/1000genomes/20140702_hla_diversity.csv', index_col=False)
    for metalabel in hla_types_df.columns.values[1:]:
        observed_.columnmeta[metalabel] = np.full(observed_.shape[1], 'NA', dtype='object')
        for columnlabel, value in zip(hla_types_df['id'].values, hla_types_df[metalabel].values):
            if columnlabel in columnlabel_idx:
                columnidx = columnlabel_idx[columnlabel]
                observed_.columnmeta[metalabel][columnidx] = value
        uvals, counts = np.unique(observed_.columnmeta[metalabel], return_counts=True)
        max_num_uvals = 25
        if uvals.size > max_num_uvals:
            # keep only the most frequent categories; lump the rest into 'NA'
            si = np.argsort(counts)[::-1]
            low_freq_uvals = uvals[si[max_num_uvals:]]
            observed_.columnmeta[metalabel][np.in1d(observed_.columnmeta[metalabel], low_freq_uvals)] = 'NA'

    for model_folder in model_folders:
        print('working on model_folder: {0}...'.format(model_folder), flush=True)
        input_path = '{0}/embedding.csv.gz'.format(model_folder)
        output_folder = '/'.join(model_folder.replace('/hp_search/', '/output_data/').split('/')[:-1]) + '/embeddings'
        if not os.path.exists(output_folder):
            os.makedirs(output_folder)
        output_path_prefix = '{0}/{1}'.format(output_folder, model_folder.split('/')[-1])
        print('input_path: {0}'.format(input_path), flush=True)
        print('output_folder: {0}'.format(output_folder), flush=True)
        print('output_path_prefix: {0}'.format(output_path_prefix), flush=True)

        if os.path.exists(input_path):
            print('loading embedding datamatrix...', flush=True)
            df = pd.read_csv(input_path, index_col=False, usecols=[observed_.rowname, 'Latent1', 'Latent2'])
            hidden = dc.datamatrix(rowname=observed_.rowname,
                                   rowlabels=df[observed_.rowname].values,
                                   rowmeta={},
                                   columnname='latent_component',
                                   columnlabels=np.array(['Latent1', 'Latent2'], dtype='object'),
                                   columnmeta={},
                                   matrixname=observed_.rowname + '_embedding_from_' + observed_.matrixname,
                                   matrix=np.concatenate((df.Latent1.values.reshape(-1, 1), df.Latent2.values.reshape(-1, 1)), 1))
            del df
            print(hidden, flush=True)

            print('aligning input datamatrix and embedding datamatrix...', flush=True)
            if observed_.shape[0] == hidden.shape[0] and (observed_.rowlabels == hidden.rowlabels).all():
                observed = copy.deepcopy(observed_)
            else:
                observed = observed_.tolabels(rowlabels=hidden.rowlabels.copy())
            hidden.rowmeta = copy.deepcopy(observed.rowmeta)
            print(observed, flush=True)

            # visualization
            print('plotting embedding...', flush=True)
            fg, ax = plt.subplots(1, 1, figsize=(6.5, 4.3))
            ax.set_position([0.15/6.5, 0.15/4.3, 4.0/6.5, 4.0/4.3])
            ax.plot(hidden.matrix[:, 0], hidden.matrix[:, 1], 'ok', markersize=1, markeredgewidth=0, alpha=0.5, zorder=0)
            ax.tick_params(axis='both', which='major', bottom=False, top=False, labelbottom=False, labeltop=False,
                           left=False, right=False, labelleft=False, labelright=False)
            # ax.set_title('expl_var_frac: {0:1.3g}'.format(pca_model.explained_variance_ratio_.sum()), fontsize=8)
            ax.set_frame_on(False)
            fg.savefig('{0}.png'.format(output_path_prefix), transparent=True, pad_inches=0, dpi=300)
            plt.close()

            # color points by continuous row metadata
            for metalabel in ['mean', 'stdv', 'position']:
                z = hidden.rowmeta[metalabel].astype('float64')
                fg, ax = plt.subplots(1, 1, figsize=(6.5, 4.3))
                ax.set_position([0.15/6.5, 0.15/4.3, 4.0/6.5, 4.0/4.3])
                ax.scatter(hidden.matrix[:, 0], hidden.matrix[:, 1], s=1, c=z, marker='o', edgecolors='none', cmap=plt.get_cmap('jet'), alpha=0.5, vmin=z.min(), vmax=z.max())
                ax.tick_params(axis='both', which='major', bottom=False, top=False, labelbottom=False, labeltop=False,
                               left=False, right=False, labelleft=False, labelright=False)
                # ax.set_title('expl_var_frac: {0:1.3g}'.format(pca_model.explained_variance_ratio_.sum()), fontsize=8)
                ax.set_frame_on(False)
                fg.savefig('{0}_colored_by_{1}.png'.format(output_path_prefix, metalabel), transparent=True, pad_inches=0, dpi=300)
                plt.close()

            # color points by categorical row metadata
            for metalabel in ['gene_name']:
                categories = np.unique(hidden.rowmeta[metalabel])
                cmap = plt.get_cmap('gist_rainbow')
                colors = [cmap(float((i + 0.5)/len(categories))) for i in range(len(categories))]
                fg, ax = plt.subplots(1, 1, figsize=(6.5, 4.3))
                ax.set_position([0.15/6.5, 0.15/4.3, 4.0/6.5, 4.0/4.3])
                for category, color in zip(categories, colors):
                    if category == 'NA':
                        color = 'k'
                        alpha = 0.1
                        zorder = 0
                    else:
                        alpha = 0.5
                        zorder = 1
                    hit = hidden.rowmeta[metalabel] == category
                    ax.plot(hidden.matrix[hit, 0], hidden.matrix[hit, 1], linestyle='None', linewidth=0, marker='o', markerfacecolor=color, markeredgecolor=color, markersize=2, markeredgewidth=0, alpha=alpha, zorder=zorder, label=category)
                ax.tick_params(axis='both', which='major', bottom=False, top=False, labelbottom=False, labeltop=False,
                               left=False, right=False, labelleft=False, labelright=False)
                ax.legend(loc='center left', bbox_to_anchor=(1, 0.5), borderaxespad=0, frameon=False, ncol=1, numpoints=1, markerscale=2, fontsize=8, labelspacing=0.25)
                # ax.set_title('expl_var_frac: {0:1.3g}'.format(pca_model.explained_variance_ratio_.sum()), fontsize=8)
                ax.set_frame_on(False)
                fg.savefig('{0}_colored_by_{1}.png'.format(output_path_prefix, metalabel), transparent=True, pad_inches=0, dpi=300)
                plt.close()

            # highlight HLA genes specifically
            hla_hit = np.array(['HLA-' in x for x in hidden.rowmeta['gene_name']], dtype='bool')
            hla_names = hidden.rowmeta['gene_name'].copy()
            hla_names[~hla_hit] = 'NA'
            categories = np.unique(hla_names)
            cmap = plt.get_cmap('gist_rainbow')
            colors = [cmap(float((i + 0.5)/len(categories))) for i in range(len(categories))]
            fg, ax = plt.subplots(1, 1, figsize=(6.5, 4.3))
            ax.set_position([0.15/6.5, 0.15/4.3, 4.0/6.5, 4.0/4.3])
            for category, color in zip(categories, colors):
                if category == 'NA':
                    color = 'k'
                    alpha = 0.1
                    zorder = 0
                else:
                    alpha = 0.5
                    zorder = 1
                hit = hla_names == category
                ax.plot(hidden.matrix[hit, 0], hidden.matrix[hit, 1], linestyle='None', linewidth=0, marker='o', markerfacecolor=color, markeredgecolor=color, markersize=1, markeredgewidth=0, alpha=alpha, zorder=zorder, label=category)
            ax.tick_params(axis='both', which='major', bottom=False, top=False, labelbottom=False, labeltop=False,
                           left=False, right=False, labelleft=False, labelright=False)
            ax.legend(loc='center left', bbox_to_anchor=(1, 0.5), borderaxespad=0, frameon=False, ncol=1, numpoints=1, markerscale=2, fontsize=8, labelspacing=0.25)
            # ax.set_title('expl_var_frac: {0:1.3g}'.format(pca_model.explained_variance_ratio_.sum()), fontsize=8)
            ax.set_frame_on(False)
            fg.savefig('{0}_colored_by_hlagene.png'.format(output_path_prefix), transparent=True, pad_inches=0, dpi=300)
            plt.close()

            print('computing right factor matrix...', flush=True)
            # rcond=None opts into numpy's current default cutoff and silences the FutureWarning raised by the legacy default
            rightfactormat, residuals, rank, singular_values = np.linalg.lstsq(hidden.matrix, observed.matrix, rcond=None)
            factored = dc.datamatrix(rowname=observed.columnname,
                                     rowlabels=observed.columnlabels.copy(),
                                     rowmeta=copy.deepcopy(observed.columnmeta),
                                     columnname='latent_component',
                                     columnlabels=np.array(['Latent1', 'Latent2'], dtype='object'),
                                     columnmeta={},
                                     matrixname=observed.columnname + '_embedding_from_' + observed.matrixname,
                                     matrix=rightfactormat.T)

            print('plotting transpose embedding...', flush=True)
            fg, ax = plt.subplots(1, 1, figsize=(6.5, 4.3))
            ax.set_position([0.15/6.5, 0.15/4.3, 4.0/6.5, 4.0/4.3])
            ax.plot(factored.matrix[:, 0], factored.matrix[:, 1], 'ok', markersize=2, markeredgewidth=0, alpha=0.5, zorder=0)
            ax.tick_params(axis='both', which='major', bottom=False, top=False, labelbottom=False, labeltop=False,
                           left=False, right=False, labelleft=False, labelright=False)
            # ax.set_title('expl_var_frac: {0:1.3g}'.format(pca_model.explained_variance_ratio_.sum()), fontsize=8)
            ax.set_frame_on(False)
            fg.savefig('{0}_transpose.png'.format(output_path_prefix), transparent=True, pad_inches=0, dpi=300)
            plt.close()

            for metalabel in factored.rowmeta:  # ['population', 'super_population', 'gender']:
                categories = np.unique(factored.rowmeta[metalabel])
                cmap = plt.get_cmap('gist_rainbow')
                colors = [cmap(float((i + 0.5)/len(categories))) for i in range(len(categories))]
                fg, ax = plt.subplots(1, 1, figsize=(6.5, 4.3))
                ax.set_position([0.15/6.5, 0.15/4.3, 4.0/6.5, 4.0/4.3])
                for category, color in zip(categories, colors):
                    if category == 'NA':
                        color = 'k'
                        alpha = 0.1
                        zorder = 0
                    else:
                        alpha = 0.5
                        zorder = 1
                    hit = factored.rowmeta[metalabel] == category
                    ax.plot(factored.matrix[hit, 0], factored.matrix[hit, 1], linestyle='None', linewidth=0, marker='o', markerfacecolor=color, markeredgecolor=color, markersize=2, markeredgewidth=0, alpha=alpha, zorder=zorder, label=category)
                ax.tick_params(axis='both', which='major', bottom=False, top=False, labelbottom=False, labeltop=False,
                               left=False, right=False, labelleft=False, labelright=False)
                ax.legend(loc='center left', bbox_to_anchor=(1, 0.5), borderaxespad=0, frameon=False, ncol=1, numpoints=1, markerscale=2, fontsize=8, labelspacing=0.25)
                # ax.set_title('expl_var_frac: {0:1.3g}'.format(pca_model.explained_variance_ratio_.sum()), fontsize=8)
                ax.set_frame_on(False)
                fg.savefig('{0}_transpose_colored_by_{1}.png'.format(output_path_prefix, metalabel), transparent=True, pad_inches=0, dpi=300)
                plt.close()

    print('done plot_embeddings.py', flush=True)
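# The right factor matrix above solves the least-squares problem
# hidden @ F ~= observed for F (shape 2 x n_columns), so each data column gets a
# 2-d loading that can be plotted alongside the row embedding. A minimal
# self-contained sketch of the same computation on toy data (all names here are
# illustrative, not from the scripts above):

import numpy as np

n_rows, n_cols, n_latent = 100, 20, 2
rng = np.random.RandomState(0)
hidden_toy = rng.randn(n_rows, n_latent)  # stand-in for the row embedding loaded from embedding.csv.gz
observed_toy = hidden_toy @ rng.randn(n_latent, n_cols) + 0.01*rng.randn(n_rows, n_cols)

# F minimizes ||hidden_toy @ F - observed_toy||_F; rcond=None uses numpy's current default cutoff
F, residuals, rank, singular_values = np.linalg.lstsq(hidden_toy, observed_toy, rcond=None)
column_embedding = F.T  # one 2-d point per column, as in the 'factored' datamatrix above
print(column_embedding.shape)  # (20, 2)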