def encode_and_decode(dm, W, Be, Bd, activation, apply_activation_to_embedding=False, use_softmax=False, apply_activation_to_output=False, return_embedding=False, return_reconstruction_error=False, bn_encode_variables=None, bn_decode_variables=None):
    """Run a datamatrix through the full autoencoder (encode then decode).

    Parameters
    ----------
    dm : datamatrix
        Input data; dm.matrix is the numeric array to transform.
    W, Be, Bd : sequences of arrays
        Layer weights, encoder biases, and decoder biases. The decoder
        reuses the transposed weights (tied weights), walking W in reverse.
    activation : callable
        Elementwise activation for hidden layers.
    apply_activation_to_embedding : bool
        Apply activation (or softmax) to the final encoder layer.
    use_softmax : bool
        Use softmax instead of `activation` on the embedding layer
        (only consulted in the non-batchnorm encoder path).
    apply_activation_to_output : bool
        Apply activation to the final decoder layer.
    return_embedding, return_reconstruction_error : bool
        Select additional return values.
    bn_encode_variables, bn_decode_variables : tuple or None
        Optional (gammas, betas, moving_means, moving_variances); when
        given, biases are replaced by batch normalization of the linear term.

    Returns
    -------
    rm, optionally followed by em and/or reconstruction_error (mean squared
    error between reconstruction and input), depending on the flags.
    """
    mat = dm.matrix.copy()
    # --- encoder ---
    # BUGFIX/idiom: compare to None with `is`, not `==` (PEP 8).
    if bn_encode_variables is None:
        for i, (w, b) in enumerate(zip(W, Be)):
            if i + 1 < len(W):
                mat = activation(mat.dot(w) + b)
            elif apply_activation_to_embedding:
                # final (embedding) layer
                if use_softmax:
                    mat = softmax(mat.dot(w) + b)
                else:
                    mat = activation(mat.dot(w) + b)
            else:
                mat = mat.dot(w) + b
    else:
        gammas, betas, moving_means, moving_variances = bn_encode_variables
        for i, (w, b, gamma, beta, moving_mean, moving_variance) in enumerate(
                zip(W, Be, gammas, betas, moving_means, moving_variances)):
            if i + 1 < len(W) or apply_activation_to_embedding:
                mat = activation(batchnorm(mat.dot(w), gamma, beta, moving_mean, moving_variance))
            else:
                mat = batchnorm(mat.dot(w), gamma, beta, moving_mean, moving_variance)
    if return_embedding:
        em = dataclasses.datamatrix(
            rowname=dm.rowname,
            rowlabels=dm.rowlabels.copy(),
            rowmeta=copy.deepcopy(dm.rowmeta),
            columnname='latent_component',
            columnlabels=np.array(['LC' + str(x) for x in range(mat.shape[1])], dtype='object'),
            columnmeta={},
            matrixname='sdae_encoding_of_' + dm.matrixname,
            matrix=mat.copy())
    # --- decoder (tied weights: layers in reverse, transposed) ---
    if bn_decode_variables is None:
        for i, (w, b) in enumerate(zip(W[::-1], Bd[::-1])):
            if i + 1 < len(W) or apply_activation_to_output:
                mat = activation(mat.dot(w.T) + b)
            else:
                mat = mat.dot(w.T) + b
    else:
        gammas, betas, moving_means, moving_variances = bn_decode_variables
        for i, (w, b, gamma, beta, moving_mean, moving_variance) in enumerate(
                zip(W[::-1], Bd[::-1], gammas, betas, moving_means, moving_variances)):
            if i + 1 < len(W) or apply_activation_to_output:
                mat = activation(batchnorm(mat.dot(w.T), gamma, beta, moving_mean, moving_variance))
            else:
                mat = batchnorm(mat.dot(w.T), gamma, beta, moving_mean, moving_variance)
    rm = dataclasses.datamatrix(
        rowname=dm.rowname,
        rowlabels=dm.rowlabels.copy(),
        rowmeta=copy.deepcopy(dm.rowmeta),
        columnname=dm.columnname,
        columnlabels=dm.columnlabels.copy(),
        columnmeta=copy.deepcopy(dm.columnmeta),
        matrixname='decoding_from_sdae_encoding_of_' + dm.matrixname,
        matrix=mat)
    reconstruction_error = np.mean((rm.matrix - dm.matrix)**2)
    if return_embedding and return_reconstruction_error:
        return rm, em, reconstruction_error
    elif return_embedding:
        return rm, em
    elif return_reconstruction_error:
        return rm, reconstruction_error
    else:
        return rm
def load_splitdata(rowdatapath, columndatapath, matrixdatapath, studyname='', dtype='float64', delimiter='\t', matrix_has_labels=True):
    """Assemble a datamatrix from separate metadata and matrix files.

    Row and column metadata are read via load_metadata; the numeric matrix
    is read with np.loadtxt. When matrix_has_labels is True, the matrix
    file's header row and leading label column are skipped. The matrixname
    is derived from the row/column names and studyname (defaulting to the
    matrix file path when studyname is empty).
    """
    rowname, rowlabels, rowmeta = load_metadata(rowdatapath, delimiter)
    columnname, columnlabels, columnmeta = load_metadata(columndatapath, delimiter)
    loadtxt_kwargs = {'dtype': dtype, 'delimiter': delimiter, 'ndmin': 2}
    if matrix_has_labels:
        # skip the header row and the leading row-label column
        loadtxt_kwargs['skiprows'] = 1
        loadtxt_kwargs['usecols'] = range(1, len(columnlabels) + 1)
    matrix = np.loadtxt(matrixdatapath, **loadtxt_kwargs)
    if studyname == '':
        studyname = matrixdatapath
    matrixname = '{0}-{1}_data_from_{2}'.format(rowname, columnname, studyname)
    return dc.datamatrix(rowname, rowlabels, columnname, columnlabels, matrixname, matrix, rowmeta, columnmeta)
def encode(dm, W, Be, activation, apply_activation_to_embedding=False, use_softmax=False, bn_variables=None):
    """Encode a datamatrix into the SDAE latent space.

    Parameters
    ----------
    dm : datamatrix
        Input data.
    W, Be : sequences of arrays
        Layer weights and encoder biases.
    activation : callable
        Elementwise activation for hidden layers.
    apply_activation_to_embedding : bool
        Apply activation (or softmax) to the final (embedding) layer.
    use_softmax : bool
        Use softmax instead of `activation` on the embedding layer
        (only consulted in the non-batchnorm path).
    bn_variables : tuple or None
        Optional (gammas, betas, moving_means, moving_variances); when
        given, biases are replaced by batch normalization.

    Returns
    -------
    datamatrix of latent components, rows aligned with dm's rows.
    """
    mat = dm.matrix.copy()
    # BUGFIX/idiom: compare to None with `is`, not `==` (PEP 8).
    if bn_variables is None:
        for i, (w, b) in enumerate(zip(W, Be)):
            if i + 1 < len(W):
                mat = activation(mat.dot(w) + b)
            elif apply_activation_to_embedding:
                # final (embedding) layer
                if use_softmax:
                    mat = softmax(mat.dot(w) + b)
                else:
                    mat = activation(mat.dot(w) + b)
            else:
                mat = mat.dot(w) + b
    else:
        gammas, betas, moving_means, moving_variances = bn_variables
        for i, (w, b, gamma, beta, moving_mean, moving_variance) in enumerate(
                zip(W, Be, gammas, betas, moving_means, moving_variances)):
            if i + 1 < len(W) or apply_activation_to_embedding:
                mat = activation(batchnorm(mat.dot(w), gamma, beta, moving_mean, moving_variance))
            else:
                mat = batchnorm(mat.dot(w), gamma, beta, moving_mean, moving_variance)
    em = dataclasses.datamatrix(
        rowname=dm.rowname,
        rowlabels=dm.rowlabels.copy(),
        rowmeta=copy.deepcopy(dm.rowmeta),
        columnname='latent_component',
        columnlabels=np.array(['LC' + str(x) for x in range(mat.shape[1])], dtype='object'),
        columnmeta={},
        matrixname='sdae_encoding_of_' + dm.matrixname,
        matrix=mat)
    return em
def sdae_reconstruction(dm, W, Be, Bd, activation, apply_activation_to_output=False, return_embedding=False, return_reconstruction_error=False):
    """Encode dm through the SDAE and decode it back.

    Encodes with weights W and encoder biases Be, decodes with the tied
    (transposed) weights and decoder biases Bd. Returns the reconstructed
    datamatrix `rm`, optionally followed by the embedding `em` and/or the
    mean squared reconstruction error, in that order.
    """
    num_layers = len(W)
    mat = dm.matrix.copy()
    # forward pass through the encoder
    for layer, (weight, bias) in enumerate(zip(W, Be)):
        linear = mat.dot(weight) + bias
        mat = activation(linear) if (layer + 1 < num_layers or apply_activation_to_output) else linear
    if return_embedding:
        em = dataclasses.datamatrix(
            rowname=dm.rowname,
            rowlabels=dm.rowlabels.copy(),
            rowmeta=copy.deepcopy(dm.rowmeta),
            columnname='latent_component',
            columnlabels=np.array(['LC' + str(x) for x in range(mat.shape[1])], dtype='object'),
            columnmeta={'activation_applied': np.full(mat.shape[1], apply_activation_to_output, dtype='bool')},
            matrixname='sdae_transform_of_' + dm.matrixname,
            matrix=mat.copy())
    # the decoder always starts from activated embedding values
    if not apply_activation_to_output:
        mat = activation(mat)
    # backward pass through the decoder with tied (transposed) weights
    for layer, (weight, bias) in enumerate(zip(W[::-1], Bd[::-1])):
        linear = mat.dot(weight.T) + bias
        mat = activation(linear) if (layer + 1 < num_layers or apply_activation_to_output) else linear
    rm = dataclasses.datamatrix(
        rowname=dm.rowname,
        rowlabels=dm.rowlabels.copy(),
        rowmeta=copy.deepcopy(dm.rowmeta),
        columnname='reconstructed_' + dm.columnname,
        columnlabels=dm.columnlabels.copy(),
        columnmeta=copy.deepcopy(dm.columnmeta),
        matrixname='reconstruction_from_sdae_transform_of_' + dm.matrixname,
        matrix=mat)
    reconstruction_error = np.mean((rm.matrix - dm.matrix)**2)
    results = [rm]
    if return_embedding:
        results.append(em)
    if return_reconstruction_error:
        results.append(reconstruction_error)
    return tuple(results) if len(results) > 1 else rm
def sdae_transform(dm, W, Be, activation, apply_activation_to_output=False):
    """Encode dm into latent space with the SDAE encoder layers.

    Hidden layers are always activated; the final (embedding) layer is
    activated only when apply_activation_to_output is True. Whether the
    activation was applied is recorded per column in the result's
    'activation_applied' metadata.
    """
    depth = len(W)
    mat = dm.matrix.copy()
    for layer, (weight, bias) in enumerate(zip(W, Be), start=1):
        preactivation = mat.dot(weight) + bias
        mat = activation(preactivation) if (layer < depth or apply_activation_to_output) else preactivation
    return dataclasses.datamatrix(
        rowname=dm.rowname,
        rowlabels=dm.rowlabels.copy(),
        rowmeta=copy.deepcopy(dm.rowmeta),
        columnname='latent_component',
        columnlabels=np.array(['LC' + str(x) for x in range(mat.shape[1])], dtype='object'),
        columnmeta={'activation_applied': np.full(mat.shape[1], apply_activation_to_output, dtype='bool')},
        matrixname='sdae_transform_of_' + dm.matrixname,
        matrix=mat)
def decode(em, W, Bd, activation, apply_activation_to_output=False, output_activation_mask=None, bn_variables=None):
    """Map an embedding datamatrix back to feature space with tied weights.

    Parameters
    ----------
    em : datamatrix
        Embedding (latent) data.
    W, Bd : sequences of arrays
        Layer weights (transposed for decoding, walked in reverse) and
        decoder biases.
    activation : callable
        Elementwise activation for hidden layers.
    apply_activation_to_output : bool
        If True, apply the activation to the output layer, restricted to
        the columns selected by output_activation_mask.
    output_activation_mask : index array / boolean mask or None
        Output columns to activate; None (default) means no columns.
        BUGFIX: was a mutable default argument ([]); now a None sentinel
        with identical behavior for callers.
    bn_variables : tuple or None
        Optional (gammas, betas, moving_means, moving_variances); when
        given, biases are replaced by batch normalization.

    Returns
    -------
    datamatrix of reconstructed features (columns RF0, RF1, ...).
    """
    if output_activation_mask is None:
        output_activation_mask = []  # selects no columns, as before
    mat = em.matrix.copy()
    # BUGFIX/idiom: compare to None with `is`, not `==` (PEP 8).
    if bn_variables is None:
        for i, (w, b) in enumerate(zip(W[::-1], Bd[::-1])):
            if i + 1 < len(W):
                mat = activation(mat.dot(w.T) + b)
            elif apply_activation_to_output:
                # output layer: activate only the masked columns
                mat = mat.dot(w.T) + b
                mat[:, output_activation_mask] = activation(mat[:, output_activation_mask])
            else:
                mat = mat.dot(w.T) + b
    else:
        gammas, betas, moving_means, moving_variances = bn_variables
        for i, (w, b, gamma, beta, moving_mean, moving_variance) in enumerate(
                zip(W[::-1], Bd[::-1], gammas, betas, moving_means, moving_variances)):
            if i + 1 < len(W):
                mat = activation(batchnorm(mat.dot(w.T), gamma, beta, moving_mean, moving_variance))
            elif apply_activation_to_output:
                mat = batchnorm(mat.dot(w.T), gamma, beta, moving_mean, moving_variance)
                mat[:, output_activation_mask] = activation(mat[:, output_activation_mask])
            else:
                mat = batchnorm(mat.dot(w.T), gamma, beta, moving_mean, moving_variance)
    rm = dataclasses.datamatrix(
        rowname=em.rowname,
        rowlabels=em.rowlabels.copy(),
        rowmeta=copy.deepcopy(em.rowmeta),
        columnname='reconstructed_feature',
        columnlabels=np.array(['RF' + str(x) for x in range(mat.shape[1])], dtype='object'),
        columnmeta={},
        matrixname='decoding_from_' + em.matrixname,
        matrix=mat)
    return rm
def sdae_inverse_transform(em, W, Bd, activation, apply_activation_to_output=False):
    """Reconstruct features from an sdae_transform embedding.

    If the stored embedding was saved without the activation applied
    (columnmeta['activation_applied'] all False), the activation is applied
    first so the decoder always starts from activated values. Then the
    tied-weight decoder layers are applied in reverse.

    Returns a datamatrix of reconstructed features (columns RF0, RF1, ...).
    """
    # BUGFIX: use `not` instead of bitwise `~` on the .any() result.
    # `~` happens to negate numpy bool_ scalars, but would silently yield
    # -2 (truthy) if .any() ever returned a plain Python bool.
    if not em.columnmeta['activation_applied'].any():
        mat = activation(em.matrix)
    else:
        mat = em.matrix.copy()
    for i, (w, b) in enumerate(zip(W[::-1], Bd[::-1])):
        if i + 1 < len(W) or apply_activation_to_output:
            mat = activation(mat.dot(w.T) + b)
        else:
            mat = mat.dot(w.T) + b
    rm = dataclasses.datamatrix(
        rowname=em.rowname,
        rowlabels=em.rowlabels.copy(),
        rowmeta=copy.deepcopy(em.rowmeta),
        columnname='reconstructed_feature',
        columnlabels=np.array(['RF' + str(x) for x in range(mat.shape[1])], dtype='object'),
        columnmeta={},
        matrixname='reconstruction_from_' + em.matrixname,
        matrix=mat)
    return rm
def main(study_name='your_study'):
    """Prepare RNA-seq count data for SDAE training.

    Loads a counts matrix plus gene/sample id files, scales and
    log2-transforms the counts, aligns genes to a reference dataset,
    standardizes with reference per-gene statistics, and writes shuffled
    train/valid/test splits in both orientations ("skinny" = genes as rows,
    "fat" = samples as rows) as pickled datamatrix objects.
    """
    # load your data and create datamatrix object
    with open('data/original_data/{0}/ensembl_gene_ids.txt'.format(study_name), mode='rt', encoding='utf-8', errors='surrogateescape') as fr:
        ensembl_gene_ids = np.array([x.strip() for x in fr.read().split('\n')], dtype='object')
    with open('data/original_data/{0}/sample_ids.txt'.format(study_name), mode='rt', encoding='utf-8', errors='surrogateescape') as fr:
        sample_ids = np.array([x.strip() for x in fr.read().split('\n')], dtype='object')
    counts_matrix = np.loadtxt('data/original_data/{0}/expression_matrix.txt.gz'.format(study_name), dtype='float64', delimiter='\t', ndmin=2)
    total_counts_per_sample = counts_matrix.sum(0)
    gene_sample = dataclasses.datamatrix(
        rowname='ensembl_gene_id',
        rowlabels=ensembl_gene_ids,
        rowmeta={},
        columnname='sample_id',
        columnlabels=sample_ids,
        columnmeta={'total_counts': total_counts_per_sample},
        matrixname='rnaseq_gene_counts_from_{0}'.format(study_name),
        matrix=counts_matrix)
    del ensembl_gene_ids, sample_ids, counts_matrix, total_counts_per_sample
    # scale counts
    # counts-per-sample normalization done in log space; the offset
    # np.log(4) + 7*np.log(10) rescales each sample to log(4e7) total counts
    gene_sample.matrix = np.exp(np.log(gene_sample.matrix) - np.log(gene_sample.columnmeta['total_counts'].reshape(1, -1)) + (np.log(4) + 7 * np.log(10)))
    gene_sample.matrixname = 'rnaseq_scaled_counts_from_{0}'.format(study_name)
    # shuffle the data (rows then columns)
    gene_sample.reorder(np.random.permutation(gene_sample.shape[0]), 0)
    gene_sample.reorder(np.random.permutation(gene_sample.shape[1]), 1)
    print(gene_sample)
    # load the reference data
    gene_sample_ref = datasetIO.load_datamatrix('data/prepared_data/fat/train.pickle').totranspose()
    print(gene_sample_ref)
    # align genes: keep only genes present in the reference, then expand to
    # the reference gene set (genes absent from this study become "missing")
    tobediscarded = ~np.in1d(gene_sample.rowlabels, gene_sample_ref.rowmeta['ensembl_gene_id'])
    gene_sample.discard(tobediscarded, 0)
    missing_ensembl_ids = gene_sample_ref.rowmeta['ensembl_gene_id'][~np.in1d(gene_sample_ref.rowmeta['ensembl_gene_id'], gene_sample.rowlabels)]
    gene_sample = gene_sample.tolabels(rowlabels=gene_sample_ref.rowmeta['ensembl_gene_id'].copy(), columnlabels=[])
    gene_sample.rowlabels = gene_sample_ref.rowlabels.copy()
    gene_sample.rowname = gene_sample_ref.rowname
    for k, v in gene_sample_ref.rowmeta.items():
        gene_sample.rowmeta[k] = v.copy()
    gene_sample.rowmeta['is_missing'] = np.in1d(gene_sample.rowmeta['ensembl_gene_id'], missing_ensembl_ids)
    gene_sample.rowmeta['all_zero'] = (gene_sample.matrix == 0).all(1)
    print('missing data for {0!s} genes'.format(gene_sample.rowmeta['is_missing'].sum()))
    print('no counts for {0!s} genes'.format(gene_sample.rowmeta['all_zero'].sum()))
    print(gene_sample)
    # handle zeros: replace zeros with half the per-sample nonzero minimum
    # so the subsequent log2 is finite
    nonzeromins = np.zeros(gene_sample.shape[1], dtype='float64')
    for j in range(gene_sample.shape[1]):
        nonzeromins[j] = gene_sample.matrix[gene_sample.matrix[:, j] > 0, j].min()
        gene_sample.matrix[gene_sample.matrix[:, j] == 0, j] = nonzeromins[j] / 2.0
    # log2
    gene_sample.matrix = np.log2(gene_sample.matrix)
    # normalize samples: remove each sample's median offset from the
    # reference median sample
    median_shift_from_median = np.median(gene_sample.matrix - gene_sample.rowmeta['median_sample_ref'].reshape(-1, 1), 0)
    gene_sample.matrix -= median_shift_from_median.reshape(1, -1)
    # standardize the data (z-score per gene using reference statistics)
    gene_sample.matrix = (gene_sample.matrix - gene_sample.rowmeta['row_mean_ref'].reshape(-1, 1)) / gene_sample.rowmeta['row_stdv_ref'].reshape(-1, 1)
    # handle missing genes: zero out rows with no data in this study
    gene_sample.matrix[gene_sample.rowmeta['is_missing'], :] = 0
    # transpose the data (kept for the "fat" orientation splits below)
    atb_gene = gene_sample.totranspose()
    # split the data: hold out ~10% (at least 2 rows) each for test and valid
    test_fraction = 0.1
    tobepopped = np.random.permutation(gene_sample.shape[0]) < round(max([test_fraction * gene_sample.shape[0], 2.0]))
    gene_sample_test = gene_sample.pop(tobepopped, 0)
    valid_fraction = 0.1
    tobepopped = np.random.permutation(gene_sample.shape[0]) < round(max([valid_fraction * gene_sample.shape[0], 2.0]))
    gene_sample_valid = gene_sample.pop(tobepopped, 0)
    gene_sample_train = gene_sample
    del gene_sample, tobepopped
    # save the data ("skinny" orientation)
    if not os.path.exists('data/prepared_data'):
        os.mkdir('data/prepared_data')
    if not os.path.exists('data/prepared_data/{0}'.format(study_name)):
        os.mkdir('data/prepared_data/{0}'.format(study_name))
    if not os.path.exists('data/prepared_data/{0}/skinny'.format(study_name)):
        os.mkdir('data/prepared_data/{0}/skinny'.format(study_name))
    datasetIO.save_datamatrix('data/prepared_data/{0}/skinny/test.pickle'.format(study_name), gene_sample_test)
    datasetIO.save_datamatrix('data/prepared_data/{0}/skinny/valid.pickle'.format(study_name), gene_sample_valid)
    datasetIO.save_datamatrix('data/prepared_data/{0}/skinny/train.pickle'.format(study_name), gene_sample_train)
    del gene_sample_test, gene_sample_valid, gene_sample_train
    # split the data (same scheme for the transposed orientation)
    test_fraction = 0.1
    tobepopped = np.random.permutation(atb_gene.shape[0]) < round(max([test_fraction * atb_gene.shape[0], 2.0]))
    atb_gene_test = atb_gene.pop(tobepopped, 0)
    valid_fraction = 0.1
    tobepopped = np.random.permutation(atb_gene.shape[0]) < round(max([valid_fraction * atb_gene.shape[0], 2.0]))
    atb_gene_valid = atb_gene.pop(tobepopped, 0)
    atb_gene_train = atb_gene
    del atb_gene, tobepopped
    # save the data ("fat" orientation)
    if not os.path.exists('data/prepared_data'):
        os.mkdir('data/prepared_data')
    if not os.path.exists('data/prepared_data/{0}'.format(study_name)):
        os.mkdir('data/prepared_data/{0}'.format(study_name))
    if not os.path.exists('data/prepared_data/{0}/fat'.format(study_name)):
        os.mkdir('data/prepared_data/{0}/fat'.format(study_name))
    datasetIO.save_datamatrix('data/prepared_data/{0}/fat/test.pickle'.format(study_name), atb_gene_test)
    datasetIO.save_datamatrix('data/prepared_data/{0}/fat/valid.pickle'.format(study_name), atb_gene_valid)
    datasetIO.save_datamatrix('data/prepared_data/{0}/fat/train.pickle'.format(study_name), atb_gene_train)
def main():
    """Screen datasets for candidate gene features.

    For each dataset listed in the harmonizome dataset-info file: load the
    gene-by-feature matrix, optionally z-score many-valued features, label
    genes as positive/negative/unknown from example lists, append per-gene
    mean/stdv as extra features, discard features with too little
    information among labelled genes, and save the surviving candidates.
    A *_in_progress.txt marker file serves as a crude cross-process lock.
    """
    # load class examples
    print('loading class examples...', flush=True)
    class_examples_folder = 'targets/pharmaprojects'
    class_examples = {
        'positive': datasetIO.load_examples('{0}/positive.txt'.format(class_examples_folder)),
        'negative': datasetIO.load_examples('{0}/negative.txt'.format(class_examples_folder)),
        'unknown': datasetIO.load_examples('{0}/unknown.txt'.format(class_examples_folder))}
    # load dataset info
    print('loading dataset info...', flush=True)
    dataset_info_path = 'datasets/harmonizome/dataset_info.txt'
    dataset_infos = datasetIO.load_datasetinfo(dataset_info_path)
    # specify results folder
    print('specifying results folder...', flush=True)
    results_folder = 'datasets/candidate_features'
    if not os.path.exists(results_folder):
        os.mkdir(results_folder)
    # iterate over datasets
    print('iterating over datasets...', flush=True)
    for dataset_info in dataset_infos:
        # check if another python instance is already working on this dataset
        if os.path.exists('{0}/{1}_in_progress.txt'.format(results_folder, dataset_info['abbreviation'])):
            print('skipping {0}. already in progress...'.format(dataset_info['abbreviation']), flush=True)
            continue
        # log start of processing (also creates the lock file)
        with open('{0}/{1}_in_progress.txt'.format(results_folder, dataset_info['abbreviation']), mode='wt', encoding='utf-8', errors='surrogateescape') as fw:
            print('working on {0}...'.format(dataset_info['abbreviation']), flush=True)
            fw.write('working on {0}...'.format(dataset_info['abbreviation']))
        # load dataset
        print('loading dataset...', flush=True)
        gene_atb = datasetIO.load_datamatrix(datasetpath=dataset_info['path'])
        dataset_info['original_genes'] = gene_atb.shape[0]
        dataset_info['original_features'] = gene_atb.shape[1]
        # decide feature normalization: z-score only datasets that are
        # many-valued (name contains 'standardized'/'cleaned') and mostly
        # filled-in (at most half zeros)
        print('deciding feature normalization...', flush=True)
        if ('standardized' in dataset_info['abbreviation'] or 'cleaned' in dataset_info['abbreviation']) and (gene_atb.matrix == 0).sum() / gene_atb.size <= 0.5:
            # dataset is many-valued and filled-in
            print(' dataset is many-valued and filled-in...', flush=True)
            print(' z-scoring features...', flush=True)
            dataset_info['feature_normalization'] = 'z-score'
            mnv = np.nanmean(gene_atb.matrix, axis=0, keepdims=True)
            sdv = np.nanstd(gene_atb.matrix, axis=0, keepdims=True)
            gene_atb.matrix = (gene_atb.matrix - mnv) / sdv
            gene_atb.columnmeta['mean'] = mnv.reshape(-1)
            gene_atb.columnmeta['stdv'] = sdv.reshape(-1)
        else:
            # dataset is binary or tertiary or sparse
            print(' dataset is binary, tertiary, or sparse...', flush=True)
            print(' no feature normalization...', flush=True)
            dataset_info['feature_normalization'] = 'none'
        # assign class labels to genes
        print('assigning class labels to genes...', flush=True)
        gene_atb.rowmeta['class'] = np.full(gene_atb.shape[0], 'unknown', dtype='object')
        gene_atb.rowmeta['class'][np.in1d(gene_atb.rowlabels, list(class_examples['positive']))] = 'positive'
        gene_atb.rowmeta['class'][np.in1d(gene_atb.rowlabels, list(class_examples['negative']))] = 'negative'
        # add dataset mean and stdv as features
        print('adding dataset mean and stdv as features...', flush=True)
        gene_stat = dataclasses.datamatrix(
            rowname=gene_atb.rowname,
            rowlabels=gene_atb.rowlabels.copy(),
            rowmeta=copy.deepcopy(gene_atb.rowmeta),
            columnname=gene_atb.columnname,
            columnlabels=np.array(['mean', 'stdv'], dtype='object'),
            columnmeta={},
            matrixname=gene_atb.matrixname,
            matrix=np.append(gene_atb.matrix.mean(1, keepdims=True), gene_atb.matrix.std(1, keepdims=True), 1))
        gene_atb.append(gene_stat, 1)
        gene_atb.columnmeta['isrowstat'] = np.in1d(gene_atb.columnlabels, gene_stat.columnlabels)
        del gene_stat
        # identify features with little information about labelled examples:
        # among labelled genes, a feature is uninformative if it has fewer
        # than 3 nonzero values, fewer than 3 non-one values, or any NaN
        print('identifying features with little information about labelled examples...', flush=True)
        isunknown = gene_atb.rowmeta['class'] == 'unknown'
        tobediscarded = np.logical_or.reduce(((gene_atb.matrix[~isunknown, :] != 0).sum(axis=0) < 3, (gene_atb.matrix[~isunknown, :] != 1).sum(axis=0) < 3, np.isnan(gene_atb.matrix[~isunknown, :]).any(axis=0)))
        if tobediscarded.any():
            # discard features
            print(' discarding {0!s} features. {1!s} features remaining...'.format(tobediscarded.sum(), (~tobediscarded).sum()), flush=True)
            gene_atb.discard(tobediscarded, axis=1)
        else:
            # keep all features
            print(' no features to discard. {0!s} features remaining...'.format(gene_atb.shape[1]), flush=True)
        # save if dataset has content
        print('saving if dataset has content...', flush=True)
        if gene_atb.shape[0] == 0 or gene_atb.shape[1] == 0:
            # no content
            print(' nothing to save...', flush=True)
        else:
            # save candidate features and append the updated dataset info
            print(' saving {0!s} candidate features...'.format(gene_atb.shape[1]), flush=True)
            dataset_info['path'] = '{0}/{1}.txt.gz'.format(results_folder, dataset_info['abbreviation'])
            dataset_info['candidate_genes'] = gene_atb.shape[0]
            dataset_info['candidate_features'] = gene_atb.shape[1]
            dataset_info['positive_examples'] = (gene_atb.rowmeta['class'] == 'positive').sum()
            dataset_info['negative_examples'] = (gene_atb.rowmeta['class'] == 'negative').sum()
            dataset_info['unknown_examples'] = (gene_atb.rowmeta['class'] == 'unknown').sum()
            datasetIO.save_datamatrix(dataset_info['path'], gene_atb)
            datasetIO.append_datasetinfo('{0}/dataset_info.txt'.format(results_folder), dataset_info)
    print('done.', flush=True)
def main(model_folders_path):
    """Plot 2-D embeddings for a list of trained model folders.

    Reads model folder paths (one per line), loads the shared input
    datamatrix, attaches 1000 Genomes HLA-type metadata to its columns,
    then for each model folder loads the saved 2-component embedding and
    writes scatter plots: plain, colored by row metadata, colored by HLA
    gene, and a least-squares "transpose" embedding of the columns.
    """
    print('reading list of model folders...', flush=True)
    with open(model_folders_path, mode='rt', encoding='utf-8', errors='surrogateescape') as fr:
        model_folders = fr.read().split('\n')
    print('loading input datamatrix...', flush=True)
    # dataset name is the path component immediately after 'hp_search'
    model_folder_parts = model_folders[0].split('/')
    dataset_name = model_folder_parts[model_folder_parts.index('hp_search')+1]
    observed_ = datasetIO.load_datamatrix('../../input_data/{0}/datamatrix.pickle'.format(dataset_name))
    print(observed_, flush=True)
    print('attaching hla types...', flush=True)
    columnlabel_idx = {l:i for i,l in enumerate(observed_.columnlabels)}
    hla_types_df = pd.read_csv('../../original_data/1000genomes/20140702_hla_diversity.csv', index_col=False)
    for metalabel in hla_types_df.columns.values[1:]:
        # default every column to 'NA', then fill in matched sample ids
        observed_.columnmeta[metalabel] = np.full(observed_.shape[1], 'NA', dtype='object')
        for columnlabel, value in zip(hla_types_df['id'].values, hla_types_df[metalabel].values):
            if columnlabel in columnlabel_idx:
                columnidx = columnlabel_idx[columnlabel]
                observed_.columnmeta[metalabel][columnidx] = value
        # collapse rare categories into 'NA' so plot legends stay readable
        uvals, counts = np.unique(observed_.columnmeta[metalabel], return_counts=True)
        max_num_uvals = 25
        if uvals.size > max_num_uvals:
            si = np.argsort(counts)[::-1]
            low_freq_uvals = uvals[si[max_num_uvals:]]
            observed_.columnmeta[metalabel][np.in1d(observed_.columnmeta[metalabel], low_freq_uvals)] = 'NA'
    for model_folder in model_folders:
        print('working on model_folder: {0}...'.format(model_folder), flush=True)
        input_path = '{0}/embedding.csv.gz'.format(model_folder)
        output_folder = '/'.join(model_folder.replace('/hp_search/', '/output_data/').split('/')[:-1]) + '/embeddings'
        if not os.path.exists(output_folder):
            os.makedirs(output_folder)
        output_path_prefix = '{0}/{1}'.format(output_folder, model_folder.split('/')[-1])
        print('input_path: {0}'.format(input_path), flush=True)
        print('output_folder: {0}'.format(output_folder), flush=True)
        print('output_path_prefix: {0}'.format(output_path_prefix), flush=True)
        if os.path.exists(input_path):
            print('loading embedding datamatrix...', flush=True)
            df = pd.read_csv(input_path, index_col=False, usecols=[observed_.rowname, 'Latent1', 'Latent2'])
            hidden = dc.datamatrix(
                rowname=observed_.rowname,
                rowlabels=df[observed_.rowname].values,
                rowmeta={},
                columnname='latent_component',
                columnlabels=np.array(['Latent1', 'Latent2'], dtype='object'),
                columnmeta={},
                matrixname=observed_.rowname + '_embedding_from_' + observed_.matrixname,
                matrix=np.concatenate((df.Latent1.values.reshape(-1,1), df.Latent2.values.reshape(-1,1)), 1))
            del df
            print(hidden, flush=True)
            print('aligning input datamatrix and embedding datamatrix...', flush=True)
            if observed_.shape[0] == hidden.shape[0] and (observed_.rowlabels == hidden.rowlabels).all():
                observed = copy.deepcopy(observed_)
            else:
                observed = observed_.tolabels(rowlabels=hidden.rowlabels.copy())
            hidden.rowmeta = copy.deepcopy(observed.rowmeta)
            print(observed, flush=True)
            # visualization: plain embedding scatter
            print('plotting embedding...', flush=True)
            fg, ax = plt.subplots(1, 1, figsize=(6.5,4.3))
            ax.set_position([0.15/6.5, 0.15/4.3, 4.0/6.5, 4.0/4.3])
            ax.plot(hidden.matrix[:,0], hidden.matrix[:,1], 'ok', markersize=1, markeredgewidth=0, alpha=0.5, zorder=0)
            ax.tick_params(axis='both', which='major', bottom=False, top=False, labelbottom=False, labeltop=False, left=False, right=False, labelleft=False, labelright=False)
            ax.set_frame_on(False)
            fg.savefig('{0}.png'.format(output_path_prefix), transparent=True, pad_inches=0, dpi=300)
            plt.close()
            # continuous row-metadata colorings
            for metalabel in ['mean', 'stdv', 'position']:
                z = hidden.rowmeta[metalabel].astype('float64')
                fg, ax = plt.subplots(1, 1, figsize=(6.5,4.3))
                ax.set_position([0.15/6.5, 0.15/4.3, 4.0/6.5, 4.0/4.3])
                ax.scatter(hidden.matrix[:,0], hidden.matrix[:,1], s=1, c=z, marker='o', edgecolors='none', cmap=plt.get_cmap('jet'), alpha=0.5, vmin=z.min(), vmax=z.max())
                ax.tick_params(axis='both', which='major', bottom=False, top=False, labelbottom=False, labeltop=False, left=False, right=False, labelleft=False, labelright=False)
                ax.set_frame_on(False)
                fg.savefig('{0}_colored_by_{1}.png'.format(output_path_prefix, metalabel), transparent=True, pad_inches=0, dpi=300)
                plt.close()
            # categorical row-metadata colorings ('NA' is drawn in black)
            for metalabel in ['gene_name']:
                categories = np.unique(hidden.rowmeta[metalabel])
                cmap = plt.get_cmap('gist_rainbow')
                colors = [cmap(float((i+0.5)/len(categories))) for i in range(len(categories))]
                fg, ax = plt.subplots(1, 1, figsize=(6.5,4.3))
                ax.set_position([0.15/6.5, 0.15/4.3, 4.0/6.5, 4.0/4.3])
                for category, color in zip(categories, colors):
                    if category == 'NA':
                        color = 'k'
                        alpha = 0.1
                        zorder = 0
                    else:
                        alpha = 0.5
                        zorder = 1
                    hit = hidden.rowmeta[metalabel] == category
                    ax.plot(hidden.matrix[hit,0], hidden.matrix[hit,1], linestyle='None', linewidth=0, marker='o', markerfacecolor=color, markeredgecolor=color, markersize=2, markeredgewidth=0, alpha=alpha, zorder=zorder, label=category)
                ax.tick_params(axis='both', which='major', bottom=False, top=False, labelbottom=False, labeltop=False, left=False, right=False, labelleft=False, labelright=False)
                ax.legend(loc='center left', bbox_to_anchor=(1, 0.5), borderaxespad=0, frameon=False, ncol=1, numpoints=1, markerscale=2, fontsize=8, labelspacing=0.25)
                ax.set_frame_on(False)
                fg.savefig('{0}_colored_by_{1}.png'.format(output_path_prefix, metalabel), transparent=True, pad_inches=0, dpi=300)
                plt.close()
            # highlight HLA genes: non-HLA gene names are collapsed to 'NA'
            hla_hit = np.array(['HLA-' in x for x in hidden.rowmeta['gene_name']], dtype='bool')
            hla_names = hidden.rowmeta['gene_name'].copy()
            hla_names[~hla_hit] = 'NA'
            categories = np.unique(hla_names)
            cmap = plt.get_cmap('gist_rainbow')
            colors = [cmap(float((i+0.5)/len(categories))) for i in range(len(categories))]
            fg, ax = plt.subplots(1, 1, figsize=(6.5,4.3))
            ax.set_position([0.15/6.5, 0.15/4.3, 4.0/6.5, 4.0/4.3])
            for category, color in zip(categories, colors):
                if category == 'NA':
                    color = 'k'
                    alpha = 0.1
                    zorder = 0
                else:
                    alpha = 0.5
                    zorder = 1
                hit = hla_names == category
                ax.plot(hidden.matrix[hit,0], hidden.matrix[hit,1], linestyle='None', linewidth=0, marker='o', markerfacecolor=color, markeredgecolor=color, markersize=1, markeredgewidth=0, alpha=alpha, zorder=zorder, label=category)
            ax.tick_params(axis='both', which='major', bottom=False, top=False, labelbottom=False, labeltop=False, left=False, right=False, labelleft=False, labelright=False)
            ax.legend(loc='center left', bbox_to_anchor=(1, 0.5), borderaxespad=0, frameon=False, ncol=1, numpoints=1, markerscale=2, fontsize=8, labelspacing=0.25)
            ax.set_frame_on(False)
            fg.savefig('{0}_colored_by_hlagene.png'.format(output_path_prefix), transparent=True, pad_inches=0, dpi=300)
            plt.close()
            # solve hidden @ rightfactormat ~= observed so the columns of
            # `observed` get their own 2-D coordinates
            print('computing right factor matrix...', flush=True)
            rightfactormat, residuals, rank, singular_values = np.linalg.lstsq(hidden.matrix, observed.matrix)
            factored = dc.datamatrix(
                rowname=observed.columnname,
                rowlabels=observed.columnlabels.copy(),
                rowmeta=copy.deepcopy(observed.columnmeta),
                columnname='latent_component',
                columnlabels=np.array(['Latent1', 'Latent2'], dtype='object'),
                columnmeta={},
                matrixname=observed.columnname + '_embedding_from_' + observed.matrixname,
                matrix=rightfactormat.T)
            print('plotting transpose embedding...', flush=True)
            fg, ax = plt.subplots(1, 1, figsize=(6.5,4.3))
            ax.set_position([0.15/6.5, 0.15/4.3, 4.0/6.5, 4.0/4.3])
            ax.plot(factored.matrix[:,0], factored.matrix[:,1], 'ok', markersize=2, markeredgewidth=0, alpha=0.5, zorder=0)
            ax.tick_params(axis='both', which='major', bottom=False, top=False, labelbottom=False, labeltop=False, left=False, right=False, labelleft=False, labelright=False)
            ax.set_frame_on(False)
            fg.savefig('{0}_transpose.png'.format(output_path_prefix), transparent=True, pad_inches=0, dpi=300)
            plt.close()
            # categorical colorings of the transpose embedding
            for metalabel in factored.rowmeta:  # ['population', 'super_population', 'gender']:
                categories = np.unique(factored.rowmeta[metalabel])
                cmap = plt.get_cmap('gist_rainbow')
                colors = [cmap(float((i+0.5)/len(categories))) for i in range(len(categories))]
                fg, ax = plt.subplots(1, 1, figsize=(6.5,4.3))
                ax.set_position([0.15/6.5, 0.15/4.3, 4.0/6.5, 4.0/4.3])
                for category, color in zip(categories, colors):
                    if category == 'NA':
                        color = 'k'
                        alpha = 0.1
                        zorder = 0
                    else:
                        alpha = 0.5
                        zorder = 1
                    hit = factored.rowmeta[metalabel] == category
                    ax.plot(factored.matrix[hit,0], factored.matrix[hit,1], linestyle='None', linewidth=0, marker='o', markerfacecolor=color, markeredgecolor=color, markersize=2, markeredgewidth=0, alpha=alpha, zorder=zorder, label=category)
                ax.tick_params(axis='both', which='major', bottom=False, top=False, labelbottom=False, labeltop=False, left=False, right=False, labelleft=False, labelright=False)
                ax.legend(loc='center left', bbox_to_anchor=(1, 0.5), borderaxespad=0, frameon=False, ncol=1, numpoints=1, markerscale=2, fontsize=8, labelspacing=0.25)
                ax.set_frame_on(False)
                fg.savefig('{0}_transpose_colored_by_{1}.png'.format(output_path_prefix, metalabel), transparent=True, pad_inches=0, dpi=300)
                plt.close()
    print('done plot_embeddings.py', flush=True)
def get_classifier_performance_stats(Y, P, uP=1000, classifier_stats='all', plot_curves=True, get_priority_cutoffs=True, pp_min_frac=0.1, xx_min_frac=0.01):
    """Compute binary-classifier performance statistics across probability cutoffs.

    Parameters
    ----------
    Y : 1-d bool array of true labels.
    P : 1-d float array of predicted probabilities (same size as Y).
    uP : int or array. If an integer, that many unique cutoffs are derived
        from P via get_unique_pcuts; otherwise treated as an array of cutoffs.
    classifier_stats : 'all' (any str selects the full list) or an object array
        of stat names selecting the rows of the result.
    plot_curves : if True, draw ROC / PR / mcr / mcc curves with matplotlib.
    get_priority_cutoffs : if True, annotate the result via
        get_priority_cutoff_metadata (adds cutoff flags to columnmeta).
    pp_min_frac, xx_min_frac : passed through to get_priority_cutoff_metadata.

    Returns
    -------
    datamatrix with one row per stat and one column per probability cutoff.
    """
    # Normalize cutoffs to a column vector so `P >= uP` broadcasts into a
    # (num_cutoffs, num_examples) boolean matrix.
    # isinstance (instead of type(...) == int) also accepts numpy integers.
    if isinstance(uP, (int, np.integer)):
        uP = get_unique_pcuts(P=P, max_cuts=uP).reshape(-1, 1)
    elif len(uP.shape) == 1:
        uP = uP.reshape(-1, 1)
    if isinstance(classifier_stats, str):
        classifier_stats = np.array([
            'p', 'n', 'ap', 'an', 'pp', 'pn', 'tp', 'fp', 'tn', 'fn', 'tpr',
            'fpr', 'auroc', 'fnr', 'tnr', 'mcr', 'acc', 'fdr', 'ppv', 'auprc',
            'fomr', 'npv', 'plr', 'nlr', 'dor', 'drr', 'darr', 'mrr', 'marr',
            'f1s', 'mcc', 'fnlp'
        ], dtype='object')
    # Small additive smoothing keeps every ratio finite (no zero divisors).
    n = np.float64(Y.size) + 0.2
    ap = Y.sum().astype('float64') + 0.1
    an = (~Y).sum().astype('float64') + 0.1
    pp = (P >= uP).sum(1).astype('float64') + 0.1
    pn = (P < uP).sum(1).astype('float64') + 0.1
    tp = np.logical_and(P >= uP, Y).sum(1).astype(
        'float64') + 0.05  # if count is 5, then this introduces 1% error
    fp = np.logical_and(P >= uP, ~Y).sum(1).astype(
        'float64') + 0.05  # so don't take seriously any cut-off where
    tn = np.logical_and(
        P < uP, ~Y).sum(1).astype('float64') + 0.05  # any count is less than 5
    fn = np.logical_and(P < uP, Y).sum(1).astype(
        'float64'
    ) + 0.05  # nnt is extremely sensitive to this adjustment, but not where nnt is actually reasonable
    uP = uP.reshape(-1)
    tpr = tp / ap  # sensitivity, recall, 1-fnr
    fpr = fp / an  # fall-out, 1-tnr, 1-specificity
    auroc = np.trapz(tpr, fpr)
    fnr = fn / ap  # miss rate
    tnr = tn / an  # specificity
    mcr = (fp + fn) / n
    acc = (tp + tn) / n
    fdr = fp / pp
    ppv = tp / pp  # precision = 1-fdr
    auprc = np.trapz(ppv, tpr)
    fomr = fn / pn  # false omission rate
    npv = tn / pn
    plr = (tp / fp) / (
        ap / an
    )  # ratio of positives to negatives in positive predictions relative to ratio in whole sample, higher is better, tpr/fpr
    nlr = (fn / tn) / (
        ap / an
    )  # ratio of positives to negatives in negative predictions relative to ratio in whole sample, lower is better, fnr/tnr
    dor = (tp / fp) / (
        fn / tn
    )  # ratio of positives to negatives in positive predictions, divided by ratio of positives to negatives in negative predictions, positivelikelihoodratio/negativelikelihoodratio
    drr = (tp / pp) / (
        fn / pn
    )  # relative risk or risk ratio, fraction of positives in positive predictions divided by fraction of positives in negative predictions, ppv/fomr
    darr = (tp / pp) - (
        fn / pn
    )  # absolute risk reduction, fraction of positives in positive predictions minus fraction of positives in negative predictions, ppv - fomr
    mrr = (tp / pp) / (
        ap / n
    )  # modified (by me) relative risk or risk ratio, fraction of positives in positive predictions divided by fraction of positives in whole sample, ppv/prevalence
    marr = (tp / pp) - (
        ap / n
    )  # modified (by me) absolute risk reduction, fraction of positives in positive predictions minus fraction of positives in whole sample, ppv - prevalence
    f1s = 2 * tp / (2 * tp + fp + fn)
    mcc = (tp * tn - fp * fn) / np.sqrt(
        (tp + fp) * (tp + fn) * (tn + fp) * (tn + fn))
    # -log10 hypergeometric survival probability of the tp count
    fnlp = -stats.hypergeom.logsf(tp, n, ap, pp, loc=1) / np.log(10)
    results_dict = {
        'p': uP,
        'n': n,
        'ap': ap,
        'an': an,
        'pp': pp,
        'pn': pn,
        'tp': tp,
        'fp': fp,
        'tn': tn,
        'fn': fn,
        'tpr': tpr,
        'fpr': fpr,
        'auroc': auroc,
        'fnr': fnr,
        'tnr': tnr,
        'mcr': mcr,
        'acc': acc,
        'fdr': fdr,
        'ppv': ppv,
        'auprc': auprc,
        'fomr': fomr,
        'npv': npv,
        'plr': plr,
        'nlr': nlr,
        'dor': dor,
        'drr': drr,
        'darr': darr,
        'mrr': mrr,
        'marr': marr,
        'f1s': f1s,
        'mcc': mcc,
        'fnlp': fnlp
    }
    stat_cut = dataclasses.datamatrix(
        rowname='classifier_performance_stat',
        rowlabels=classifier_stats.copy(),
        rowmeta={},
        columnname='probability_cutoff',
        columnlabels=uP.copy(),
        columnmeta={},
        matrixname='classifier_performance_stats_vs_probability_cutoffs',
        matrix=np.zeros((classifier_stats.size, uP.size), dtype='float64'))
    # Scalars (n, ap, auroc, ...) broadcast across the whole row.
    for i, stat in enumerate(stat_cut.rowlabels):
        stat_cut.matrix[i, :] = results_dict[stat]
    if get_priority_cutoffs:
        get_priority_cutoff_metadata(stat_cut, pp_min_frac, xx_min_frac)
    if plot_curves:
        plt.figure()
        plt.subplot(2, 2, 1)
        plt.plot(fpr, tpr, 'k-')
        plt.ylabel('tpr, sensitivity, recall')
        plt.xlabel('fpr, 1-specificity, fall-out')
        plt.axis([0, 1, 0, 1])
        plt.subplot(2, 2, 2)
        plt.plot(tpr, ppv, 'k-')
        plt.ylabel('ppv, precision, 1-fdr')
        plt.xlabel('tpr, sensitivity, recall')
        plt.axis([0, 1, 0, 1])
        plt.subplot(2, 2, 3)
        plt.plot(uP, mcr, 'k-')
        plt.ylabel('mcr')
        plt.xlabel('p')
        plt.axis([0, 1, 0, 1])
        plt.gca().invert_xaxis()
        plt.subplot(2, 2, 4)
        plt.plot(uP, mcc, 'k-')
        plt.ylabel('mcc')
        plt.xlabel('p')
        plt.axis([0, 1, 0, 1])
        plt.gca().invert_xaxis()
    return stat_cut
def get_classifier_performance_stats(Y, P, uP=1000, classifier_stats='all', plot_curves=True, get_priority_cutoffs=True, pp_min_frac=0.1, xx_min_frac=0.01):
    """Tabulate classifier performance statistics over probability cutoffs.

    Y is a boolean label vector, P the matching probability vector. uP is
    either the number of cutoffs to derive from P or an explicit cutoff
    array. Returns a datamatrix of the requested stats (rows) by cutoff
    (columns); optionally annotates priority cutoffs and draws diagnostic
    curves.
    """
    # Shape the cutoffs as a column vector so comparisons against P
    # broadcast to (num_cutoffs, num_examples).
    if type(uP) == int:
        uP = get_unique_pcuts(P=P, max_cuts=uP).reshape(-1, 1)
    elif len(uP.shape) == 1:
        uP = uP.reshape(-1, 1)
    if type(classifier_stats) == str:
        classifier_stats = np.array([
            'p', 'n', 'ap', 'an', 'pp', 'pn', 'tp', 'fp', 'tn', 'fn', 'tpr',
            'fpr', 'auroc', 'fnr', 'tnr', 'mcr', 'acc', 'fdr', 'ppv', 'auprc',
            'fomr', 'npv', 'plr', 'nlr', 'dor', 'drr', 'darr', 'mrr', 'marr',
            'f1s', 'mcc', 'fnlp'
        ], dtype='object')
    # Confusion-matrix counts with small additive smoothing so every
    # downstream ratio stays finite.
    n = np.float64(Y.size) + 0.2
    ap = Y.sum().astype('float64') + 0.1
    an = (~Y).sum().astype('float64') + 0.1
    pp = (P >= uP).sum(1).astype('float64') + 0.1
    pn = (P < uP).sum(1).astype('float64') + 0.1
    tp = np.logical_and(P >= uP, Y).sum(1).astype(
        'float64') + 0.05  # if count is 5, then this introduces 1% error
    fp = np.logical_and(P >= uP, ~Y).sum(1).astype(
        'float64') + 0.05  # so don't take seriously any cut-off where
    tn = np.logical_and(
        P < uP, ~Y).sum(1).astype('float64') + 0.05  # any count is less than 5
    fn = np.logical_and(P < uP, Y).sum(1).astype(
        'float64'
    ) + 0.05  # nnt is extremely sensitive to this adjustment, but not where nnt is actually reasonable
    uP = uP.reshape(-1)
    # Zero-argument closures over the local counts; each requested stat is
    # evaluated lazily, exactly once, when its row is filled below.
    compute_stat = {
        'p': lambda: uP,
        'n': lambda: n,
        'ap': lambda: ap,
        'an': lambda: an,
        'pp': lambda: pp,
        'pn': lambda: pn,
        'tp': lambda: tp,
        'fp': lambda: fp,
        'tn': lambda: tn,
        'fn': lambda: fn,
        'tpr': lambda: tp / ap,  # sensitivity, recall
        'fpr': lambda: fp / an,  # fall-out
        'auroc': lambda: np.trapz(tp / ap, fp / an),
        'fnr': lambda: fn / ap,  # miss rate
        'tnr': lambda: tn / an,  # specificity
        'mcr': lambda: (fp + fn) / n,
        'acc': lambda: (tp + tn) / n,
        'fdr': lambda: fp / pp,
        'ppv': lambda: tp / pp,  # precision
        'auprc': lambda: np.trapz(tp / pp, tp / ap),
        'fomr': lambda: fn / pn,  # false omission rate
        'npv': lambda: tn / pn,
        'plr': lambda: (tp / fp) / (ap / an),  # positive likelihood ratio
        'nlr': lambda: (fn / tn) / (ap / an),  # negative likelihood ratio
        'dor': lambda: (tp / fp) / (fn / tn),  # diagnostic odds ratio
        'drr': lambda: (tp / pp) / (fn / pn),  # relative risk, ppv/fomr
        'darr': lambda: (tp / pp) - (fn / pn),  # absolute risk reduction
        'mrr': lambda: (tp / pp) / (ap / n),  # ppv relative to prevalence
        'marr': lambda: (tp / pp) - (ap / n),  # ppv minus prevalence
        'f1s': lambda: 2 * tp / (2 * tp + fp + fn),
        'mcc': lambda: (tp * tn - fp * fn) / np.sqrt(
            (tp + fp) * (tp + fn) * (tn + fp) * (tn + fn)),
        'fnlp': lambda: -stats.hypergeom.logsf(tp, n, ap, pp, loc=1) / np.log(
            10)  # -log10 hypergeometric tail probability
    }
    stat_cut = dataclasses.datamatrix(
        rowname='classifier_performance_stat',
        rowlabels=classifier_stats.copy(),
        rowmeta={},
        columnname='probability_cutoff',
        columnlabels=uP.copy(),
        columnmeta={},
        matrixname='classifier_performance_stats_vs_probability_cutoffs',
        matrix=np.zeros((classifier_stats.size, uP.size), dtype='float64'))
    # Scalar stats (n, ap, auroc, ...) broadcast across the whole row.
    for row, name in enumerate(stat_cut.rowlabels):
        stat_cut.matrix[row, :] = compute_stat[name]()
    if get_priority_cutoffs:
        get_priority_cutoff_metadata(stat_cut, pp_min_frac, xx_min_frac)
    if plot_curves:
        plt.figure()
        plt.subplot(2, 2, 1)
        plt.plot(stat_cut.select('fpr', []), stat_cut.select('tpr', []), 'k-')
        plt.ylabel('tpr, sensitivity, recall')
        plt.xlabel('fpr, 1-specificity, fall-out')
        plt.axis([0, 1, 0, 1])
        plt.subplot(2, 2, 2)
        plt.plot(stat_cut.select('tpr', []), stat_cut.select('ppv', []), 'k-')
        plt.ylabel('ppv, precision, 1-fdr')
        plt.xlabel('tpr, sensitivity, recall')
        plt.axis([0, 1, 0, 1])
        plt.subplot(2, 2, 3)
        plt.plot(stat_cut.select('p', []), stat_cut.select('mcr', []), 'k-')
        plt.ylabel('mcr')
        plt.xlabel('p')
        plt.axis([0, 1, 0, 1])
        plt.gca().invert_xaxis()
        plt.subplot(2, 2, 4)
        plt.plot(stat_cut.select('p', []), stat_cut.select('mcc', []), 'k-')
        plt.ylabel('mcc')
        plt.xlabel('p')
        plt.axis([0, 1, 0, 1])
        plt.gca().invert_xaxis()
    return stat_cut
# Build the per-gene statistics datamatrix: one column per
# (statistic_summary, num_samples) pair, for group_i vs group_j.
# The 16 summary names were previously repeated as four identical inline
# list literals; hoisting the list once removes that duplication without
# changing any label, metadata, or the matrix layout.
statistic_summaries = [
    'pc1loadings_mean', 'pc1loadings_stdv', 'reconerrors_mean',
    'reconerrors_stdv', 'tvalues_mean', 'tvalues_stdv', 'dvalues_mean',
    'dvalues_stdv', 'pranks_mean', 'pranks_stdv', 'tranks_mean',
    'tranks_stdv', 'dranks_mean', 'dranks_stdv',
    'significanceindicators_mean', 'significanceindicators_stdv'
]
results = dc.datamatrix(
    rowname=subds.columnname,
    rowlabels=subds.columnlabels.copy(),
    rowmeta=copy.deepcopy(subds.columnmeta),
    columnname='statistic_summary_numsamples',
    # Column order: summaries vary slowest, num_samples fastest — same
    # iteration order as the matrix concatenation below.
    columnlabels=np.array([
        '{0}_N{1!s}'.format(x, y) for x in statistic_summaries
        for y in num_samples
    ], dtype='object'),
    columnmeta={
        'statistic_summary': np.array(
            [x for x in statistic_summaries for y in num_samples],
            dtype='object'),
        # e.g. 'pc1loadings_mean' -> statistic 'pc1loadings', summary 'mean'
        'statistic': np.array(
            [x.split('_')[0] for x in statistic_summaries
             for y in num_samples], dtype='object'),
        'summary': np.array(
            [x.split('_')[1] for x in statistic_summaries
             for y in num_samples], dtype='object'),
        'numsamples': np.array(
            [y for x in statistic_summaries for y in num_samples],
            dtype='int64')
    },
    matrixname='gene_statistics_for_{0}_vs_{1}'.format(group_i, group_j),
    # reconerrors are per-sample-count scalars, so broadcast them across the
    # gene axis to match the other (num_samples, num_genes) blocks.
    matrix=np.concatenate(
        (pc1loadings_mean, pc1loadings_stdv,
         np.broadcast_to(reconerrors_mean.reshape(-1, 1),
                         (num_samples.size, subds.shape[1])),
         np.broadcast_to(reconerrors_stdv.reshape(-1, 1),
                         (num_samples.size, subds.shape[1])), tvalues_mean,
         tvalues_stdv, dvalues_mean, dvalues_stdv, pranks_mean, pranks_stdv,
         tranks_mean, tranks_stdv, dranks_mean, dranks_stdv,
         significanceindicators_mean, significanceindicators_stdv), 0).T)
def main(validation_rep=0, validation_fold=0):
    """Select useful features for one validation rep/fold.

    Runs incremental feature elimination with repeated stratified
    cross-validation for a random forest and a logistic regression, picks the
    simplest model whose auroc/auprc score is within 95% of the best, rebuilds
    the design matrix from the selected features, refits the chosen model, and
    saves matrices plus predictions under
    datasets/useful_features/rep{rep}_fold{fold}/.

    Parameters
    ----------
    validation_rep : int
        Index of the held-out validation repetition (selects input paths).
    validation_fold : int
        Index of the held-out validation fold (selects input paths).
    """
    # load dataset info
    print('loading dataset info...', flush=True)
    dataset_info_path = 'datasets/merged_features/rep{0!s}_fold{1!s}/dataset_info.txt'.format(
        validation_rep, validation_fold)
    # [0]: only the first dataset entry in the info file is used
    dataset_info = datasetIO.load_datasetinfo(dataset_info_path)[0]

    # load validation examples
    print('loading validation examples...', flush=True)
    validation_examples_path = 'targets/validation_examples/rep{0!s}_fold{1!s}.txt'.format(
        validation_rep, validation_fold)
    with open(validation_examples_path, mode='rt', encoding='utf-8', errors='surrogateescape') as fr:
        validation_examples = fr.read().split('\n')

    # specify results folder
    print('specifying results folder...', flush=True)
    results_folder = 'datasets/useful_features/rep{0!s}_fold{1!s}'.format(
        validation_rep, validation_fold)
    # create each path component in turn if missing
    # NOTE(review): os.makedirs(results_folder, exist_ok=True) would replace
    # this loop — left as-is to keep the code byte-identical.
    results_folder_parts = results_folder.split('/')
    for i in range(len(results_folder_parts)):
        results_folder_part = '/'.join(results_folder_parts[:i + 1])
        if not os.path.exists(results_folder_part):
            os.mkdir(results_folder_part)

    # load dataset
    print('loading dataset {0}...'.format(dataset_info['abbreviation']),
          flush=True)
    gene_atb = datasetIO.load_datamatrix(datasetpath=dataset_info['path'])

    # specify cross-validation parameters
    print('specifying cross-validation parameters...', flush=True)
    reps = 20
    folds = 5
    rf_trees = 1000
    include_logistic_regression = True
    skf = StratifiedKFold(n_splits=folds, shuffle=True)
    print(' reps: {0!s}'.format(reps))
    print(' folds: {0!s}'.format(folds))

    # initialize models
    print('initializing models...', flush=True)
    rfmodel = RandomForestClassifier(n_estimators=rf_trees,
                                     oob_score=False,
                                     n_jobs=-1,
                                     class_weight='balanced')
    print(rfmodel)
    lrmodel = LogisticRegression(penalty='l2',
                                 dual=False,
                                 tol=0.0001,
                                 C=1e3,
                                 fit_intercept=True,
                                 intercept_scaling=1e3,
                                 class_weight='balanced',
                                 random_state=None,
                                 solver='liblinear',
                                 max_iter=100,
                                 multi_class='ovr',
                                 verbose=0,
                                 warm_start=False,
                                 n_jobs=1)
    print(lrmodel)

    # initialize data matrices for collecting model feature importances and cross-validation performance stats
    print(
        'initializing data matrices for collecting model feature importances and cross-validation performance stats...',
        flush=True)
    classifier_stats = np.array([
        'p', 'n', 'ap', 'an', 'pp', 'pn', 'tp', 'fp', 'tn', 'fn', 'tpr',
        'fpr', 'auroc', 'fnr', 'tnr', 'mcr', 'acc', 'fdr', 'ppv', 'auprc',
        'fomr', 'npv', 'plr', 'nlr', 'dor', 'drr', 'darr', 'mrr', 'marr',
        'f1s', 'mcc', 'fnlp'
    ], dtype='object')
    # template: one column per candidate model M0..M{k-1} (k = num features)
    sm = dataclasses.datamatrix(
        rowname='classifier_performance_stat',
        rowlabels=classifier_stats.copy(),
        rowmeta={},
        columnname='model',
        columnlabels=np.array(['M' + str(x) for x in range(gene_atb.shape[1])],
                              dtype='object'),
        columnmeta={
            'num_features': np.zeros(gene_atb.shape[1], dtype='int64'),
            'features': np.full(gene_atb.shape[1], '', dtype='object'),
            'oob_score': np.zeros(gene_atb.shape[1], dtype='float64')
        },
        matrixname='crossvalidation_classifier_performance_stats_vs_models',
        matrix=np.zeros((classifier_stats.size, gene_atb.shape[1]),
                        dtype='float64'))
    stat_model_rf_mean = copy.deepcopy(sm)
    stat_model_rf_stdv = copy.deepcopy(sm)
    stat_model_lr_mean = copy.deepcopy(sm)
    stat_model_lr_stdv = copy.deepcopy(sm)
    del sm
    # template: feature-importance of each feature (row) for each model (col)
    fm = dataclasses.datamatrix(
        rowname=gene_atb.columnname,
        rowlabels=gene_atb.columnlabels.copy(),
        rowmeta=copy.deepcopy(gene_atb.columnmeta),
        columnname='model',
        columnlabels=np.array(['M' + str(x) for x in range(gene_atb.shape[1])],
                              dtype='object'),
        columnmeta={
            'num_features': np.zeros(gene_atb.shape[1], dtype='int64'),
            'features': np.full(gene_atb.shape[1], '', dtype='object'),
            'oob_score': np.zeros(gene_atb.shape[1], dtype='float64')
        },
        matrixname='model_feature_importances',
        matrix=np.zeros((gene_atb.shape[1], gene_atb.shape[1]),
                        dtype='float64'))
    feature_model_rf = copy.deepcopy(fm)
    feature_model_lr = copy.deepcopy(fm)
    del fm

    # exclude validation and unlabeled examples from cross-validation loop
    print(
        'excluding validation and unlabeled examples from cross-validation loop...',
        flush=True)
    isvalidation = np.in1d(gene_atb.rowlabels, validation_examples)
    isunknown = gene_atb.rowmeta['class'] == 'unknown'
    istraintest = ~np.logical_or(isvalidation, isunknown)
    Y = (gene_atb.rowmeta['class'][istraintest] == 'positive')
    #X = gene_atb.matrix[istraintest,:]

    # perform incremental feature elimination with cross-validation
    print(
        'performing incremental feature elimination with cross-validation...',
        flush=True)
    for i in range(gene_atb.shape[1]):
        print(' features: {0!s}...'.format(gene_atb.shape[1] - i), flush=True)
        if i == 0:
            # model 0 uses all features
            hit_rf = np.ones(gene_atb.shape[1], dtype='bool')
            hit_lr = np.ones(gene_atb.shape[1], dtype='bool')
        else:
            # drop the least-important surviving feature from the previous
            # model (strictly greater than the smallest nonzero importance)
            hit_rf = feature_model_rf.matrix[:, i - 1] > feature_model_rf.matrix[
                feature_model_rf.matrix[:, i - 1] > 0, i - 1].min()
            #hit_lr = feature_model_lr.matrix[:,i-1] > feature_model_lr.matrix[feature_model_lr.matrix[:,i-1] > 0,i-1].min()
            # lr uses the rf-selected feature set (see commented line above)
            hit_lr = hit_rf
        X_rf = gene_atb.matrix[istraintest, :][:, hit_rf]
        X_lr = gene_atb.matrix[istraintest, :][:, hit_lr]
        stat_rep_rf = np.zeros((classifier_stats.size, reps), dtype='float64')
        stat_rep_lr = np.zeros((classifier_stats.size, reps), dtype='float64')
        fi_rep_rf = np.zeros((X_rf.shape[1], reps), dtype='float64')
        fi_rep_lr = np.zeros((X_lr.shape[1], reps), dtype='float64')
        for rep in range(reps):
            print(' rep {0!s} of {1!s}...'.format(rep + 1, reps), flush=True)
            # out-of-fold predictions for every train/test example
            Ptest_rf = np.zeros(Y.size, dtype='float64')
            Ptest_lr = np.zeros(Y.size, dtype='float64')
            fi_fold_rf = np.zeros((X_rf.shape[1], folds), dtype='float64')
            fi_fold_lr = np.zeros((X_lr.shape[1], folds), dtype='float64')
            for fold, (train_indices,
                       test_indices) in enumerate(skf.split(X_rf, Y)):
                print(' fold {0!s} of {1!s}...'.format(fold + 1, folds),
                      flush=True)
                Y_train = Y[train_indices]
                X_rf_train = X_rf[train_indices]
                X_lr_train = X_lr[train_indices]
                #Y_test = Y[test_indices]
                X_rf_test = X_rf[test_indices]
                X_lr_test = X_lr[test_indices]
                rfmodel.fit(X_rf_train, Y_train)
                # probability of the positive class (classes_ == 1)
                Ptest_rf[test_indices] = rfmodel.predict_proba(
                    X_rf_test)[:, rfmodel.classes_ == 1].reshape(-1)
                fi_fold_rf[:, fold] = rfmodel.feature_importances_
                lrmodel.fit(X_lr_train, Y_train)
                Ptest_lr[test_indices] = lrmodel.predict_proba(
                    X_lr_test)[:, lrmodel.classes_ == 1].reshape(-1)
                # |coefficient| as the lr importance proxy
                fi_fold_lr[:, fold] = np.abs(lrmodel.coef_.reshape(-1))
            fi_rep_rf[:, rep] = fi_fold_rf.mean(1)
            stat_cut = modelevaluation.get_classifier_performance_stats(
                Y=Y,
                P=Ptest_rf,
                classifier_stats=classifier_stats,
                plot_curves=False,
                get_priority_cutoffs=True)
            # keep stats at the 'p50_cutoff' column flagged in columnmeta
            stat_rep_rf[:, rep] = stat_cut.matrix[:, stat_cut.columnmeta[
                'p50_cutoff']].reshape(-1)
            fi_rep_lr[:, rep] = fi_fold_lr.mean(1)
            stat_cut = modelevaluation.get_classifier_performance_stats(
                Y=Y,
                P=Ptest_lr,
                classifier_stats=classifier_stats,
                plot_curves=False,
                get_priority_cutoffs=True)
            stat_rep_lr[:, rep] = stat_cut.matrix[:, stat_cut.columnmeta[
                'p50_cutoff']].reshape(-1)
        # record mean importances and mean/stdv stats for model i
        feature_model_rf.matrix[hit_rf, i] = fi_rep_rf.mean(1)
        feature_model_rf.columnmeta['num_features'][i] = gene_atb.shape[1] - i
        feature_model_rf.columnmeta['features'][i] = '|'.join(
            gene_atb.columnlabels[hit_rf].tolist())
        stat_model_rf_mean.matrix[:, i] = stat_rep_rf.mean(1)
        stat_model_rf_mean.columnmeta['num_features'][
            i] = gene_atb.shape[1] - i
        stat_model_rf_mean.columnmeta['features'][i] = '|'.join(
            gene_atb.columnlabels[hit_rf].tolist())
        stat_model_rf_stdv.matrix[:, i] = stat_rep_rf.std(1)
        stat_model_rf_stdv.columnmeta['num_features'][
            i] = gene_atb.shape[1] - i
        stat_model_rf_stdv.columnmeta['features'][i] = '|'.join(
            gene_atb.columnlabels[hit_rf].tolist())
        feature_model_lr.matrix[hit_lr, i] = fi_rep_lr.mean(1)
        feature_model_lr.columnmeta['num_features'][i] = gene_atb.shape[1] - i
        feature_model_lr.columnmeta['features'][i] = '|'.join(
            gene_atb.columnlabels[hit_lr].tolist())
        stat_model_lr_mean.matrix[:, i] = stat_rep_lr.mean(1)
        stat_model_lr_mean.columnmeta['num_features'][
            i] = gene_atb.shape[1] - i
        stat_model_lr_mean.columnmeta['features'][i] = '|'.join(
            gene_atb.columnlabels[hit_lr].tolist())
        stat_model_lr_stdv.matrix[:, i] = stat_rep_lr.std(1)
        stat_model_lr_stdv.columnmeta['num_features'][
            i] = gene_atb.shape[1] - i
        stat_model_lr_stdv.columnmeta['features'][i] = '|'.join(
            gene_atb.columnlabels[hit_lr].tolist())

    # concatenate data matrices with model feature importances
    print('concatenating data matrices with model feature importances...',
          flush=True)
    feature_model_rf.columnlabels += '_rf'
    feature_model_rf.columnmeta['model_type'] = np.full(
        feature_model_rf.shape[1], 'random_forest', dtype='object')
    feature_model_lr.columnlabels += '_lr'
    feature_model_lr.columnmeta['model_type'] = np.full(
        feature_model_lr.shape[1], 'logistic_regression', dtype='object')
    feature_model_rf.append(feature_model_lr, 1)
    feature_model = feature_model_rf
    del feature_model_rf, feature_model_lr

    # concatenate data matrices with model cross-validation performance stats
    print(
        'concatenating data matrices with model cross-validation performance stats...',
        flush=True)
    stat_model_rf_mean.rowlabels += '_mean'
    stat_model_rf_stdv.rowlabels += '_stdv'
    stat_model_rf_mean.append(stat_model_rf_stdv, 0)
    stat_model_rf_mean.columnlabels += '_rf'
    stat_model_rf_mean.columnmeta['model_type'] = np.full(
        stat_model_rf_mean.shape[1], 'random_forest', dtype='object')
    stat_model_lr_mean.rowlabels += '_mean'
    stat_model_lr_stdv.rowlabels += '_stdv'
    stat_model_lr_mean.append(stat_model_lr_stdv, 0)
    stat_model_lr_mean.columnlabels += '_lr'
    stat_model_lr_mean.columnmeta['model_type'] = np.full(
        stat_model_lr_mean.shape[1], 'logistic_regression', dtype='object')
    stat_model_rf_mean.append(stat_model_lr_mean, 1)
    stat_model = stat_model_rf_mean
    del stat_model_rf_mean

    # select simplest model (fewest features) with auroc and auprc within 95% of max
    print(
        'selecting simplest model (fewest features) with auroc and auprc within 95% of max...',
        flush=True)
    model_scores = 0.5 * (stat_model.select('auroc_mean', []) +
                          stat_model.select('auprc_mean', []))
    if include_logistic_regression:
        # [-1]: later columns have fewer features, so take the last qualifier
        selected_model_index = np.where(
            model_scores >= 0.95 * model_scores.max())[0][-1]
    else:
        selected_model_index = np.where(
            np.logical_and(
                model_scores >= 0.95 * model_scores[
                    stat_model.columnmeta['model_type'] ==
                    'random_forest'].max(),
                stat_model.columnmeta['model_type'] ==
                'random_forest'))[0][-1]
    selected_model_name = stat_model.columnlabels[selected_model_index]
    selected_model_features = feature_model.rowlabels[
        feature_model.matrix[:, selected_model_index] != 0]
    selected_model_type = stat_model.columnmeta['model_type'][
        selected_model_index]
    selected_model = rfmodel if selected_model_type == 'random_forest' else lrmodel
    gene_atb = gene_atb.tolabels(columnlabels=selected_model_features)
    feature_model_selected = feature_model.tolabels(
        columnlabels=selected_model_name)
    stat_model_selected = stat_model.tolabels(columnlabels=selected_model_name)
    print(' selected_model_name: {0}'.format(selected_model_name), flush=True)
    print(' selected_model_features: {0}'.format(
        '|'.join(selected_model_features)),
          flush=True)

    # iterate over selected features to rebuild design matrix
    print('iterating over selected features to rebuild design matrix...',
          flush=True)
    for i, (selected_feature, dataset_abbreviation) in enumerate(
            zip(gene_atb.columnlabels,
                gene_atb.columnmeta['dataset_abbreviation'])):
        # load dataset
        print(' loading dataset {0}...'.format(dataset_abbreviation),
              flush=True)
        dataset_path = 'datasets/generalizable_features/rep{0!s}_fold{1!s}/{2}.txt.gz'.format(
            validation_rep, validation_fold, dataset_abbreviation)
        gene_atb_i = datasetIO.load_datamatrix(dataset_path)
        gene_atb_i.columnmeta[
            'generalizability_pvalues_corrected'] = gene_atb_i.columnmeta[
                'generalizability_pvalues_corrected'].astype('float64')
        gene_atb_i.columnmeta['dataset_abbreviation'] = np.full(
            gene_atb_i.shape[1], dataset_abbreviation, dtype='object')
        gene_atb_i.columnmeta[
            'dataset_feature'] = gene_atb_i.columnlabels.copy()
        # suffix feature labels with their dataset to keep them unique
        gene_atb_i.columnlabels += '_' + dataset_abbreviation
        gene_atb_i.rowname = 'GeneSym'
        gene_atb_i.columnname = 'Feature'
        if dataset_abbreviation == 'gtextissue_cleaned':
            gene_atb_i.discard(gene_atb_i.rowlabels == 'C12ORF55',
                               0)  # pesky duplicate row
        print(gene_atb_i)
        # select feature
        print(' selecting feature {0}...'.format(selected_feature),
              flush=True)
        gene_atb_i.discard(gene_atb_i.columnlabels != selected_feature, 1)
        # merge dataset
        print(' merging dataset...', flush=True)
        if i == 0:
            gene_atb_selected = copy.deepcopy(gene_atb_i)
            gene_atb_selected.matrixname = 'merged_target_features'
            print(' first dataset, no merge...', flush=True)
        else:
            # inner-join on genes present in both matrices
            common_genes = np.intersect1d(gene_atb_selected.rowlabels,
                                          gene_atb_i.rowlabels)
            gene_atb_selected = gene_atb_selected.tolabels(
                rowlabels=common_genes)
            gene_atb_i = gene_atb_i.tolabels(rowlabels=common_genes)
            gene_atb_selected.append(gene_atb_i, 1)
            print(' common_genes: {0!s}...'.format(common_genes.size),
                  flush=True)

    # normalize features
    print('normalizing features...', flush=True)
    # min-max scale each feature column to [0, 1]
    gene_atb_selected.columnmeta['min'] = gene_atb_selected.matrix.min(0)
    gene_atb_selected.columnmeta['max'] = gene_atb_selected.matrix.max(0)
    gene_atb_selected.matrix = (
        gene_atb_selected.matrix -
        gene_atb_selected.columnmeta['min'].reshape(
            1, -1)) / (gene_atb_selected.columnmeta['max'].reshape(1, -1) -
                       gene_atb_selected.columnmeta['min'].reshape(1, -1))

    # update metadata
    print('updating metadata...', flush=True)
    assert (gene_atb.columnlabels == gene_atb_selected.columnlabels).all()
    # carry over column metadata fields missing from the rebuilt matrix
    for field, values in gene_atb.columnmeta.items():
        if field not in gene_atb_selected.columnmeta:
            gene_atb_selected.columnmeta[field] = values
    print('old_num_genes:{0!s}\tnew_num_genes:{1!s}'.format(
        gene_atb.shape[0], gene_atb_selected.shape[0]),
          flush=True)
    del gene_atb

    # refit selected model
    print('refitting selected model...', flush=True)
    isvalidation = np.in1d(gene_atb_selected.rowlabels, validation_examples)
    isunknown = gene_atb_selected.rowmeta['class'] == 'unknown'
    istraintest = ~np.logical_or(isvalidation, isunknown)
    selected_model.fit(
        gene_atb_selected.matrix[istraintest, :],
        gene_atb_selected.rowmeta['class'][istraintest] == 'positive')

    # get predictions for validation and unlabelled examples
    print('getting predictions for validation and unlabelled examples...',
          flush=True)
    gene_model_selected = dataclasses.datamatrix(
        rowname=gene_atb_selected.rowname,
        rowlabels=gene_atb_selected.rowlabels.copy(),
        rowmeta=copy.deepcopy(gene_atb_selected.rowmeta),
        columnname=stat_model_selected.columnname,
        columnlabels=stat_model_selected.columnlabels.copy(),
        columnmeta=copy.deepcopy(stat_model_selected.columnmeta),
        matrixname=
        'success_probabilities_for_validation_and_unlabelled_examples',
        matrix=selected_model.predict_proba(
            gene_atb_selected.matrix)[:, selected_model.classes_ == 1])
    # keep only validation/unlabelled rows (drop train/test rows)
    gene_model_selected.discard(istraintest, 0)

    # save results
    print('saving {0!s} useful features and model results...'.format(
        gene_atb_selected.shape[1]),
          flush=True)
    dataset_info['path'] = '{0}/{1}.txt.gz'.format(
        results_folder, dataset_info['abbreviation'])
    dataset_info['selected_model_name'] = selected_model_name
    dataset_info['selected_model_features'] = '|'.join(selected_model_features)
    dataset_info['selected_model_type'] = selected_model_type
    dataset_info['crossvalidation_reps'] = reps
    dataset_info['crossvalidation_folds'] = folds
    dataset_info['rf_trees'] = rf_trees
    dataset_info['include_logistic_regression'] = include_logistic_regression
    # flatten the selected model's summary stats into the info dict
    for stat_name, stat_values in zip(stat_model_selected.rowlabels,
                                      stat_model_selected.matrix):
        dataset_info[stat_name] = stat_values.item()
    datasetIO.save_datamatrix(dataset_info['path'], gene_atb_selected)
    datasetIO.save_datamatrix('{0}/stat_model.txt.gz'.format(results_folder),
                              stat_model)
    datasetIO.save_datamatrix(
        '{0}/feature_model.txt.gz'.format(results_folder), feature_model)
    datasetIO.save_datamatrix(
        '{0}/stat_model_selected.txt.gz'.format(results_folder),
        stat_model_selected)
    datasetIO.save_datamatrix(
        '{0}/feature_model_selected.txt.gz'.format(results_folder),
        feature_model_selected)
    datasetIO.save_datamatrix(
        '{0}/gene_model_selected.txt.gz'.format(results_folder),
        gene_model_selected)
    datasetIO.append_datasetinfo('{0}/dataset_info.txt'.format(results_folder),
                                 dataset_info)

    print('done.', flush=True)
classifier_cutoff = 'mcc_cutoff' classifier_stats = np.array([ 'p', 'n', 'ap', 'an', 'pp', 'pn', 'tp', 'fp', 'tn', 'fn', 'tpr', 'fpr', 'auroc', 'fnr', 'tnr', 'mcr', 'acc', 'fdr', 'ppv', 'auprc', 'fomr', 'npv', 'plr', 'nlr', 'dor', 'drr', 'darr', 'mrr', 'marr', 'f1s', 'mcc', 'fnlp' ], dtype='object') # classifier stats for each of 200 repetitions of cross-validation stat_rep = dataclasses.datamatrix( rowname='classifier_performance_stat', rowlabels=classifier_stats.copy(), rowmeta={}, columnname='validation_rep', columnlabels=np.array(['Rep' + str(x) for x in range(validation_reps)], dtype='object'), columnmeta={'validation_folds': np.zeros(validation_reps, dtype='int64')}, matrixname= 'crossvalidation_classifier_performance_stats_across_validation_reps', matrix=np.zeros((classifier_stats.size, validation_reps), dtype='float64')) # classifier stats for each of 200reps*5folds=1000 train-test cycles stat_fold = dataclasses.datamatrix( rowname='classifier_performance_stat', rowlabels=classifier_stats.copy(), rowmeta={}, columnname='validation_rep_and_fold', columnlabels=np.full(validation_reps * validation_folds, '', dtype='object'),
gene_names.append('NA') gene_types.append('NA') entrez_ids.append('NA') print('creating datamatrix object...', flush=True) dataset[partition] = dc.datamatrix( rowname='rsid', rowlabels=np.array(rsids, dtype='object'), rowmeta={ 'chromosome': np.array(chroms, dtype='object'), 'position': np.array(poss, dtype='object'), 'ref_allele': np.array(refs, dtype='object'), 'alt_allele': np.array(alts, dtype='object'), 'ensembl_gene_id': np.array(ensembl_gene_ids, dtype='object'), 'gene_name': np.array(gene_names, dtype='object'), 'gene_type': np.array(gene_types, dtype='object'), 'entrez_id': np.array(entrez_ids, dtype='object') }, columnname='genome_id', columnlabels=np.array(genome_ids, dtype='object'), columnmeta={ 'population': np.array(pops, dtype='object'), 'super_population': np.array(super_pops, dtype='object'), 'gender': np.array(genders, dtype='object') }, matrixname='MHC_phased_genotypes_from_1000_genomes', matrix=np.array(genotype_matrix, dtype='float32')) print(dataset[partition], flush=True) for i in range(5): printdict = { dataset[partition].rowname: dataset[partition].rowlabels[i]
def load_datamatrix(datasetpath, delimiter='\t', dtype='float64',
                    getmetadata=True, getmatrix=True):
    """Load a datamatrix object from a pickle or a delimited text file.

    If the path contains '.pickle', the file is unpickled and returned as-is.
    Otherwise the file (optionally gzipped, detected by '.gz' in the path) is
    parsed as a delimited table with this layout, as evidenced by the parsing
    below:
      - The first line holds the column name followed by column labels; it is
        preceded by one '#' placeholder cell per row-metadata column, so
        skipcolumns = (number of '#' cells) + 1.
      - Subsequent lines whose first cell is '#' carry column metadata: the
        cell at position skipcolumns-1 names the field (suffix after '/'),
        and 'na' (case-insensitive) marks a field to skip.
      - The first non-'#' line starts the data rows; its first cell is the
        row name and cells 1..skipcolumns-1 name the row-metadata fields.
      - Each data row: row label, row-metadata values, then matrix values.

    Args:
        datasetpath: path to the pickle or (possibly gzipped) text file.
        delimiter: field separator for the text format.
        dtype: numpy dtype for the loaded matrix.
        getmetadata: if True, collect row/column metadata; otherwise the
            header is only scanned to count rows to skip.
        getmatrix: if True, load the numeric matrix with np.loadtxt
            (re-reads the file from disk); otherwise an empty (0, 0) matrix
            is substituted.

    Returns:
        A dc.datamatrix built from the parsed labels, metadata, and matrix.
    """
    # NOTE(review): substring test — a path merely containing '.pickle'
    # (or '.gz') anywhere triggers that branch; presumably intentional
    # project convention, confirm before tightening to endswith().
    if '.pickle' in datasetpath:
        with open(datasetpath, 'rb') as fr:
            return pickle.load(fr)
    else:
        if '.gz' in datasetpath:
            openfunc = gzip.open
        else:
            openfunc = open
        # surrogateescape keeps undecodable bytes round-trippable instead of
        # raising on malformed input.
        with openfunc(datasetpath, mode='rt', encoding="utf-8",
                      errors="surrogateescape") as fr:
            rowmeta = {}
            columnmeta = {}
            rowlabels = []
            # First header line: '#' placeholders, column name, column labels.
            entries = [x.strip() for x in fr.readline().split(delimiter)]
            skipcolumns = sum([entry == '#' for entry in entries]) + 1
            columnname = entries[skipcolumns - 1]
            columnlabels = np.array(entries[skipcolumns:], dtype='object')
            firstentry = entries[0]
            skiprows = 1  # header lines consumed so far (for np.loadtxt)
            if getmetadata:
                # Consume column-metadata lines (first cell == '#').
                while firstentry == '#':
                    entries = [
                        x.strip() for x in fr.readline().split(delimiter)
                    ]
                    # Field name is the suffix after '/' in the name cell.
                    columnmetaname = entries[skipcolumns - 1].split('/')[-1]
                    if columnmetaname.lower() != 'na':
                        columnmeta[columnmetaname] = np.array(
                            entries[skipcolumns:], dtype='object')
                    firstentry = entries[0]
                    skiprows += 1
                # The loop exits on the row-header line: first cell is the
                # row name, cells 1..skipcolumns-1 are row-metadata names.
                rowname = firstentry
                rowmetanames = entries[1:skipcolumns]
                if len(rowmetanames) > 0:
                    # Last name cell may carry a '/suffix'; keep the prefix.
                    rowmetanames[-1] = rowmetanames[-1].split('/')[0]
                rowmetaname_idx = {}
                for i, rowmetaname in enumerate(rowmetanames):
                    if rowmetaname.lower() != 'na':
                        rowmeta[rowmetaname] = []
                        rowmetaname_idx[rowmetaname] = i
                # Data rows: only the first skipcolumns cells are needed
                # here (label + row metadata); maxsplit avoids splitting
                # the numeric remainder.
                for line in fr:
                    entries = [
                        x.strip() for x in line.split(
                            delimiter, maxsplit=skipcolumns)[:skipcolumns]
                    ]
                    rowlabels.append(entries.pop(0))
                    for rowmetaname, idx in rowmetaname_idx.items():
                        rowmeta[rowmetaname].append(entries[idx])
                rowlabels = np.array(rowlabels, dtype='object')
                for rowmetaname, rowmetavalues in rowmeta.items():
                    rowmeta[rowmetaname] = np.array(rowmetavalues,
                                                    dtype='object')
            else:
                # Metadata not wanted: just count header lines and collect
                # row labels.
                while firstentry == '#':
                    entries = [
                        x.strip() for x in fr.readline().split(delimiter)
                    ]
                    firstentry = entries[0]
                    skiprows += 1
                rowname = firstentry
                for line in fr:
                    rowlabels.append(
                        line.split(delimiter, maxsplit=1)[0].strip())
                rowlabels = np.array(rowlabels, dtype='object')
        if getmatrix:
            # NOTE(review): this re-reads the file from disk; np.loadtxt
            # handles .gz paths itself. usecols restricts to the numeric
            # columns after the label/metadata columns.
            matrix = np.loadtxt(datasetpath, dtype=dtype,
                                delimiter=delimiter, skiprows=skiprows,
                                usecols=range(skipcolumns,
                                              len(columnlabels) +
                                              skipcolumns), ndmin=2)
        else:
            matrix = np.zeros((0, 0), dtype=dtype)
        matrixname = rowname + '_' + columnname + '_associations_from_' + datasetpath
        return dc.datamatrix(rowname, rowlabels, columnname, columnlabels,
                             matrixname, matrix, rowmeta, columnmeta)
# Keep only the chosen samples: 'hit' is a boolean mask over samples,
# aligned with sample_metadata['sample_id'] and run_ids.
# np.isin is the current API; np.in1d is its deprecated alias.
hit = np.isin(sample_metadata['sample_id'], chosen_samples)
# Filter every metadata field with the same mask (reassigning values for
# existing keys during iteration is safe — no keys are added or removed).
for field, values in sample_metadata.items():
    sample_metadata[field] = values[hit]
run_ids = run_ids[hit]
# Load counts for the selected sample columns only.
# (Fixed: the original had a duplicated "matrix = matrix =" assignment typo.)
# NOTE(review): usecols indexes file columns directly with the sample mask,
# which assumes the counts file has no leading gene-id column — confirm
# against the file layout.
matrix = np.loadtxt(
    '../../original_data/GTEXv6plus/counts_gene.tsv.gz',
    dtype='float64',
    delimiter='\t',
    skiprows=1,
    usecols=hit.nonzero()[0],
    ndmin=2)
# Package the genes-by-samples counts plus sample metadata into a datamatrix.
gene_tissue = dataclasses.datamatrix(
    rowname='ensembl_gene_id',
    rowlabels=ensembl_gene_ids,
    rowmeta={},
    columnname='recount2_run_id',
    columnlabels=run_ids,
    columnmeta=sample_metadata,
    matrixname='recount2_processed_rnaseq_counts_from_gtexv6',
    matrix=matrix)
# Persist both a pickle and a gzipped-text copy of the result.
datasetIO.save_datamatrix(
    '../../original_data/GTEXv6plus/gene_tissue_recount2gtexv6_chosen_samples_counts.pickle',
    gene_tissue)
datasetIO.save_datamatrix(
    '../../original_data/GTEXv6plus/gene_tissue_recount2gtexv6_chosen_samples_counts.txt.gz',
    gene_tissue)
def main(dictionaries, year, datestamp, min_score):
    """Count term-term literature co-occurrences from Termite term-PMID dicts.

    For a pair of term dictionaries, loads per-term PMID sets (pickles
    produced by get_term_pmids_from_termite.py), builds a term-by-term
    matrix of co-mention counts (|PMIDs(row term) & PMIDs(column term)|),
    attaches marginal counts as row/column metadata, and saves the result
    as .txt.gz and .pickle datamatrix files in the working directory.

    Args:
        dictionaries: pair of dictionary names (row, column), each one of
            'HUCELL', 'ANAT', 'INDICATION', 'HUCELLANAT',
            'HUCELLANATINDICATION'.
        year, datestamp, min_score: identify which precomputed term-PMID
            pickle files to load (embedded in the file names).
    """
    print('dictionaries: {0}, {1}'.format(dictionaries[0], dictionaries[1]))
    print('year: {0}'.format(year))
    print('datestamp: {0}'.format(datestamp))
    print('min_score: {0!s}'.format(min_score))
    # set term dictionaries and paths to dicts containing PMIDs for each term
    # these files are generated by get_term_pmids_from_termite.py
    row_dictionary = dictionaries[
        0]  # 'HUCELL', 'ANAT', 'INDICATION', 'HUCELLANAT', 'HUCELLANATINDICATION'
    row_pmids_path = 'term_pmid_dict_dictionary_{0}_year_{1}_datestamp_{2}_minscore_{3!s}.pickle'.format(
        row_dictionary, year, datestamp, min_score)
    column_dictionary = dictionaries[
        1]  # 'HUCELL', 'ANAT', 'INDICATION', 'HUCELLANAT', 'HUCELLANATINDICATION'
    column_pmids_path = 'term_pmid_dict_dictionary_{0}_year_{1}_datestamp_{2}_minscore_{3!s}.pickle'.format(
        column_dictionary, year, datestamp, min_score)
    hucellanat_path = 'term_pmid_dict_dictionary_{0}_year_{1}_datestamp_{2}_minscore_{3!s}.pickle'.format(
        'HUCELLANAT', year, datestamp, min_score)
    if 'HUCELLANAT' in dictionaries and not os.path.exists(hucellanat_path):
        # combine HUCELL and ANAT term-pmid dicts into a single dict
        # (the component file names are derived by deleting the other
        # dictionary's substring from the combined path)
        print('creating {0}...'.format(hucellanat_path), flush=True)
        with open(hucellanat_path.replace('ANAT', ''), 'rb') as fr:
            term_pmids = pickle.load(fr)
        with open(hucellanat_path.replace('HUCELL', ''), 'rb') as fr:
            term_pmids.update(pickle.load(fr))
        with open(hucellanat_path, 'wb') as fw:
            pickle.dump(term_pmids, fw)
        del term_pmids
    hucellanatindication_path = 'term_pmid_dict_dictionary_{0}_year_{1}_datestamp_{2}_minscore_{3!s}.pickle'.format(
        'HUCELLANATINDICATION', year, datestamp, min_score)
    if 'HUCELLANATINDICATION' in dictionaries and not os.path.exists(
            hucellanatindication_path):
        # combine HUCELL ANAT and INDICATION term-pmid dicts into a single dict
        print('creating {0}...'.format(hucellanatindication_path), flush=True)
        with open(
                hucellanatindication_path.replace('HUCELLANATINDICATION',
                                                  'HUCELL'), 'rb') as fr:
            term_pmids = pickle.load(fr)
        with open(
                hucellanatindication_path.replace('HUCELLANATINDICATION',
                                                  'ANAT'), 'rb') as fr:
            term_pmids.update(pickle.load(fr))
        with open(
                hucellanatindication_path.replace('HUCELLANATINDICATION',
                                                  'INDICATION'), 'rb') as fr:
            term_pmids.update(pickle.load(fr))
        with open(hucellanatindication_path, 'wb') as fw:
            pickle.dump(term_pmids, fw)
        del term_pmids
    # first dictionary of biomedical terms
    # load dict mapping terms to PMID sets
    # parse dict to rowlabels and rowmetadata
    print('loading row_dictionary: {0}...'.format(row_dictionary), flush=True)
    with open(row_pmids_path, 'rb') as fr:
        rowterm_pmids = pickle.load(fr)
    rowlabels, rowmeta = get_labels_and_metadata(rowterm_pmids)
    # second dictionary of biomedical terms
    # load dict mapping terms to PMID sets
    # parse dict to columnlabels and columnmetadata
    # (reuse the row data when both dictionaries are the same)
    print('loading column_dictionary: {0}...'.format(column_dictionary),
          flush=True)
    if column_dictionary == row_dictionary:
        columnterm_pmids = rowterm_pmids
        columnlabels = rowlabels
        columnmeta = rowmeta
    else:
        with open(column_pmids_path, 'rb') as fr:
            columnterm_pmids = pickle.load(fr)
        columnlabels, columnmeta = get_labels_and_metadata(columnterm_pmids)
    # create datamatrix object for storing co-occurrence counts and marginal counts
    print(
        'creating datamatrix object for storing co-occurrence counts and marginal counts...'
    )
    term_term = dataclasses.datamatrix(
        rowname='term_dictidname',
        rowlabels=rowlabels.copy(),
        rowmeta=copy.deepcopy(rowmeta),
        columnname='term_dictidname',
        columnlabels=columnlabels.copy(),
        columnmeta=copy.deepcopy(columnmeta),
        matrixname='literature_cooccurrence_from_termite',
        matrix=np.zeros((rowlabels.size, columnlabels.size), dtype='int64'))
    # the datamatrix holds deep copies; release the originals
    del rowlabels, rowmeta, columnlabels, columnmeta
    print(term_term)
    # get co-occurrence counts and marginal counts
    print('calculating co-occurrence counts and marginal counts...')
    row_pmids_intersectionunion = defaultdict(
        set
    )  # the set of PMIDs mentioning row term i and any column term (union of all of the intersections)
    column_pmids_intersectionunion = defaultdict(
        set
    )  # the set of PMIDs mentioning column term j and any row term (union of all of the intersections)
    all_pmids_intersectionunion = set(
    )  # the set of PMIDs mentioning any row term AND any column term ("universe" is limited to publications that have at least one row term association AND at least one column term association)
    all_pmids_union = set(
    )  # the set of PMIDs mentioning any row term OR any column term ("universe" is limited to publications that have at least one row term association OR at least one column term association)
    # *** term_term_union_matrix = np.zeros(term_term.shape, dtype='int64') # the count of PMIDs mentioning row term i OR column term j
    for i, rowlabel in enumerate(term_term.rowlabels):
        # progress report every 100 rows and on the last row
        if np.mod(i, 100) == 0 or i + 1 == term_term.shape[0]:
            print('working on row {0!s} of {1!s}...'.format(
                i + 1, term_term.shape[0]),
                  flush=True)
        row_pmids = rowterm_pmids[rowlabel]
        for j, columnlabel in enumerate(term_term.columnlabels):
            column_pmids = columnterm_pmids[columnlabel]
            intersection_pmids = row_pmids.intersection(column_pmids)
            term_term.matrix[i, j] = len(
                intersection_pmids
            )  # the count of PMIDs mentioning row term i AND column term j
            # all_pmids_union = row_pmids.union(column_pmids)
            # term_term_union_matrix[i,j] = len(all_pmids_union) # the count of PMIDs mentioning row term i OR column term j
            # self-pairs are excluded from the intersection-union marginals
            if rowlabel != columnlabel:
                row_pmids_intersectionunion[rowlabel].update(
                    intersection_pmids)
                column_pmids_intersectionunion[columnlabel].update(
                    intersection_pmids)
        all_pmids_union.update(row_pmids)
        all_pmids_intersectionunion.update(
            row_pmids_intersectionunion[rowlabel])
    # column terms also contribute to the whole-universe PMID union
    for column_pmids in columnterm_pmids.values():
        all_pmids_union.update(column_pmids)
    # include marginal counts as metadata
    print('including marginal counts as datamatrix metadata...')
    # relevant universe (intersection-union counts computed above)
    term_term.rowmeta['term_count_intersectionunion'] = np.array([
        len(row_pmids_intersectionunion[rowlabel])
        for rowlabel in term_term.rowlabels
    ],
                                                                 dtype='int64')
    term_term.columnmeta['term_count_intersectionunion'] = np.array(
        [
            len(column_pmids_intersectionunion[columnlabel])
            for columnlabel in term_term.columnlabels
        ],
        dtype='int64')
    term_term.rowmeta['all_count_intersectionunion'] = np.full(
        term_term.shape[0], len(all_pmids_intersectionunion), dtype='int64')
    term_term.columnmeta['all_count_intersectionunion'] = np.full(
        term_term.shape[1], len(all_pmids_intersectionunion), dtype='int64')
    # whole universe (raw per-term PMID set sizes and the overall union)
    term_term.rowmeta['term_count_union'] = np.array(
        [len(rowterm_pmids[rowlabel]) for rowlabel in term_term.rowlabels],
        dtype='int64')
    term_term.columnmeta['term_count_union'] = np.array([
        len(columnterm_pmids[columnlabel])
        for columnlabel in term_term.columnlabels
    ],
                                                        dtype='int64')
    term_term.rowmeta['all_count_union'] = np.full(term_term.shape[0],
                                                   len(all_pmids_union),
                                                   dtype='int64')
    term_term.columnmeta['all_count_union'] = np.full(term_term.shape[1],
                                                      len(all_pmids_union),
                                                      dtype='int64')
    # *** no need to calculate term_term_union_matrix
    # if want this as universe size,
    # start with universe size = all_count_intersectionunion
    # calculate true positive, true negatives, false positives, false negatives
    # subtract true negatives from universe size and set true negatives to zero
    # save results (same matrix written as gzipped text and as pickle)
    print('saving results...')
    datasetIO.save_datamatrix(
        '{0}_{1}_datamatrix_pmidcounts_year_{2}_datestamp_{3}_minscore_{4!s}.txt.gz'
        .format(row_dictionary, column_dictionary, year, datestamp,
                min_score), term_term)
    datasetIO.save_datamatrix(
        '{0}_{1}_datamatrix_pmidcounts_year_{2}_datestamp_{3}_minscore_{4!s}.pickle'
        .format(row_dictionary, column_dictionary, year, datestamp,
                min_score), term_term)
    print('done count_term-term_pmids_from_termite.py', flush=True)