def analysis_go(self, s_go, S_hit, N_total=0, SRC_GENE=None, min_overlap=3):
    """Hypergeometric enrichment test of hit list S_hit against GO term s_go.

    Parameters
    ----------
    s_go : GO term identifier, key into self.GO_GENE.
    S_hit : iterable of gene IDs (hit list); IDs must be int-convertible.
    N_total : library size; defaults to len(self.ALL_GENE) when falsy.
    SRC_GENE : optional background gene set; when given, both the GO gene
        set and the hit list are restricted to it.
    min_overlap : minimum sizes required for GO set, hit list, and overlap.

    Returns
    -------
    dict of statistics, or None when any size is below min_overlap.

    Fixes vs. previous version:
    - '#TotalGeneInLibrary' now reports the defaulted N_total instead of
      the raw argument (previously stayed 0 when N_total was defaulted).
    - Removed an unreachable second `< min_overlap` check that returned c;
      the earlier identical check already returned None.
    """
    S_gene = self.GO_GENE[s_go]
    if not N_total:
        N_total = len(self.ALL_GENE)
    if SRC_GENE is not None:
        # Restrict both sides of the test to the provided background.
        S_gene = S_gene.intersection(SRC_GENE)
        S_hit = set(S_hit).intersection(SRC_GENE)
    else:
        S_hit = set(S_hit)
    c = {'GO': s_go, '#TotalGeneInLibrary': N_total,
         '#GeneInGO': len(S_gene), '#GeneInHitList': len(S_hit),
         '#GeneInGOAndHitList': 0, 'LogP': 0.0, 'Enrichment': 0}
    if c['#GeneInGO'] < min_overlap or c['#GeneInHitList'] < min_overlap:
        return None
    S_both = S_gene.intersection(S_hit)
    c['#GeneInGOAndHitList'] = len(S_both)
    if c['#GeneInGOAndHitList'] < min_overlap:
        return None
    c['%InGO'] = c['#GeneInGOAndHitList'] * 100.0 / c['#GeneInHitList']
    # Clamp q away from 0 and 1 so the binomial stdev below is never zero.
    q = min(max(c['%InGO'] / 100, 1.0 / c['#GeneInHitList']),
            1 - 1.0 / c['#GeneInHitList'])
    c['STDV %InGO'] = np.sqrt(q * (1 - q) / c['#GeneInHitList']) * 100
    c['Enrichment'] = c['%InGO'] / 100.0 * N_total / c['#GeneInGO']
    # Pipe-joined overlap gene IDs in ascending numeric order.
    S = sorted(int(x) for x in S_both)
    c['GeneID'] = '|'.join(str(x) for x in S)
    # logsf(k-1, ...) gives log P(X >= k); convert natural log to log10.
    c['LogP'] = hypergeom.logsf(c['#GeneInGOAndHitList'] - 1, N_total,
                                c['#GeneInGO'], c['#GeneInHitList'])
    c['LogP'] = c['LogP'] / TO_LOG10
    return c
def get_single_enrichment(self, gene_list, term, gene_universe=15000, gene_list_size=None):
    """Log survival-function p-value for enrichment of `term` in `gene_list`.

    Rows of self.gene_term_df annotated with `term` define the term's gene
    set; the overlap with `gene_list` is tested hypergeometrically against
    a universe of `gene_universe` genes.
    """
    is_term_row = self.gene_term_df[self.go_column_label] == term
    annotated = self.gene_term_df[is_term_row]
    term_size = len(annotated)
    # Subtract 1 so logsf (strictly-greater tail) yields log P(overlap >= observed).
    overlap_minus_one = sum(annotated[self.gene_column_label].isin(gene_list)) - 1
    draws = gene_list_size if gene_list_size else len(gene_list)
    return hypergeom.logsf(overlap_minus_one, gene_universe, term_size, draws)
def _prob_hypergeo_fast(y_compute, name, X, M, n, N): """Compute hypergeometric Pvalue. Description ----------- Suppose you have a lot of 100 floppy disks (M), and you know that 20 of them are defective (n). What is the prbability of drawing zero to 2 floppy disks (N=2), if you select 10 at random (N). P=hypergeom.sf(2,100,20,10) """ P = np.nan logP = np.nan # M = len(yc) # Population size: Total number of samples, eg total number of genes; 10000 # n = np.sum(datac) # Number of successes in population, known in pathway, eg 2000 # N = np.sum(yc) # sample size: Random variate, eg clustersize or groupsize, over expressed genes, eg 300 # X = np.sum(np.logical_and(yc, datac.values)) - 1 # Let op, de -1 is belangrijk omdatje P<X wilt weten ipv P<=X. Als je P<=X doet dan kan je vele false positives krijgen als bijvoorbeeld X=1 en n=1 oid # Do the hypergeo-test if y_compute and (X > 0): P = hypergeom.sf(X, M, n, N) logP = hypergeom.logsf(X, M, n, N) # Store out = {} out['category_label'] = name out['P'] = P out['logP'] = logP out['overlap_X'] = X out['popsize_M'] = M out['nr_succes_pop_n'] = n out['samplesize_N'] = N out['dtype'] = 'categorical' return (out)
def calculate_enrichment(gene_data, go_to_genes, n=100):
    """Hypergeometric enrichment of GO terms in the top-n and bottom-n genes.

    Parameters
    ----------
    gene_data : iterable of (gene_name, score) tuples.
    go_to_genes : dict mapping GO id -> collection of gene names.
    n : number of top/bottom genes to test (default 100).

    Returns
    -------
    (positive_sorted, negative_sorted) : two lists of (goid, log_p) tuples,
        each sorted ascending by log p-value (most enriched first).

    Bug fix: the old code used `zip(*top_genes)[0]`, which raises TypeError
    on Python 3 because zip objects are not subscriptable.
    """
    # get top n gene list
    top_sorted_genes = sorted(gene_data, key=lambda tup: tup[1], reverse=True)
    top_genes = top_sorted_genes[:n]
    top_gene_names = [name for name, _score in top_genes]
    # get bottom n gene list
    bot_sorted_genes = sorted(gene_data, key=lambda tup: tup[1])
    bot_genes = bot_sorted_genes[:n]
    bot_gene_names = [name for name, _score in bot_genes]
    tot_genes = len(gene_data)
    # Score entry per GO id: [hits (k), total genes (M), list size (n), GO size (N), log p].
    # Note hypergeom is symmetric in its last two arguments, so the n/N order is valid.
    top_score_list = dict((k, [0, tot_genes, n, len(go_to_genes[k]), 0]) for k in go_to_genes.keys())
    bot_score_list = dict((k, [0, tot_genes, n, len(go_to_genes[k]), 0]) for k in go_to_genes.keys())
    # calculate top hits
    for g in top_gene_names:
        for goid in top_score_list:
            if g in go_to_genes[goid]:
                top_score_list[goid][0] += 1
    # calculate bottom hits
    for g in bot_gene_names:
        for goid in bot_score_list:
            if g in go_to_genes[goid]:
                bot_score_list[goid][0] += 1
    positive_enrichment_scores = []
    negative_enrichment_scores = []
    # calculate scores; logsf(hits-1, ...) gives log P(X >= hits)
    for goid in top_score_list:
        top_score_list[goid][4] = hypergeom.logsf(top_score_list[goid][0] - 1,
                                                  top_score_list[goid][1],
                                                  top_score_list[goid][2],
                                                  top_score_list[goid][3])
        positive_enrichment_scores.append((goid, top_score_list[goid][4]))
        bot_score_list[goid][4] = hypergeom.logsf(bot_score_list[goid][0] - 1,
                                                  bot_score_list[goid][1],
                                                  bot_score_list[goid][2],
                                                  bot_score_list[goid][3])
        negative_enrichment_scores.append((goid, bot_score_list[goid][4]))
    positive_enrichment_scores_sorted = sorted(positive_enrichment_scores, key=lambda tup: tup[1])
    negative_enrichment_scores_sorted = sorted(negative_enrichment_scores, key=lambda tup: tup[1])
    return positive_enrichment_scores_sorted, negative_enrichment_scores_sorted
def calc_enrich(self, path_file, all_sigs=True):
    """Per-signature KEGG pathway enrichment.

    For each signature column of self.hw_genes_all, computes a negated
    hypergeometric log p-value against every pathway listed in the
    tab-separated file at `path_file` (column 0 = pathway name, column 2 =
    ';'-joined gene list), and returns a DataFrame of signatures x pathways
    with infinities and 'nan' replaced by 0.

    NOTE(review): the `not all_sigs` branch assigns the same attribute as
    the line above it, so `all_sigs` currently has no effect — presumably a
    different attribute was intended; confirm against the class definition.
    """
    #print('calc_enrich')
    hw_temp = self.hw_genes_all
    if (not all_sigs):
        hw_temp = self.hw_genes_all
    kegg = pandas.read_csv(path_file, header=None, sep='\t')
    temp_kegg_en = [1] * hw_temp.shape[1]
    for i in range(hw_temp.shape[1]):
        #print(i)
        path_en = []
        # Genes flagged for signature i, mapped to compendium index labels.
        sig_genes = self.compendium.index[np.where(hw_temp[:, i])]
        for j in range(kegg.shape[0]):
            path_genes = kegg[2][j].split(';')
            # Overlap minus 1 so logsf gives log P(overlap >= observed).
            x = len(list(set(sig_genes) & set(path_genes))) - 1
            n = self.weights.shape[0]
            k = len(sig_genes)
            m = len(path_genes)
            p = hypergeom.logsf(x, n, k, m)
            path_en.append(-p)
        #path_en_c = multipletests(path_en)[1]
        temp_kegg_en[i] = path_en
    kegg_df = pandas.DataFrame(temp_kegg_en, columns=kegg[0])
    return (kegg_df.replace([np.inf, -np.inf, 'nan'], 0))
def main(datamatrix_path, test_index, response_variable_name, valid_index, valid_fraction, feature_fraction, regularization_type, inverse_regularization_strength, intercept_scaling, pos_neg_weight_ratio, evaluation_statistic, save_weights, save_folder, datamatrix):
    """Fit and evaluate a class-weighted logistic regression on one
    bootstrap (test_index) / validation (valid_index) split of a DataMatrix.

    Writes histograms, per-stage performance DataMatrix files, and the
    selected evaluation statistic (as JSON) into save_folder.

    NOTE(review): `flush=False` on every print is the default and has no
    effect. NOTE(review): the LeaveOneOut loop below rebinds the function
    parameter `test_index`; the testing-stage DataMatrix built afterwards
    uses `test_index` in its row labels/metadata, so under 'loo' it records
    the last LOO index instead of the original argument — confirm intent.
    """
    print('loading datamatrix...', flush=False)
    # Accept either a path (load from disk) or an already-loaded DataMatrix.
    if datamatrix == None or type(datamatrix) == str:
        dm = datasetIO.load_datamatrix(datamatrix_path)
    else:
        dm = datamatrix
    print('setting random seed with test_index {0!s}...'.format(test_index), flush=False)
    np.random.seed(test_index)
    print('getting bootstrap sample...', flush=False)
    all_indices = np.arange(dm.shape[0])
    boot_indices = np.random.choice(dm.shape[0], dm.shape[0], replace=True)
    # Out-of-bag rows (never drawn by the bootstrap) become the test set.
    test_indices = all_indices[~np.in1d(all_indices, boot_indices)]
    print('reserving out-of-bag samples as test set...', flush=False)
    Y = {'test': dm.rowmeta[response_variable_name][test_indices].astype('bool')}
    X = {'test': dm.matrix[test_indices, :]}
    print('setting random seed with valid_index {0!s}...'.format(valid_index), flush=False)
    np.random.seed(valid_index)
    print('splitting bootstrap sample into training and validation sets...', flush=False)
    if type(valid_fraction) == str and (valid_fraction.lower() == 'loo' or valid_fraction.lower() == 'loocv'):
        # Leave-one-out mode: train and validate on the full bootstrap sample.
        valid_fraction = 'loo'
        valid_indices = all_indices
        train_indices = all_indices
    else:
        valid_indices = np.random.choice(dm.shape[0], round(valid_fraction * dm.shape[0]), replace=False)
        train_indices = all_indices[~np.in1d(all_indices, valid_indices)]
    Y['train'] = dm.rowmeta[response_variable_name][boot_indices][train_indices].astype('bool')
    Y['valid'] = dm.rowmeta[response_variable_name][boot_indices][valid_indices].astype('bool')
    X['train'] = dm.matrix[boot_indices, :][train_indices, :]
    X['valid'] = dm.matrix[boot_indices, :][valid_indices, :]
    print('fitting and evaluating models...', flush=False)
    stages = ['validation', 'testing']
    data_subsets = ['fit', 'predict']
    performance_stats = [
        'auroc', 'auprc', 'brier', 'nll', 'tp', 'fn', 'tn', 'fp', 'ap', 'an',
        'pp', 'pn', 'n', 'tpr', 'fnr', 'tnr', 'fpr', 'ppv', 'fdr', 'npv',
        'fomr', 'acc', 'mcr', 'prev', 'plr', 'nlr', 'dor', 'drr', 'darr',
        'mrr', 'marr', 'mcc', 'fnlp', 'f1', 'f1_100', 'f1_50', 'f1_25',
        'f1_10', 'f1_5', 'f1_3', 'f1_2', 'f2', 'f3', 'f5', 'f10', 'f25',
        'f50', 'f100'
    ]
    # Define fit/predict subsets for each stage; outside LOO mode the testing
    # fit set is train+valid concatenated.
    if valid_fraction == 'loo':
        X.update({
            'validation': {'fit': X['train'], 'predict': X['valid']},
            'testing': {'fit': X['train'], 'predict': X['test']}
        })
        Y.update({
            'validation': {'fit': Y['train'], 'predict': Y['valid']},
            'testing': {'fit': Y['train'], 'predict': Y['test']}
        })
    else:
        X.update({
            'validation': {'fit': X['train'], 'predict': X['valid']},
            'testing': {'fit': np.append(X['train'], X['valid'], 0), 'predict': X['test']}
        })
        Y.update({
            'validation': {'fit': Y['train'], 'predict': Y['valid']},
            'testing': {'fit': np.append(Y['train'], Y['valid']), 'predict': Y['test']}
        })
    stat_subset = {}
    for stage in stages:
        print('working on {0} stage...'.format(stage), flush=False)
        if feature_fraction < 1:
            # Keep the top feature_fraction of features by Welch t-test p-value.
            print('performing univariate feature selection...', flush=False)
            num_features = round(feature_fraction * dm.shape[1])
            test_stats, p_values = ttest_ind(
                X[stage]['fit'][Y[stage]['fit'], :],
                X[stage]['fit'][~Y[stage]['fit'], :],
                axis=0, equal_var=False, nan_policy='propagate')
            ranks = np.argsort(p_values)
            selected_indices = ranks[:num_features]
            selected_features = dm.columnlabels[selected_indices]
            if stage == 'testing':
                print('plotting univariate test statistics...', flush=False)
                plt.figure()
                plt.hist(test_stats, 50)
                plt.savefig('{0}/univariate_test_statistics.png'.format(save_folder), transparent=True, pad_inches=0, dpi=100)
                plt.figure()
                plt.hist(p_values, 50)
                plt.savefig('{0}/univariate_pvalues.png'.format(save_folder), transparent=True, pad_inches=0, dpi=100)
                plt.figure()
                plt.hist(-np.log10(p_values), 50)
                plt.savefig('{0}/univariate_nlps.png'.format(save_folder), transparent=True, pad_inches=0, dpi=100)
        else:
            print('skipping univariate feature selection...', flush=False)
            selected_indices = np.arange(dm.shape[1], dtype='int64')
            selected_features = dm.columnlabels.copy()
        print('selected {0!s} features...'.format(selected_features.size), flush=False)
        print('calculating class weights...', flush=False)
        pos_weight = np.sqrt(pos_neg_weight_ratio) * (
            (Y[stage]['fit'].size) / 2 / (Y[stage]['fit'].sum())
        )  # (assign weight to class)*(adjust for unbalanced classes)
        neg_weight = (1 / pos_weight) * (
            (Y[stage]['fit'].size) / 2 / ((~Y[stage]['fit']).sum())
        )  # (assign weight to class)*(adjust for unbalanced classes)
        class_weight = {True: pos_weight, False: neg_weight}
        print('fitting model...', flush=False)
        logistic_regression_model = LogisticRegression(
            penalty=regularization_type,
            C=inverse_regularization_strength,
            intercept_scaling=intercept_scaling,
            class_weight=class_weight).fit(
                X[stage]['fit'][:, selected_indices], Y[stage]['fit'])
        if stage == 'testing':
            print('plotting feature weights...', flush=False)
            # One-row DataMatrix of coefficients over ALL features; unselected
            # features keep weight 0.
            iter_feature = DataMatrix(
                rowname='iteration',
                rowlabels=np.array(
                    ['test{0!s}_valid{1!s}'.format(test_index, valid_index)],
                    dtype='object'),
                rowmeta={
                    'intercept': logistic_regression_model.intercept_,
                    'test_index': np.array([test_index], dtype='int64'),
                    'valid_index': np.array([valid_index], dtype='int64')
                },
                columnname=dm.columnname,
                columnlabels=dm.columnlabels.copy(),
                columnmeta=copy.deepcopy(dm.columnmeta),
                matrixname='feature_weights',
                matrix=np.zeros((1, dm.shape[1]), dtype='float64'))
            feature_idx = {f: i for i, f in enumerate(dm.columnlabels)}
            for feature, weight in zip(selected_features, logistic_regression_model.coef_[0, :]):
                iter_feature.matrix[0, feature_idx[feature]] = weight
            plt.figure()
            plt.hist(iter_feature.matrix[0, :], 50)
            plt.savefig('{0}/feature_weights.png'.format(save_folder), transparent=True, pad_inches=0, dpi=100)
            if feature_fraction < 1:
                plt.figure()
                plt.hist(iter_feature.matrix[0, selected_indices], 50)
                plt.savefig('{0}/feature_weights_selected.png'.format(save_folder), transparent=True, pad_inches=0, dpi=100)
            if save_weights:
                print('saving feature weights...', flush=False)
                datasetIO.save_datamatrix('{0}/iter_feature_datamatrix.txt.gz'.format(save_folder), iter_feature)
        print('creating datamatrix for performance statistics...', flush=False)
        stat_subset[stage] = DataMatrix(
            rowname='performance_statistic',
            rowlabels=np.array(performance_stats, dtype='object'),
            rowmeta={},
            columnname='data_subset',
            columnlabels=np.array(data_subsets, dtype='object'),
            columnmeta={},
            matrixname='classifier_performance_on_data_subsets',
            matrix=np.zeros((len(performance_stats), len(data_subsets)), dtype='float64'))
        for j, subset in enumerate(stat_subset[stage].columnlabels):
            print('evaluating performance on {0} subset...'.format(subset), flush=False)
            if valid_fraction == 'loo' and stage == 'validation' and subset == 'predict':
                # LOO validation: refit once per held-out sample.
                # NOTE(review): this loop rebinds the `test_index` parameter.
                P_pred = np.zeros(X[stage][subset].shape[0], dtype='float64')
                for train_index, test_index in LeaveOneOut().split(X[stage][subset]):
                    logistic_regression_model = LogisticRegression(
                        penalty=regularization_type,
                        C=inverse_regularization_strength,
                        intercept_scaling=intercept_scaling,
                        class_weight=class_weight).fit(
                            X[stage]['fit'][train_index, :][:, selected_indices],
                            Y[stage]['fit'][train_index])
                    P_pred[test_index] = logistic_regression_model.predict_proba(
                        X[stage][subset][test_index, :][:, selected_indices]
                    )[:, logistic_regression_model.classes_ == 1][0][0]
            else:
                P_pred = logistic_regression_model.predict_proba(
                    X[stage][subset][:, selected_indices]
                )[:, logistic_regression_model.classes_ == 1]
            Y_pred = P_pred > 0.5
            auroc = roc_auc_score(Y[stage][subset], P_pred)
            auprc = average_precision_score(Y[stage][subset], P_pred)
            brier = brier_score_loss(Y[stage][subset], P_pred)
            nll = log_loss(Y[stage][subset], P_pred)
            tn, fp, fn, tp = confusion_matrix(Y[stage][subset], Y_pred).ravel()
            # incorporate a prior with effective sample size = n_eff, where
            # prior represents random predictions (also avoids divide-by-zero
            # in the ratio statistics below)
            n_eff = 1
            prevalence = (tp + fn) / (tn + fp + fn + tp)
            tp += n_eff * prevalence / 2
            fn += n_eff * prevalence / 2
            tn += n_eff * (1 - prevalence) / 2
            fp += n_eff * (1 - prevalence) / 2
            ap = tp + fn
            an = fp + tn
            pp = tp + fp
            pn = tn + fn
            n = tn + fp + fn + tp
            tpr = tp / ap  # sensitivity, recall
            fnr = fn / ap  # 1-tpr, 1-sensitivity, 1-recall
            tnr = tn / an  # specificity
            fpr = fp / an  # 1-tnr, 1-specificity
            ppv = tp / pp  # precision
            fdr = fp / pp  # 1-ppv, 1-precision
            npv = tn / pn
            fomr = fn / pn  # 1-npv
            acc = (tp + tn) / n
            mcr = (fp + fn) / n  # 1-acc
            prev = ap / n
            plr = (tp / fp) / (
                ap / an
            )  # tpr/fpr, sensitivity/(1-specificity), ratio of positives to negatives in positive predictions relative to ratio in whole sample, higher is better
            nlr = (fn / tn) / (
                ap / an
            )  # fnr/tnr, (1-sensitivity)/specificity, ratio of positives to negatives in negative predictions relative to ratio in whole sample, lower is better
            dor = (tp / fp) / (
                fn / tn
            )  # plr/nlr, ratio of positives to negatives in positive predictions, divided by ratio of positives to negatives in negative predictions
            drr = (tp / pp) / (
                fn / pn
            )  # ppv/fomr, relative risk or risk ratio, fraction of positives in positive predictions divided by fraction of positives in negative predictions
            darr = (tp / pp) - (
                fn / pn
            )  # ppv - fomr, absolute risk reduction, fraction of positives in positive predictions minus fraction of positives in negative predictions
            mrr = (tp / pp) / (
                ap / n
            )  # ppv/prev, modified (by me) relative risk or risk ratio, fraction of positives in positive predictions divided by fraction of positives in whole sample
            marr = (tp / pp) - (
                ap / n
            )  # ppv - prev, modified (by me) absolute risk reduction, fraction of positives in positive predictions minus fraction of positives in whole sample
            mcc = (tp * tn - fp * fn) / np.sqrt(
                (tp + fp) * (tp + fn) * (tn + fp) * (tn + fn))
            # -log10 hypergeometric p-value of the tp overlap
            fnlp = -hypergeom.logsf(tp, n, ap, pp, loc=1) / np.log(10)
            precision = ppv
            recall = tpr
            # F-beta family: f1_K uses beta=1/K (precision-weighted), fK uses
            # beta=K (recall-weighted).
            f1 = (1 + (1**2)) * precision * recall / ((1**2) * precision + recall)
            f1_100 = (1 + (1 / 100**2)) * precision * recall / ((1 / 100**2) * precision + recall)
            f1_50 = (1 + (1 / 50**2)) * precision * recall / ((1 / 50**2) * precision + recall)
            f1_25 = (1 + (1 / 25**2)) * precision * recall / ((1 / 25**2) * precision + recall)
            f1_10 = (1 + (1 / 10**2)) * precision * recall / ((1 / 10**2) * precision + recall)
            f1_5 = (1 + (1 / 5**2)) * precision * recall / ((1 / 5**2) * precision + recall)
            f1_3 = (1 + (1 / 3**2)) * precision * recall / ((1 / 3**2) * precision + recall)
            f1_2 = (1 + (1 / 2**2)) * precision * recall / ((1 / 2**2) * precision + recall)
            f2 = (1 + (2**2)) * precision * recall / ((2**2) * precision + recall)
            f3 = (1 + (3**2)) * precision * recall / ((3**2) * precision + recall)
            f5 = (1 + (5**2)) * precision * recall / ((5**2) * precision + recall)
            f10 = (1 + (10**2)) * precision * recall / ((10**2) * precision + recall)
            f25 = (1 + (25**2)) * precision * recall / ((25**2) * precision + recall)
            f50 = (1 + (50**2)) * precision * recall / ((50**2) * precision + recall)
            f100 = (1 + (100**2)) * precision * recall / ((100**2) * precision + recall)
            # Order must match performance_stats above.
            stat_subset[stage].matrix[:, j] = [
                auroc, auprc, brier, nll, tp, fn, tn, fp, ap, an, pp, pn, n,
                tpr, fnr, tnr, fpr, ppv, fdr, npv, fomr, acc, mcr, prev, plr,
                nlr, dor, drr, darr, mrr, marr, mcc, fnlp, f1, f1_100, f1_50,
                f1_25, f1_10, f1_5, f1_3, f1_2, f2, f3, f5, f10, f25, f50,
                f100
            ]
        print('saving performance statistics...', flush=False)
        datasetIO.save_datamatrix(
            '{0}/stat_subset_datamatrix_{1}.txt.gz'.format(save_folder, stage),
            stat_subset[stage])
        print('printing performance statistics...', flush=False)
        print('\t'.join(['stage', stat_subset[stage].rowname] + stat_subset[stage].columnlabels.tolist()), flush=False)
        for stat, vals in zip(stat_subset[stage].rowlabels, stat_subset[stage].matrix):
            print('\t'.join([stage, stat] + ['{0:1.3g}'.format(v) for v in vals]), flush=False)
    print('saving evaluation statistic...', flush=False)
    objective = stat_subset['validation'].select(evaluation_statistic, 'predict')
    with open('{0}/output.json'.format(save_folder), mode='wt', encoding='utf-8', errors='surrogateescape') as fw:
        json.dump(objective, fw, indent=2)
    print('done logistic_regression.py', flush=False)
def calc_f_table(x, M, n, N):
    """Hypergeometric survival-function log p-value, in base-10 units.

    Arguments follow scipy's hypergeom convention: x successes observed,
    population M, n marked items, N draws.
    """
    natural_log_p = hypergeom.logsf(x, M, n, N)
    return natural_log_p / math.log(10)
def main(dictionaries, year, datestamp, min_score, universe, n_prior, min_count):
    """Compute association statistics for every term-term pair with enough
    PMID co-occurrence counts, write them to a gzipped dataframe, and save
    matrices for a selected subset of statistics.

    Counts come from a pickled DataMatrix produced by
    count_term-term_pmids_from_termite.py; `universe` selects how the
    background PMID universe size is determined.
    """
    print('begin calc_term-term_stats_from_termite.py')
    print('dictionaries: {0}, {1}'.format(dictionaries[0], dictionaries[1]))
    print('year: {0}'.format(year))
    print('datestamp: {0}'.format(datestamp))
    print('min_score: {0!s}'.format(min_score))
    print('universe: {0}'.format(universe))
    print('n_prior: {0!s}'.format(n_prior))
    print('min_count: {0!s}'.format(min_count))

    # load counts datamatrix
    # this file is generated by count_term-term_pmids_from_termite.py
    print('loading counts datamatrix...')
    row_dictionary = dictionaries[0]  # 'HUCELL', 'ANAT', 'INDICATION', 'HUCELLANAT', 'HUCELLANATINDICATION'
    column_dictionary = dictionaries[1]  # 'HUCELL', 'ANAT', 'INDICATION', 'HUCELLANAT', 'HUCELLANATINDICATION'
    counts_datamatrix_path = '{0}_{1}_datamatrix_pmidcounts_year_{2}_datestamp_{3}_minscore_{4!s}.pickle'.format(row_dictionary, column_dictionary, year, datestamp, min_score)
    term_term = datasetIO.load_datamatrix(counts_datamatrix_path)
    print('counts_datamatrix_path: {0}'.format(counts_datamatrix_path))
    print(term_term)

    # find term-term pairs with sufficient counts
    print('finding term-term pairs with sufficient counts...')
    I, J = (term_term.matrix >= min_count).nonzero()
    num_sufficient = I.size
    print('term-term pairs with at least {0!s} counts: {1!s}'.format(min_count, num_sufficient))

    # convert counts to float (the statistics below need float division)
    print('converting counts to float...')
    term_term.matrix = np.float64(term_term.matrix)
    term_term.updatedtypeattribute()
    for field, values in term_term.rowmeta.items():
        if values.dtype == np.int64:
            term_term.rowmeta[field] = np.float64(values)
    for field, values in term_term.columnmeta.items():
        if values.dtype == np.int64:
            term_term.columnmeta[field] = np.float64(values)

    # set universe size; for 'medline'/'infinity' the per-term counts fall
    # back to the 'union' counts under a new metadata key
    print('setting universe size...')
    if universe == 'intersectionunion' or universe == 'union':
        universe_size = term_term.rowmeta['all_count_{0}'.format(universe)][0]
    elif universe == 'medline':
        universe_size = 1e8  # 3e7
        term_term.rowmeta['term_count_medline'] = term_term.rowmeta['term_count_union'].copy()
        term_term.columnmeta['term_count_medline'] = term_term.columnmeta['term_count_union'].copy()
    elif universe == 'infinity':
        universe_size = 1e16
        term_term.rowmeta['term_count_infinity'] = term_term.rowmeta['term_count_union'].copy()
        term_term.columnmeta['term_count_infinity'] = term_term.columnmeta['term_count_union'].copy()
    else:
        raise ValueError('invalid universe')

    # create matrices for select association statistics
    print('creating matrices for select association statistics...')
    selstats = ['mcc', 'mmcc', 'cos', 'mi', 'nmi', 'iqr']
    statmats = {}
    for selstat in selstats:
        statmats[selstat] = np.zeros(term_term.shape, dtype='float64')

    # calculate association statistics and write to dataframe
    print('calculating association statistics and writing to dataframe...')
    dataframe_path = '{0}_{1}_dataframe_yr_{2}_ds_{3}_ms_{4!s}_uv_{5}_np_{6!s}_mc_{7!s}.txt.gz'.format(row_dictionary, column_dictionary, year, datestamp, min_score, universe, n_prior, min_count)
    rowmetalabels = ['term_id', 'term_name']
    rowmetaheaders = ['{0}_id'.format(row_dictionary), '{0}_name'.format(row_dictionary)]
    columnmetalabels = ['term_id', 'term_name']
    columnmetaheaders = ['{0}_id'.format(column_dictionary), '{0}_name'.format(column_dictionary)]
    statheaders = [
        'tp', 'fn', 'tn', 'fp', 'ap', 'an', 'pp', 'pn', 'n', 'tpr', 'fnr',
        'tnr', 'fpr', 'ppv', 'fdr', 'npv', 'fomr', 'acc', 'mcr', 'prev',
        'plr', 'nlr', 'dor', 'drr', 'darr', 'mrr', 'marr', 'f1', 'mcc',
        'mmcc', 'cos', 'fnlp', 'sig', 'lrr', 'lrr_se', 'lrr_lb95', 'lrr_ub95',
        'drr_lb95', 'drr_ub95', 'lor', 'lor_se', 'lor_lb95', 'lor_ub95',
        'dor_lb95', 'dor_ub95', 'mi', 'nmi', 'iqr'
    ]
    with gzip.open(dataframe_path, mode='wt', encoding='utf-8', errors='surrogateescape') as fw:
        writelist = ['{0}_dictidname'.format(row_dictionary)] + rowmetaheaders + ['{0}_dictidname'.format(column_dictionary)] + columnmetaheaders + statheaders
        fw.write('\t'.join(writelist) + '\n')
        for k, (i, j) in enumerate(zip(I, J)):
            # progress line every 1000 pairs and on the last pair
            if np.mod(k, 1000) == 0 or k + 1 == num_sufficient:
                print('working on term-term pair {0!s} of {1!s}...'.format(k + 1, num_sufficient))
            # confusion matrix: tp = co-occurrence count, fp/fn = remaining
            # counts of each term, tn = rest of the universe
            tp = term_term.matrix[i, j]
            fp = term_term.rowmeta['term_count_{0}'.format(universe)][i] - tp
            fn = term_term.columnmeta['term_count_{0}'.format(universe)][j] - tp
            tn = universe_size - (tp + fp + fn)
            # incorporate a random prior with effective sample size = n_prior,
            # where prior distribution conforms to empirical marginal distributions
            Rr = (tp + fp) / (fn + tn)  # ratio of rows of confusion matrix
            Rc = (tp + fn) / (fp + tn)  # ratio of columns of confusion matrix
            tp_prior = n_prior * Rc * Rr / (
                Rc * Rr + Rr + Rc + 1
            )  # solve for tp given constraints tp/fn=Rr, fp/tn=Rr, tp/fp=Rc, fn/tn=Rc, tp+fp+fn+tn=n_eff
            fp_prior = tp_prior / Rc
            fn_prior = tp_prior / Rr
            tn_prior = tp_prior / Rc / Rr
            tp += tp_prior
            fp += fp_prior
            fn += fn_prior
            tn += tn_prior
            ap = tp + fn
            an = fp + tn
            pp = tp + fp
            pn = tn + fn
            n = tn + fp + fn + tp
            tpr = tp / ap  # sensitivity, recall
            fnr = fn / ap  # 1-tpr, 1-sensitivity, 1-recall
            tnr = tn / an  # specificity
            fpr = fp / an  # 1-tnr, 1-specificity
            ppv = tp / pp  # precision
            fdr = fp / pp  # 1-ppv, 1-precision
            npv = tn / pn
            fomr = fn / pn  # 1-npv
            acc = (tp + tn) / n
            mcr = (fp + fn) / n  # 1-acc
            prev = ap / n
            plr = (tp / fp) / (
                ap / an
            )  # tpr/fpr, sensitivity/(1-specificity), ratio of positives to negatives in positive predictions relative to ratio in whole sample, higher is better
            nlr = (fn / tn) / (
                ap / an
            )  # fnr/tnr, (1-sensitivity)/specificity, ratio of positives to negatives in negative predictions relative to ratio in whole sample, lower is better
            dor = (tp / fp) / (
                fn / tn
            )  # plr/nlr, ratio of positives to negatives in positive predictions, divided by ratio of positives to negatives in negative predictions
            drr = (tp / pp) / (
                fn / pn
            )  # ppv/fomr, relative risk or risk ratio, fraction of positives in positive predictions divided by fraction of positives in negative predictions
            darr = (tp / pp) - (
                fn / pn
            )  # ppv - fomr, absolute risk reduction, fraction of positives in positive predictions minus fraction of positives in negative predictions
            mrr = (tp / pp) / (
                ap / n
            )  # ppv/prev, modified (by me) relative risk or risk ratio, fraction of positives in positive predictions divided by fraction of positives in whole sample
            marr = (tp / pp) - (
                ap / n
            )  # ppv - prev, modified (by me) absolute risk reduction, fraction of positives in positive predictions minus fraction of positives in whole sample
            f1 = (1 + (1**2)) * ppv * tpr / ((1**2) * ppv + tpr)
            mcc = (tp * tn - fp * fn) / np.sqrt(
                (tp + fp) * (tp + fn) * (tn + fp) * (tn + fn))
            mmcc = 1 - np.sqrt(
                (fp * fn) / ((tp + fp) * (tp + fn))
            )  # modified (by me), equivalent to 1 + mcc with tn forced to 0
            cos = tp / np.sqrt((tp + fp) * (tp + fn))  # ochiai
            # -log10 hypergeometric p-value of the tp overlap
            fnlp = -hypergeom.logsf(tp, n, ap, pp, loc=1) / np.log(10)
            # Bonferroni-style significance flag at alpha=0.05 over all pairs
            sig = fnlp > np.log10(term_term.size) - np.log10(0.05)
            lrr = np.log10(tp) - np.log10(tp + fp) - np.log10(fn) + np.log10(fn + tn)  # log10 of relative risk
            lrr_se = np.sqrt(fp / tp / (tp + fp) + tn / fn / (fn + tn)) / np.log(10)  # standard error of log10 of relative risk
            lrr_lb95 = lrr - 1.96 * lrr_se
            lrr_ub95 = lrr + 1.96 * lrr_se
            drr_lb95 = 10**lrr_lb95
            drr_ub95 = 10**lrr_ub95
            lor = np.log10(tp) - np.log10(fp) - np.log10(fn) + np.log10(tn)  # log10 of odds ratio
            lor_se = np.sqrt(1 / tp + 1 / fp + 1 / fn + 1 / tn) / np.log(10)  # standard error of log10 of odds ratio
            lor_lb95 = lor - 1.96 * lor_se
            lor_ub95 = lor + 1.96 * lor_se
            dor_lb95 = 10**lor_lb95
            dor_ub95 = 10**lor_ub95
            mi, nmi, iqr = mutualinformation(
                tp, fp, fn, tn
            )  # mutual information, normalized mutual information, information quality ratio
            count_stats = [tp, fn, tn, fp, ap, an, pp, pn, n]
            # Order must match statheaders above.
            other_stats = [
                tpr, fnr, tnr, fpr, ppv, fdr, npv, fomr, acc, mcr, prev,
                plr, nlr, dor, drr, darr, mrr, marr, f1, mcc, mmcc, cos,
                fnlp, sig, lrr, lrr_se, lrr_lb95, lrr_ub95, drr_lb95,
                drr_ub95, lor, lor_se, lor_lb95, lor_ub95, dor_lb95,
                dor_ub95, mi, nmi, iqr
            ]
            rowwritelist = [term_term.rowlabels[i]] + [
                term_term.rowmeta[l][i]
                if term_term.rowmeta[l].dtype == 'object' else str(term_term.rowmeta[l][i])
                for l in rowmetalabels
            ]
            columnwritelist = [term_term.columnlabels[j]] + [
                term_term.columnmeta[l][j]
                if term_term.columnmeta[l].dtype == 'object' else str(term_term.columnmeta[l][j])
                for l in columnmetalabels
            ]
            writelist = rowwritelist + columnwritelist + [
                str(s) for s in count_stats
            ] + ['{0:1.5g}'.format(s) for s in other_stats]
            fw.write('\t'.join(writelist) + '\n')
            statmats['mcc'][i, j] = mcc
            statmats['mmcc'][i, j] = mmcc
            statmats['cos'][i, j] = cos
            statmats['mi'][i, j] = mi
            statmats['nmi'][i, j] = nmi
            statmats['iqr'][i, j] = iqr

    # save matrices for select association statistics
    print('saving matrices for select association statistics...')
    for selstat in selstats:
        term_term.matrix = statmats[selstat]
        datasetIO.save_datamatrix(
            '{0}_{1}_datamatrix_{2}_yr_{3}_ds_{4}_ms_{5!s}_uv_{6}_np_{7!s}_mc_{8!s}.txt.gz'
            .format(row_dictionary, column_dictionary, selstat, year, datestamp, min_score, universe, n_prior, min_count), term_term)
        datasetIO.save_datamatrix(
            '{0}_{1}_datamatrix_{2}_yr_{3}_ds_{4}_ms_{5!s}_uv_{6}_np_{7!s}_mc_{8!s}.pickle'
            .format(row_dictionary, column_dictionary, selstat, year, datestamp, min_score, universe, n_prior, min_count), term_term)
    print('done calc_term-term_stats_from_termite.py')
def test_enrichment(self, gene_ontology):
    """Recompute GO enrichment by hand and compare to gene_ontology.enrichment().

    Fixes: DataFrame.ix and DataFrame.sort were removed from pandas; use
    .loc (boolean mask) and .sort_values(by=...) instead — same results.
    """
    features_of_interest = gene_ontology.all_genes[:10]
    test_enrichment_df = gene_ontology.enrichment(features_of_interest)

    p_value_cutoff = 1000000
    min_feature_size = 3
    min_background_size = 5
    cross_reference = {}
    domains = gene_ontology.domains
    background = gene_ontology.all_genes

    n_all_genes = len(background)
    n_features_of_interest = len(features_of_interest)
    enrichment = defaultdict(dict)
    for go_term, go_genes in gene_ontology.ontology.items():
        if go_genes['domain'] not in domains:
            continue
        features_in_go = go_genes['genes'].intersection(features_of_interest)
        background_in_go = go_genes['genes'].intersection(background)
        too_few_features = len(features_in_go) < min_feature_size
        too_few_background = len(background_in_go) < min_background_size
        if too_few_features or too_few_background:
            continue
        # TODO D.R.Y. this
        # Survival function is more accurate on small p-values
        log_p_value = hypergeom.logsf(len(features_in_go), n_all_genes,
                                      len(background_in_go),
                                      n_features_of_interest)
        symbols = [cross_reference[f] if f in cross_reference else f
                   for f in features_in_go]
        enrichment['negative_log_p_value'][go_term] = -log_p_value
        enrichment['n_features_of_interest_in_go_term'][go_term] = len(features_in_go)
        enrichment['n_background_in_go_term'][go_term] = len(background_in_go)
        enrichment['n_features_total_in_go_term'][go_term] = len(go_genes['genes'])
        enrichment['features_of_interest_in_go_term'][go_term] = ','.join(features_in_go)
        enrichment['features_of_interest_in_go_term_gene_symbols'][go_term] = ','.join(symbols)
        enrichment['go_domain'][go_term] = go_genes['domain']
        enrichment['go_name'][go_term] = go_genes['name']
    enrichment_df = pd.DataFrame(enrichment)

    # TODO D.R.Y. this
    # Bonferonni correction
    enrichment_df['bonferonni_corrected_negative_log_p_value'] = \
        enrichment_df['negative_log_p_value'] \
        - np.log(enrichment_df.shape[0])
    ind = enrichment_df['bonferonni_corrected_negative_log_p_value'] < np.log(p_value_cutoff)
    # .loc with a boolean mask replaces the removed .ix indexer
    enrichment_df = enrichment_df.loc[ind]
    # sort_values(by=...) replaces the removed DataFrame.sort(columns=...)
    true_enrichment_df = enrichment_df.sort_values(
        by=['negative_log_p_value'], ascending=False)
    pdt.assert_frame_equal(test_enrichment_df, true_enrichment_df)
def _one_fit(self):
    """One doublet-detection iteration: build synthetic doublets, cluster
    the augmented matrix, and score each observed cell by how doublet-rich
    its cluster is.

    Returns (scores, log_p_values): per observed cell, the synthetic
    fraction of its cluster and a hypergeometric log p-value for that
    cluster's synthetic-cell overrepresentation.

    NOTE(review): `aug_lib_size` is only defined on the fallback
    normalization path, but is used unconditionally below — the
    `self.normalizer is not None` path would raise NameError; confirm.
    """
    if self.verbose:
        print("\nCreating synthetic doublets...")
    self._createDoublets()

    # Normalize combined augmented set
    if self.verbose:
        print("Normalizing...")
    if self.normalizer is not None:
        aug_counts = self.normalizer(
            sp_sparse.vstack((self._raw_counts, self._raw_synthetics)))
    else:
        # Follows doubletdetection.plot.normalize_counts, but uses memoized normed raw_counts
        synth_lib_size = np.sum(self._raw_synthetics, axis=1).A1
        aug_lib_size = np.concatenate([self._lib_size, synth_lib_size])
        normed_synths = self._raw_synthetics.copy()
        inplace_csr_row_normalize_l1(normed_synths)
        aug_counts = sp_sparse.vstack(
            (self._normed_raw_counts, normed_synths))
        aug_counts = np.log(aug_counts.A * np.median(aug_lib_size) + 0.1)

    # First _num_cells rows are observed cells, the rest are synthetic.
    self._norm_counts = aug_counts[:self._num_cells]
    self._synthetics = aug_counts[self._num_cells:]

    aug_counts = anndata.AnnData(aug_counts)
    aug_counts.obs["n_counts"] = aug_lib_size
    if self.standard_scaling is True:
        sc.pp.scale(aug_counts, max_value=15)

    if self.verbose:
        print("Running PCA...")
    sc.tl.pca(aug_counts, n_comps=self.n_components,
              random_state=self.random_state)
    if self.verbose:
        print("Clustering augmented data set...\n")
    if self.use_phenograph:
        # Capture phenograph's stdout and only show it when verbose.
        f = io.StringIO()
        with redirect_stdout(f):
            fullcommunities, _, _ = phenograph.cluster(
                aug_counts.obsm["X_pca"], **self.phenograph_parameters)
        out = f.getvalue()
        if self.verbose:
            print(out)
    else:
        sc.pp.neighbors(
            aug_counts,
            random_state=self.random_state,
            method="umap",
            n_neighbors=10,
        )
        sc.tl.louvain(aug_counts, random_state=self.random_state,
                      resolution=4, directed=False)
        fullcommunities = np.array(aug_counts.obs["louvain"], dtype=int)
    # phenograph labels unclustered cells -1; detected via min_ID below.
    min_ID = min(fullcommunities)
    self.communities_ = fullcommunities[:self._num_cells]
    self.synth_communities_ = fullcommunities[self._num_cells:]
    community_sizes = [
        np.count_nonzero(fullcommunities == i)
        for i in np.unique(fullcommunities)
    ]
    if self.verbose:
        print("Found clusters [{0}, ... {2}], with sizes: {1}\n".format(
            min(fullcommunities), community_sizes, max(fullcommunities)))

    # Count number of fake doublets in each community and assign score
    # Number of synth/orig cells in each cluster.
    synth_cells_per_comm = collections.Counter(self.synth_communities_)
    orig_cells_per_comm = collections.Counter(self.communities_)
    community_IDs = orig_cells_per_comm.keys()
    # Score = fraction of each cluster that is synthetic.
    community_scores = {
        i: float(synth_cells_per_comm[i]) /
        (synth_cells_per_comm[i] + orig_cells_per_comm[i])
        for i in community_IDs
    }
    scores = np.array([community_scores[i] for i in self.communities_])

    # Hypergeometric tail p-value for synthetic-cell enrichment per cluster.
    community_log_p_values = {
        i: hypergeom.logsf(
            synth_cells_per_comm[i],
            aug_counts.shape[0],
            self._synthetics.shape[0],
            synth_cells_per_comm[i] + orig_cells_per_comm[i],
        )
        for i in community_IDs
    }
    log_p_values = np.array(
        [community_log_p_values[i] for i in self.communities_])

    # Unclustered cells (community -1) get NaN score and p-value.
    if min_ID < 0:
        scores[self.communities_ == -1] = np.nan
        log_p_values[self.communities_ == -1] = np.nan

    return scores, log_p_values
def test_enrichment(self, gene_ontology):
    """Check GeneOntology.enrichment against an inline re-computation.

    Recomputes the hypergeometric enrichment of the first 10 genes with the
    method's default parameters and asserts the resulting DataFrame matches
    the method's output exactly.

    Fix over previous version: ``DataFrame.ix`` and
    ``DataFrame.sort(columns=...)`` were removed from pandas (in 1.0 and
    0.20 respectively); replaced with ``.loc`` and ``.sort_values(by=...)``.
    """
    features_of_interest = gene_ontology.all_genes[:10]
    test_enrichment_df = gene_ontology.enrichment(features_of_interest)

    # Mirror the defaults of GeneOntology.enrichment.
    p_value_cutoff = 1000000
    min_feature_size = 3
    min_background_size = 5
    cross_reference = {}
    domains = gene_ontology.domains
    background = gene_ontology.all_genes

    n_all_genes = len(background)
    n_features_of_interest = len(features_of_interest)
    enrichment = defaultdict(dict)
    for go_term, go_genes in gene_ontology.ontology.items():
        if go_genes['domain'] not in domains:
            continue
        features_in_go = go_genes['genes'].intersection(
            features_of_interest)
        background_in_go = go_genes['genes'].intersection(background)
        too_few_features = len(features_in_go) < min_feature_size
        too_few_background = len(background_in_go) < min_background_size
        if too_few_features or too_few_background:
            continue

        # TODO D.R.Y. this
        # Survival function is more accurate on small p-values
        log_p_value = hypergeom.logsf(len(features_in_go), n_all_genes,
                                      len(background_in_go),
                                      n_features_of_interest)
        # p_value = 0 if p_value < 0 else p_value
        symbols = [
            cross_reference[f] if f in cross_reference else f
            for f in features_in_go
        ]
        enrichment['negative_log_p_value'][go_term] = -log_p_value
        enrichment['n_features_of_interest_in_go_term'][go_term] = len(
            features_in_go)
        enrichment['n_background_in_go_term'][go_term] = len(
            background_in_go)
        enrichment['n_features_total_in_go_term'][go_term] = len(
            go_genes['genes'])
        enrichment['features_of_interest_in_go_term'][go_term] = ','.join(
            features_in_go)
        enrichment['features_of_interest_in_go_term_gene_symbols'][
            go_term] = ','.join(symbols)
        enrichment['go_domain'][go_term] = go_genes['domain']
        enrichment['go_name'][go_term] = go_genes['name']
    enrichment_df = pd.DataFrame(enrichment)

    # TODO D.R.Y. this
    # Bonferonni correction
    enrichment_df['bonferonni_corrected_negative_log_p_value'] = \
        enrichment_df['negative_log_p_value'] \
        - np.log(enrichment_df.shape[0])
    ind = enrichment_df[
        'bonferonni_corrected_negative_log_p_value'] < np.log(
            p_value_cutoff)
    # .loc replaces the removed .ix accessor (boolean-mask row selection).
    enrichment_df = enrichment_df.loc[ind]
    # .sort_values(by=...) replaces the removed DataFrame.sort(columns=...).
    true_enrichment_df = enrichment_df.sort_values(
        by=['negative_log_p_value'], ascending=False)
    pdt.assert_frame_equal(test_enrichment_df, true_enrichment_df)
def main(dictionaries, year, datestamp, min_score, universe, n_prior,
         min_count, association_statistic, reference_datamatrix_path,
         save_predictions):
    """Benchmark a term-term association statistic against reference labels.

    Loads the term-term PMID count matrix and the precomputed
    association-statistic matrix, aligns both to a reference matrix of
    positive/negative example pairs, and for each (row term dict, column
    term dict) combination fits a single-feature logistic regression on the
    logit of the statistic, saves diagnostic histograms and ROC/PR curves,
    and appends classifier performance statistics to a tab-delimited
    dataframe. Optionally writes model predictions for all term-term pairs.

    Side effects: creates 'benchmark_figures/', writes PNG figures, and
    appends to 'benchmark_term-term_stats_dataframe.txt'.

    Fix over previous version: the ROC figure's axes were labeled
    'Precision'/'Recall' (copied from the PR figure) although it plots
    TPR vs FPR; labels corrected.
    """
    print('begin benchmark_term-term_stats_from_termite.py')
    print('dictionaries: {0}, {1}'.format(dictionaries[0], dictionaries[1]))
    print('year: {0}'.format(year))
    print('datestamp: {0}'.format(datestamp))
    print('min_score: {0!s}'.format(min_score))
    print('universe: {0}'.format(universe))
    print('n_prior: {0!s}'.format(n_prior))
    print('min_count: {0!s}'.format(min_count))
    print('association_statistic: {0}'.format(association_statistic))
    print('reference_datamatrix_path: {0}'.format(reference_datamatrix_path))
    print('save_predictions: {0!s}'.format(save_predictions))

    # create figures folder
    print('creating figures folder...')
    figures_folder = 'benchmark_figures'
    if not os.path.exists(figures_folder):
        os.mkdir(figures_folder)

    # load counts datamatrix
    # this file is generated by count_term-term_pmids_from_termite.py
    print('loading counts datamatrix...')
    row_dictionary = dictionaries[
        0]  # 'HUCELL', 'ANAT', 'INDICATION', 'HUCELLANAT', 'HUCELLANATINDICATION'
    column_dictionary = dictionaries[
        1]  # 'HUCELL', 'ANAT', 'INDICATION', 'HUCELLANAT', 'HUCELLANATINDICATION'
    counts_datamatrix_path = '{0}_{1}_datamatrix_pmidcounts_year_{2}_datestamp_{3}_minscore_{4!s}.pickle'.format(
        row_dictionary, column_dictionary, year, datestamp, min_score)
    term_term_counts_all = datasetIO.load_datamatrix(counts_datamatrix_path)
    print('counts_datamatrix_path: {0}'.format(counts_datamatrix_path))
    print(term_term_counts_all)

    # load association statistic datamatrix
    # this file is generated by calc_term-term_stats_from_termite.py
    print('loading association statistic datamatrix...')
    stats_datamatrix_path = '{0}_{1}_datamatrix_{2}_yr_{3}_ds_{4}_ms_{5!s}_uv_{6}_np_{7!s}_mc_{8!s}.pickle'.format(
        row_dictionary, column_dictionary, association_statistic, year,
        datestamp, min_score, universe, n_prior, min_count)
    term_term_stats_all = datasetIO.load_datamatrix(stats_datamatrix_path)
    print('stats_datamatrix_path: {0}'.format(stats_datamatrix_path))
    print(term_term_stats_all)

    # load reference datamatrix of positive and negative examples
    print('loading reference datamatrix of positive and negative examples...')
    term_term_ref = datasetIO.load_datamatrix(reference_datamatrix_path)
    print('reference_datamatrix_path: {0}'.format(reference_datamatrix_path))
    print(term_term_ref)

    # align datamatrices to reference
    print('aligning datamatrices to reference...')
    term_term_counts = term_term_counts_all.tolabels(
        rowlabels=term_term_ref.rowlabels.copy(),
        columnlabels=term_term_ref.columnlabels.copy())
    term_term_stats = term_term_stats_all.tolabels(
        rowlabels=term_term_ref.rowlabels.copy(),
        columnlabels=term_term_ref.columnlabels.copy())

    # find term-term pairs with sufficient counts
    print('finding term-term pairs with sufficient counts...')
    I, J = (term_term_counts.matrix >= min_count).nonzero()
    num_sufficient = I.size
    print('term-term pairs with at least {0!s} counts: {1!s}'.format(
        min_count, num_sufficient))

    # find row_term_dicts and column_term_dicts
    print('finding row_term_dicts and column_term_dicts')
    row_term_dicts = np.unique(term_term_stats.rowmeta['term_dict'])
    column_term_dicts = np.unique(term_term_stats.columnmeta['term_dict'])

    # calculate performance on reference examples and write to dataframe
    print(
        'calculating performance on reference examples and writing to dataframe...'
    )
    dataframe_path = 'benchmark_term-term_stats_dataframe.txt'
    metaheaders = [
        'row_dictionary', 'column_dictionary', 'year', 'datestamp',
        'min_score', 'universe', 'n_prior', 'min_count',
        'association_statistic', 'reference_datamatrix_path',
        'row_term_dict', 'column_term_dict'
    ]
    statheaders = [
        'tp', 'fn', 'tn', 'fp', 'ap', 'an', 'pp', 'pn', 'n', 'auroc',
        'auprc', 'tpr', 'fnr', 'tnr', 'fpr', 'ppv', 'fdr', 'npv', 'fomr',
        'acc', 'mcr', 'prev', 'plr', 'nlr', 'dor', 'drr', 'darr', 'mrr',
        'marr', 'f1', 'mcc', 'cos', 'fnlp', 'lrr', 'lrr_se', 'lrr_lb95',
        'lrr_ub95', 'drr_lb95', 'drr_ub95', 'lor', 'lor_se', 'lor_lb95',
        'lor_ub95', 'dor_lb95', 'dor_ub95', 'mi', 'nmi', 'iqr',
        'min_value_association_statistic'
    ]
    # NOTE(review): mode='at' appends, so repeated runs accumulate header
    # rows in the dataframe file — presumably intentional run logging.
    with open(dataframe_path, mode='at', encoding='utf-8',
              errors='surrogateescape') as fw:
        writelist = metaheaders + statheaders
        fw.write('\t'.join(writelist) + '\n')
        for row_term_dict in row_term_dicts:
            row_hidxs = (term_term_stats.rowmeta['term_dict'] == row_term_dict
                         ).nonzero()[0]
            for column_term_dict in column_term_dicts:
                print('working on {0}-{1} associations...'.format(
                    row_term_dict, column_term_dict))

                # get scores and labels
                print('getting scores and labels...')
                column_hidxs = (term_term_stats.columnmeta['term_dict'] ==
                                column_term_dict).nonzero()[0]
                hit = np.logical_and(np.in1d(I, row_hidxs),
                                     np.in1d(J, column_hidxs))
                Y = term_term_ref.matrix[I[hit], J[hit]]
                X = (term_term_stats.matrix[I[hit], J[hit]]).reshape(-1, 1)
                X_prime = X.copy()
                if association_statistic == 'mcc':
                    # map MCC from [-1, 1] onto [0, 1] before the logit
                    X_prime = (X_prime + 1) / 2
                # clamp exact 0s/1s to half the distance to the nearest
                # observed interior value so the logit below stays finite
                xpmin = (X_prime[X_prime > 0]).min() / 2
                xpmax = 1 - (1 - (X_prime[X_prime < 1]).max()) / 2
                X_prime[X_prime == 0] = xpmin
                X_prime[X_prime == 1] = xpmax
                logitX = np.log10(X_prime / (1 - X_prime))

                # save score histograms
                print('saving score histograms...')
                values = X.reshape(-1)
                title = 'uv_{0}_rd{1}_cd{2}, pos:{3:1.3g}, neg:{4:1.3g}'.format(
                    universe[:5], row_term_dict[:5], column_term_dict[:5],
                    np.median(values[Y]), np.median(values[~Y]))
                save_path = '{0}/{1}_{2}_hist_{3}_yr_{4}_ds_{5}_ms_{6!s}_uv_{7}_np_{8!s}_mc_{9!s}_rd_{10}_cd_{11}.png'.format(
                    figures_folder, row_dictionary, column_dictionary,
                    association_statistic, year, datestamp, min_score,
                    universe, n_prior, min_count, row_term_dict,
                    column_term_dict)
                densities, edges = plot_step_density(
                    {
                        'positive': ('-r', values[Y]),
                        'negative': (':b', values[~Y])
                    }, association_statistic, title, save_path, 'auto',
                    (values.min(), values.max()), False)
                save_path = '{0}/{1}_{2}_zoomhist_{3}_yr_{4}_ds_{5}_ms_{6!s}_uv_{7}_np_{8!s}_mc_{9!s}_rd_{10}_cd_{11}.png'.format(
                    figures_folder, row_dictionary, column_dictionary,
                    association_statistic, year, datestamp, min_score,
                    universe, n_prior, min_count, row_term_dict,
                    column_term_dict)
                densities, edges = plot_step_density(
                    {
                        'positive': ('-r', values[Y]),
                        'negative': (':b', values[~Y])
                    }, association_statistic, title, save_path, 'auto',
                    (values.min(), values.max()), False,
                    (np.percentile(values, 2.5), np.percentile(values, 97.5)))
                values = logitX.reshape(-1)
                title = 'uv_{0}_rd{1}_cd{2}, pos:{3:1.3g}, neg:{4:1.3g}'.format(
                    universe[:5], row_term_dict[:5], column_term_dict[:5],
                    np.median(values[Y]), np.median(values[~Y]))
                save_path = '{0}/{1}_{2}_hist_LOGIT{3}_yr_{4}_ds_{5}_ms_{6!s}_uv_{7}_np_{8!s}_mc_{9!s}_rd_{10}_cd_{11}.png'.format(
                    figures_folder, row_dictionary, column_dictionary,
                    association_statistic, year, datestamp, min_score,
                    universe, n_prior, min_count, row_term_dict,
                    column_term_dict)
                densities, edges = plot_step_density(
                    {
                        'positive': ('-r', values[Y]),
                        'negative': (':b', values[~Y])
                    }, 'logit ' + association_statistic, title, save_path,
                    'auto', (values.min(), values.max()), False)
                save_path = '{0}/{1}_{2}_zoomhist_LOGIT{3}_yr_{4}_ds_{5}_ms_{6!s}_uv_{7}_np_{8!s}_mc_{9!s}_rd_{10}_cd_{11}.png'.format(
                    figures_folder, row_dictionary, column_dictionary,
                    association_statistic, year, datestamp, min_score,
                    universe, n_prior, min_count, row_term_dict,
                    column_term_dict)
                densities, edges = plot_step_density(
                    {
                        'positive': ('-r', values[Y]),
                        'negative': (':b', values[~Y])
                    }, 'logit ' + association_statistic, title, save_path,
                    'auto', (values.min(), values.max()), False,
                    (np.percentile(values, 2.5), np.percentile(values, 97.5)))

                # fit logistic regression classifier
                print('fitting logistic regression classifier...')
                robust_scaler = RobustScaler().fit(logitX)
                Z = robust_scaler.transform(logitX)
                logistic_regression_model = LogisticRegression(
                    penalty='l2', C=1e3, intercept_scaling=1.0,
                    class_weight='balanced').fit(Z, Y)
                # orient the decision function so larger means "positive"
                # regardless of sklearn's internal class ordering
                if logistic_regression_model.classes_[1] == 1:
                    decision_function = logistic_regression_model.decision_function(
                        Z)
                else:
                    decision_function = -logistic_regression_model.decision_function(
                        Z)
                Y_pred = decision_function > 0
                min_value_association_statistic = (X.reshape(-1)[Y_pred]).min()

                # save decision function and predicted probability histograms
                print(
                    'saving decision function and predicted probability histograms...'
                )
                values = decision_function.reshape(-1)
                title = 'uv_{0}_rd{1}_cd{2}, pos:{3:1.3g}, neg:{4:1.3g}'.format(
                    universe[:5], row_term_dict[:5], column_term_dict[:5],
                    np.median(values[Y]), np.median(values[~Y]))
                save_path = '{0}/{1}_{2}_hist_DF{3}_yr_{4}_ds_{5}_ms_{6!s}_uv_{7}_np_{8!s}_mc_{9!s}_rd_{10}_cd_{11}.png'.format(
                    figures_folder, row_dictionary, column_dictionary,
                    association_statistic, year, datestamp, min_score,
                    universe, n_prior, min_count, row_term_dict,
                    column_term_dict)
                densities, edges = plot_step_density(
                    {
                        'positive': ('-r', values[Y]),
                        'negative': (':b', values[~Y])
                    }, 'decision fun ' + association_statistic, title,
                    save_path, 'auto', (values.min(), values.max()), False)
                save_path = '{0}/{1}_{2}_zoomhist_DF{3}_yr_{4}_ds_{5}_ms_{6!s}_uv_{7}_np_{8!s}_mc_{9!s}_rd_{10}_cd_{11}.png'.format(
                    figures_folder, row_dictionary, column_dictionary,
                    association_statistic, year, datestamp, min_score,
                    universe, n_prior, min_count, row_term_dict,
                    column_term_dict)
                densities, edges = plot_step_density(
                    {
                        'positive': ('-r', values[Y]),
                        'negative': (':b', values[~Y])
                    }, 'decision fun ' + association_statistic, title,
                    save_path, 'auto', (values.min(), values.max()), False,
                    (np.percentile(values, 2.5), np.percentile(values, 97.5)))
                values = (1 / (1 + np.exp(-decision_function))).reshape(-1)
                title = 'uv_{0}_rd{1}_cd{2}, pos:{3:1.3g}, neg:{4:1.3g}'.format(
                    universe[:5], row_term_dict[:5], column_term_dict[:5],
                    np.median(values[Y]), np.median(values[~Y]))
                save_path = '{0}/{1}_{2}_hist_PP{3}_yr_{4}_ds_{5}_ms_{6!s}_uv_{7}_np_{8!s}_mc_{9!s}_rd_{10}_cd_{11}.png'.format(
                    figures_folder, row_dictionary, column_dictionary,
                    association_statistic, year, datestamp, min_score,
                    universe, n_prior, min_count, row_term_dict,
                    column_term_dict)
                densities, edges = plot_step_density(
                    {
                        'positive': ('-r', values[Y]),
                        'negative': (':b', values[~Y])
                    }, 'pred prob ' + association_statistic, title, save_path,
                    'auto', (0, 1), False)
                save_path = '{0}/{1}_{2}_zoomhist_PP{3}_yr_{4}_ds_{5}_ms_{6!s}_uv_{7}_np_{8!s}_mc_{9!s}_rd_{10}_cd_{11}.png'.format(
                    figures_folder, row_dictionary, column_dictionary,
                    association_statistic, year, datestamp, min_score,
                    universe, n_prior, min_count, row_term_dict,
                    column_term_dict)
                densities, edges = plot_step_density(
                    {
                        'positive': ('-r', values[Y]),
                        'negative': (':b', values[~Y])
                    }, 'pred prob ' + association_statistic, title, save_path,
                    'auto', (0, 1), False,
                    (np.percentile(values, 2.5), np.percentile(values, 97.5)))

                # compute roc and pr curves
                print('computing roc and pr curves...')
                fpr, tpr, thresholds = roc_curve(Y, decision_function)
                precision, recall, thresholds = precision_recall_curve(
                    Y, decision_function)
                auroc = roc_auc_score(Y, decision_function)
                auprc = average_precision_score(Y, decision_function)

                # save roc and pr curves
                print('saving roc and pr curves...')
                title = 'uv_{0}_as_{1}_rd{2}_cd{3}, auc:{4:1.3g}'.format(
                    universe[:5], association_statistic, row_term_dict[:5],
                    column_term_dict[:5], auprc)
                save_path = '{0}/{1}_{2}_prc_{3}_yr_{4}_ds_{5}_ms_{6!s}_uv_{7}_np_{8!s}_mc_{9!s}_rd_{10}_cd_{11}.png'.format(
                    figures_folder, row_dictionary, column_dictionary,
                    association_statistic, year, datestamp, min_score,
                    universe, n_prior, min_count, row_term_dict,
                    column_term_dict)
                fg, ax = plt.subplots(1, 1, figsize=(3, 2))
                ax.plot(recall, precision, '-k', linewidth=1)
                ax.set_position([0.55 / 3, 0.35 / 2, 2.1 / 3,
                                 1.3 / 2])  # left, bottom, width, height
                ax.set_title(title, fontsize=8)
                ax.set_ylabel('Precision', fontsize=8, labelpad=4)
                ax.set_xlabel('Recall', fontsize=8, labelpad=2)
                ax.set_ylim((0, 1))
                ax.set_xlim((0, 1))
                ax.tick_params(axis='both', which='major', bottom=True,
                               top=False, left=True, right=False,
                               labelbottom=True, labeltop=False,
                               labelleft=True, labelright=False, labelsize=8)
                ax.ticklabel_format(axis='both', style='sci',
                                    scilimits=(-3, 3), fontsize=8)
                ax.yaxis.offsetText.set_fontsize(8)
                ax.xaxis.offsetText.set_fontsize(8)
                fg.savefig(save_path, transparent=True, pad_inches=0, dpi=300)
                plt.close()
                title = 'uv_{0}_as_{1}_rd{2}_cd{3}, auc:{4:1.3g}'.format(
                    universe[:5], association_statistic, row_term_dict[:5],
                    column_term_dict[:5], auroc)
                save_path = '{0}/{1}_{2}_roc_{3}_yr_{4}_ds_{5}_ms_{6!s}_uv_{7}_np_{8!s}_mc_{9!s}_rd_{10}_cd_{11}.png'.format(
                    figures_folder, row_dictionary, column_dictionary,
                    association_statistic, year, datestamp, min_score,
                    universe, n_prior, min_count, row_term_dict,
                    column_term_dict)
                fg, ax = plt.subplots(1, 1, figsize=(3, 2))
                ax.plot(fpr, tpr, '-k', linewidth=1)
                ax.set_position([0.55 / 3, 0.35 / 2, 2.1 / 3,
                                 1.3 / 2])  # left, bottom, width, height
                ax.set_title(title, fontsize=8)
                # fixed: this is the ROC plot (TPR vs FPR), not the PR plot
                ax.set_ylabel('True Positive Rate', fontsize=8, labelpad=4)
                ax.set_xlabel('False Positive Rate', fontsize=8, labelpad=2)
                ax.set_ylim((0, 1))
                ax.set_xlim((0, 1))
                ax.tick_params(axis='both', which='major', bottom=True,
                               top=False, left=True, right=False,
                               labelbottom=True, labeltop=False,
                               labelleft=True, labelright=False, labelsize=8)
                ax.ticklabel_format(axis='both', style='sci',
                                    scilimits=(-3, 3), fontsize=8)
                ax.yaxis.offsetText.set_fontsize(8)
                ax.xaxis.offsetText.set_fontsize(8)
                fg.savefig(save_path, transparent=True, pad_inches=0, dpi=300)
                plt.close()

                # save predictions for all term-term pairs
                if save_predictions:
                    print('saving predictions for all term-term pairs...')
                    predictions = {}
                    X_all = term_term_stats_all.matrix.reshape(-1, 1)
                    if association_statistic == 'mcc':
                        X_all = (X_all + 1) / 2
                    # same endpoint clamping as above, for the full matrix
                    xamin = (X_all[X_all > 0]).min() / 2
                    xamax = 1 - (1 - (X_all[X_all < 1]).max()) / 2
                    X_all[X_all == 0] = xamin
                    X_all[X_all == 1] = xamax
                    logitX_all = np.log10(X_all / (1 - X_all))
                    Z_all = robust_scaler.transform(logitX_all)
                    if logistic_regression_model.classes_[1] == 1:
                        predictions[
                            'decision_function'] = logistic_regression_model.decision_function(
                                Z_all)
                    else:
                        predictions[
                            'decision_function'] = -logistic_regression_model.decision_function(
                                Z_all)
                    predictions['probability_positive'] = 1 / (
                        1 + np.exp(-predictions['decision_function']))
                    # np.interp requires monotonically increasing x-values
                    if not np.all(np.diff(thresholds) > 0):
                        raise ValueError('thresholds not increasing')
                    predictions['precision'] = np.interp(
                        predictions['decision_function'], thresholds,
                        precision[:-1])
                    predictions['recall'] = np.interp(
                        predictions['decision_function'], thresholds,
                        recall[:-1])
                    I0, J0 = (term_term_counts_all.matrix < min_count).nonzero()
                    IA, JA = (term_term_counts_all.matrix >= min_count).nonzero()
                    new_stats = [
                        '{0}_dictidname'.format(row_dictionary),
                        '{0}_dictidname'.format(column_dictionary)
                    ]
                    new_stat_mat = np.concatenate(
                        (term_term_counts_all.rowlabels[IA].reshape(-1, 1),
                         term_term_counts_all.columnlabels[JA].reshape(-1, 1)),
                        1)
                    for stat, values in predictions.items():
                        term_term_stats_all.matrix = values.reshape(
                            term_term_stats_all.shape[0],
                            term_term_stats_all.shape[1])
                        # zero out pairs with insufficient counts
                        term_term_stats_all.matrix[I0, J0] = 0
                        datasetIO.save_datamatrix(
                            '{0}_{1}_datamatrix_{2}_yr_{3}_ds_{4}_ms_{5!s}_uv_{6}_np_{7!s}_mc_{8!s}_as_{9}_rd_{10}_cd_{11}.txt.gz'
                            .format(row_dictionary, column_dictionary, stat,
                                    year, datestamp, min_score, universe,
                                    n_prior, min_count, association_statistic,
                                    row_term_dict, column_term_dict),
                            term_term_stats_all)
                        datasetIO.save_datamatrix(
                            '{0}_{1}_datamatrix_{2}_yr_{3}_ds_{4}_ms_{5!s}_uv_{6}_np_{7!s}_mc_{8!s}_as_{9}_rd_{10}_cd_{11}.pickle'
                            .format(row_dictionary, column_dictionary, stat,
                                    year, datestamp, min_score, universe,
                                    n_prior, min_count, association_statistic,
                                    row_term_dict, column_term_dict),
                            term_term_stats_all)
                        new_stats.append(stat)
                        new_stat_mat = np.append(
                            new_stat_mat,
                            (term_term_stats_all.matrix[IA, JA]).reshape(-1, 1),
                            1)
                    new_df = pd.DataFrame(data=new_stat_mat, columns=new_stats)
                    dataframe_path = '{0}_{1}_dataframe_yr_{2}_ds_{3}_ms_{4!s}_uv_{5}_np_{6!s}_mc_{7!s}.txt.gz'.format(
                        row_dictionary, column_dictionary, year, datestamp,
                        min_score, universe, n_prior, min_count)
                    joined_dataframe_path = '{0}_{1}_dataframe_yr_{2}_ds_{3}_ms_{4!s}_uv_{5}_np_{6!s}_mc_{7!s}_as_{8}_rd_{9}_cd_{10}.txt.gz'.format(
                        row_dictionary, column_dictionary, year, datestamp,
                        min_score, universe, n_prior, min_count,
                        association_statistic, row_term_dict, column_term_dict)
                    df = pd.read_table(dataframe_path, compression='gzip',
                                       index_col=False)
                    joined_df = df.set_index(new_stats[:2]).join(
                        new_df.set_index(new_stats[:2]))
                    joined_df.sort_values(by=association_statistic,
                                          ascending=False, inplace=True)
                    joined_df.to_csv(joined_dataframe_path, sep='\t',
                                     compression='gzip')

                # compute classifier performance statistics
                # note, these are in-sample statistics
                # we are not worried about overfitting
                # because we only have one feature
                # and we are not trying to build a rigorous ML model
                # we are simply trying to answer the question,
                # given a reference set of positive and negative examples,
                # which association statistic ranks term-term pairs the best?
                print('computing classifier performance statistics...')
                tn, fp, fn, tp = confusion_matrix(Y, Y_pred).ravel()
                # incorporate a random prior with effective sample size = n_prior
                prevalence = (tp + fn) / (tn + fp + fn + tp)
                tp += n_prior * prevalence / 2
                fn += n_prior * prevalence / 2
                tn += n_prior * (1 - prevalence) / 2
                fp += n_prior * (1 - prevalence) / 2
                ap = tp + fn
                an = fp + tn
                pp = tp + fp
                pn = tn + fn
                n = tn + fp + fn + tp
                tpr = tp / ap  # sensitivity, recall
                fnr = fn / ap  # 1-tpr, 1-sensitivity, 1-recall
                tnr = tn / an  # specificity
                fpr = fp / an  # 1-tnr, 1-specificity
                ppv = tp / pp  # precision
                fdr = fp / pp  # 1-ppv, 1-precision
                npv = tn / pn
                fomr = fn / pn  # 1-npv
                acc = (tp + tn) / n
                mcr = (fp + fn) / n  # 1-acc
                prev = ap / n
                plr = (tp / fp) / (
                    ap / an
                )  # tpr/fpr, sensitivity/(1-specificity), ratio of positives to negatives in positive predictions relative to ratio in whole sample, higher is better
                nlr = (fn / tn) / (
                    ap / an
                )  # fnr/tnr, (1-sensitivity)/specificity, ratio of positives to negatives in negative predictions relative to ratio in whole sample, lower is better
                dor = (tp / fp) / (
                    fn / tn
                )  # plr/nlr, ratio of positives to negatives in positive predictions, divided by ratio of positives to negatives in negative predictions
                drr = (tp / pp) / (
                    fn / pn
                )  # ppv/fomr, relative risk or risk ratio, fraction of positives in positive predictions divided by fraction of positives in negative predictions
                darr = (tp / pp) - (
                    fn / pn
                )  # ppv - fomr, absolute risk reduction, fraction of positives in positive predictions minus fraction of positives in negative predictions
                mrr = (tp / pp) / (
                    ap / n
                )  # ppv/prev, modified (by me) relative risk or risk ratio, fraction of positives in positive predictions divided by fraction of positives in whole sample
                marr = (tp / pp) - (
                    ap / n
                )  # ppv - prev, modified (by me) absolute risk reduction, fraction of positives in positive predictions minus fraction of positives in whole sample
                f1 = (1 + (1**2)) * ppv * tpr / ((1**2) * ppv + tpr)
                mcc = (tp * tn - fp * fn) / np.sqrt(
                    (tp + fp) * (tp + fn) * (tn + fp) * (tn + fn))
                cos = tp / np.sqrt((tp + fp) * (tp + fn))  # ochiai
                # loc=1 shifts the survival function so this is log P(X >= tp)
                fnlp = -hypergeom.logsf(tp, n, ap, pp, loc=1) / np.log(10)
                lrr = np.log10(tp) - np.log10(tp + fp) - np.log10(
                    fn) + np.log10(fn + tn)  # log10 of relative risk
                lrr_se = np.sqrt(
                    fp / tp / (tp + fp) + tn / fn / (fn + tn)) / np.log(
                        10)  # standard error of log10 of relative risk
                lrr_lb95 = lrr - 1.96 * lrr_se
                lrr_ub95 = lrr + 1.96 * lrr_se
                drr_lb95 = 10**lrr_lb95
                drr_ub95 = 10**lrr_ub95
                lor = np.log10(tp) - np.log10(fp) - np.log10(fn) + np.log10(
                    tn)  # log10 of odds ratio
                lor_se = np.sqrt(1 / tp + 1 / fp + 1 / fn + 1 / tn) / np.log(
                    10)  # standard error of log10 of odds ratio
                lor_lb95 = lor - 1.96 * lor_se
                lor_ub95 = lor + 1.96 * lor_se
                dor_lb95 = 10**lor_lb95
                dor_ub95 = 10**lor_ub95
                mi, nmi, iqr = mutualinformation(
                    tp, fp, fn, tn
                )  # mutual information, normalized mutual information, information quality ratio

                # write to dataframe
                print('writing to dataframe...')
                count_stats = [tp, fn, tn, fp, ap, an, pp, pn, n]
                other_stats = [
                    auroc, auprc, tpr, fnr, tnr, fpr, ppv, fdr, npv, fomr,
                    acc, mcr, prev, plr, nlr, dor, drr, darr, mrr, marr, f1,
                    mcc, cos, fnlp, lrr, lrr_se, lrr_lb95, lrr_ub95,
                    drr_lb95, drr_ub95, lor, lor_se, lor_lb95, lor_ub95,
                    dor_lb95, dor_ub95, mi, nmi, iqr,
                    min_value_association_statistic
                ]
                writelist = [
                    row_dictionary, column_dictionary, year, datestamp,
                    str(min_score), universe,
                    str(n_prior),
                    str(min_count), association_statistic,
                    reference_datamatrix_path, row_term_dict, column_term_dict
                ]
                writelist += [str(s) for s in count_stats]
                writelist += ['{0:1.5g}'.format(s) for s in other_stats]
                fw.write('\t'.join(writelist) + '\n')

    print('done benchmark_term-term_stats_from_termite.py')
def enrichment(self, features_of_interest, background=None,
               p_value_cutoff=1000000, cross_reference=None,
               min_feature_size=3, min_background_size=5,
               domain=None):
    """Bonferroni-corrected hypergeometric p-values of GO enrichment

    Calculates hypergeometric enrichment of the features of interest, in
    each GO category.

    Parameters
    ----------
    features_of_interest : list-like
        List of features. Must match the identifiers in the ontology
        database exactly, i.e. if your ontology database is ENSEMBL ids,
        then you can only provide those and not common names like "RBFOX2"
    background : list-like, optional
        Background genes to use. It is best to use a relevant background
        such as all expressed genes. If None, defaults to all genes.
    p_value_cutoff : float, optional
        Maximum accepted Bonferroni-corrected p-value
    cross_reference : dict-like, optional
        A mapping of gene ids to gene symbols, e.g. a pandas Series of
        ENSEMBL genes e.g. ENSG00000139675 to gene symbols e.g HNRNPA1L2
    min_feature_size : int, optional
        Minimum number of features of interest overlapping in a GO Term,
        to calculate enrichment
    min_background_size : int, optional
        Minimum number of features in the background overlapping a GO Term
    domain : str or list, optional
        Only calculate GO enrichment for a particular GO category or
        subset of categories. Valid domains:
        'biological_process', 'molecular_function', 'cellular_component'

    Returns
    -------
    enrichment_df : pandas.DataFrame
        A (n_go_categories, columns) DataFrame of the enrichment scores,
        or None (with a warning) if no GO category passed the size filters.

    Raises
    ------
    ValueError
        If features of interest and background do not overlap, or invalid
        GO domains are given

    Notes
    -----
    Fixes over previous version: the Python-2 ``basestring`` fallback in
    the domain type check raised NameError on Python 3 for every non-str
    ``domain`` (including the default ``None``); the removed pandas APIs
    ``.ix`` and ``.sort(columns=...)`` are replaced by ``.loc`` and
    ``.sort_values(by=...)``; a missing space in the "do not overlap with
    GO term gene ids" error message is restored.
    """
    cross_reference = {} if cross_reference is None else cross_reference
    background = self.all_genes if background is None else background

    if len(set(background) & set(features_of_interest)) == 0:
        raise ValueError('Features of interest and background do not '
                         'overlap! Not calculating GO enrichment')
    if len(set(features_of_interest) & set(self.all_genes)) == 0:
        raise ValueError('Features of interest do not overlap with GO term '
                         'gene ids. Not calculating GO enrichment.')

    domains = self.domains
    valid_domains = ",".join("'{}'".format(x) for x in self.domains)
    # The str check must precede the Iterable check because strings are
    # themselves iterable. Plain isinstance(domain, str) suffices on
    # Python 3 (basestring no longer exists).
    if isinstance(domain, str):
        if domain not in self.domains:
            raise ValueError(
                "'{}' is not a valid GO domain. "
                "Only {} are acceptable".format(domain, valid_domains))
        domains = frozenset([domain])
    elif isinstance(domain, Iterable):
        if len(set(domain) & self.domains) == 0:
            raise ValueError(
                "'{}' are not a valid GO domains. "
                "Only {} are acceptable".format(
                    ",".join("'{}'".format(x) for x in domain),
                    valid_domains))
        domains = frozenset(domain)

    n_all_genes = len(background)
    n_features_of_interest = len(features_of_interest)
    enrichment = defaultdict(dict)
    for go_term, go_genes in self.ontology.items():
        if go_genes['domain'] not in domains:
            continue
        features_in_go = go_genes['genes'].intersection(
            features_of_interest)
        background_in_go = go_genes['genes'].intersection(background)
        too_few_features = len(features_in_go) < min_feature_size
        too_few_background = len(background_in_go) < min_background_size
        if too_few_features or too_few_background:
            continue

        # Survival function is more accurate on small p-values
        log_p_value = hypergeom.logsf(len(features_in_go), n_all_genes,
                                      len(background_in_go),
                                      n_features_of_interest)
        # p_value = 0 if p_value < 0 else p_value
        symbols = [cross_reference[f] if f in cross_reference else f
                   for f in features_in_go]
        enrichment['negative_log_p_value'][go_term] = -log_p_value
        enrichment['n_features_of_interest_in_go_term'][go_term] = len(
            features_in_go)
        enrichment['n_background_in_go_term'][go_term] = len(
            background_in_go)
        enrichment['n_features_total_in_go_term'][go_term] = len(
            go_genes['genes'])
        enrichment['features_of_interest_in_go_term'][
            go_term] = ','.join(features_in_go)
        enrichment['features_of_interest_in_go_term_gene_symbols'][
            go_term] = ','.join(symbols)
        enrichment['go_domain'][go_term] = go_genes['domain']
        enrichment['go_name'][go_term] = go_genes['name']
    enrichment_df = pd.DataFrame(enrichment)

    if enrichment_df.empty:
        warnings.warn('No GO categories enriched in provided features')
        return

    # Bonferonni correction
    enrichment_df['bonferonni_corrected_negative_log_p_value'] = \
        enrichment_df['negative_log_p_value'] \
        - np.log(enrichment_df.shape[0])
    ind = enrichment_df['bonferonni_corrected_negative_log_p_value'
                        ] < np.log(p_value_cutoff)
    # .loc replaces the removed .ix accessor
    enrichment_df = enrichment_df.loc[ind]
    # .sort_values(by=...) replaces the removed DataFrame.sort(columns=...)
    enrichment_df = enrichment_df.sort_values(
        by=['negative_log_p_value'], ascending=False)
    return enrichment_df
def enrichment_significance(term_row):
    """Log-scale hypergeometric enrichment p-value for one term row.

    Reads 'hit_count', 'universe', 'term_count', and 'list_size' from
    *term_row* and returns ``LOG10_FACTOR`` times the natural-log survival
    function. Evaluating sf at ``hit_count - 1`` yields
    P(X >= hit_count), i.e. the probability of seeing at least the
    observed number of hits.
    """
    hits = term_row['hit_count']
    pop_size = term_row['universe']
    successes = term_row['term_count']
    draws = term_row['list_size']
    log_p = hypergeom.logsf(hits - 1, pop_size, successes, draws)
    return LOG10_FACTOR * log_p
def _one_fit(self):
    """Run one fit iteration: create synthetic doublets, cluster the
    augmented (real + synthetic) count matrix with PCA + Phenograph, and
    score every cell.

    Returns
    -------
    scores : np.ndarray
        Per-cell doublet score: fraction of synthetic cells in the cell's
        Phenograph community.
    log_p_values : np.ndarray
        Per-cell hypergeometric log survival-function p-value for the
        community's synthetic-cell enrichment.
    """
    print("\nCreating synthetic doublets...")
    self._createDoublets()

    # Normalize combined augmented set
    print("Normalizing...")
    if self.normalizer is not None:
        # Caller-supplied normalizer gets the stacked raw matrix.
        aug_counts = self.normalizer(
            np.append(self._raw_counts, self._raw_synthetics, axis=0))
    else:
        # Follows doubletdetection.plot.normalize_counts, but uses memoized normed raw_counts
        # assumes self._raw_counts/_raw_synthetics are dense 2D arrays of
        # cells x genes — TODO confirm against __init__
        synth_lib_size = np.sum(self._raw_synthetics, axis=1)
        aug_lib_size = np.concatenate([self._lib_size, synth_lib_size])
        # L1 row-normalize synthetics, then rescale by the median library
        # size and log-transform (+0.1 avoids log(0)).
        normed_synths = self._raw_synthetics / synth_lib_size[:, np.newaxis]
        aug_counts = np.concatenate(
            [self._normed_raw_counts, normed_synths], axis=0)
        aug_counts = np.log(aug_counts * np.median(aug_lib_size) + 0.1)

    # First _num_cells rows are real cells, the rest are synthetic.
    self._norm_counts = aug_counts[:self._num_cells]
    self._synthetics = aug_counts[self._num_cells:]

    print("Running PCA...")
    # Get phenograph results
    pca = PCA(n_components=self.n_components,
              random_state=self.random_state)
    reduced_counts = pca.fit_transform(aug_counts)
    print("Clustering augmented data set with Phenograph...\n")
    fullcommunities, _, _ = phenograph.cluster(
        reduced_counts, **self.phenograph_parameters)
    min_ID = min(fullcommunities)
    self.communities_ = fullcommunities[:self._num_cells]
    self.synth_communities_ = fullcommunities[self._num_cells:]
    community_sizes = [
        np.count_nonzero(fullcommunities == i)
        for i in np.unique(fullcommunities)
    ]
    print("Found communities [{0}, ... {2}], with sizes: {1}\n".format(
        min(fullcommunities), community_sizes, max(fullcommunities)))

    # Count number of fake doublets in each community and assign score
    # Number of synth/orig cells in each cluster.
    synth_cells_per_comm = collections.Counter(self.synth_communities_)
    orig_cells_per_comm = collections.Counter(self.communities_)
    community_IDs = orig_cells_per_comm.keys()
    # Score = fraction of synthetic cells in the community.
    community_scores = {
        i: float(synth_cells_per_comm[i]) /
        (synth_cells_per_comm[i] + orig_cells_per_comm[i])
        for i in community_IDs
    }
    scores = np.array([community_scores[i] for i in self.communities_])
    # Hypergeometric test: enrichment of synthetic cells within each
    # community relative to their overall frequency.
    community_log_p_values = {
        i: hypergeom.logsf(synth_cells_per_comm[i], aug_counts.shape[0],
                           self._synthetics.shape[0],
                           synth_cells_per_comm[i] + orig_cells_per_comm[i])
        for i in community_IDs
    }
    log_p_values = np.array(
        [community_log_p_values[i] for i in self.communities_])
    if min_ID < 0:
        # Community -1 means unassigned (presumably phenograph outliers);
        # mask those cells rather than report a score.
        scores[self.communities_ == -1] = np.nan
        log_p_values[self.communities_ == -1] = np.nan
    return scores, log_p_values