Example #1
 def analysis_go(self, s_go, S_hit, N_total=0, SRC_GENE=None, min_overlap=3):
     c={'GO':s_go, '#TotalGeneInLibrary':N_total, '#GeneInGO':0, '#GeneInHitList':0, '#GeneInGOAndHitList':0, 'LogP':0.0, 'Enrichment':0}
     #if SRC_GENE is not None:
     #    print "SRC_GENE: "+str(len(SRC_GENE))
     S_gene=self.GO_GENE[s_go]
     if not N_total:
         N_total=len(self.ALL_GENE)
     if SRC_GENE is not None:
         S_gene=S_gene.intersection(SRC_GENE)
         S_hit=set(S_hit).intersection(SRC_GENE)
     else:
         S_hit=set(S_hit)
     c['#GeneInGO']=len(S_gene)
     c['#GeneInHitList']=len(S_hit)
     if c['#GeneInGO']<min_overlap or c['#GeneInHitList']<min_overlap:
         return None
     S_both=S_gene.intersection(S_hit)
     c['#GeneInGOAndHitList']=len(S_both)
     if c['#GeneInGOAndHitList']<min_overlap:
         return None
     c['%InGO']=c['#GeneInGOAndHitList']*100.0/c['#GeneInHitList']
     q=min(max(c['%InGO']/100, 1.0/c['#GeneInHitList']), 1-1.0/c['#GeneInHitList'])
     c['STDV %InGO']=np.sqrt(q*(1-q)/c['#GeneInHitList'])*100
     c['Enrichment']=c['%InGO']/100.0*N_total/c['#GeneInGO']
     S=[int(x) for x in S_both]
     S.sort()
     c['GeneID']='|'.join([str(x) for x in S])
     if c['#GeneInGOAndHitList']<min_overlap: return c
     c['LogP'] = hypergeom.logsf(c['#GeneInGOAndHitList']-1, N_total, c['#GeneInGO'], c['#GeneInHitList'])
     c['LogP'] = c['LogP'] / TO_LOG10
     return c
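A minimal standalone sketch (not from the original file) of the hypergeometric call above, using invented counts; TO_LOG10 is assumed to be ln(10), since the snippet divides the natural-log p-value by it to obtain a log10 value.

import numpy as np
from scipy.stats import hypergeom

TO_LOG10 = np.log(10)  # assumed definition of the module-level constant used above

# Invented counts: 20,000 genes in the library, 150 in the GO term,
# 300 in the hit list, 25 in both.
N_total, n_go, n_hit, n_both = 20000, 150, 300, 25

# logsf(k, M, n, N) is log P(overlap > k), so passing n_both - 1 gives
# log P(overlap >= n_both), matching c['LogP'] above.
log_p = hypergeom.logsf(n_both - 1, N_total, n_go, n_hit)
print(log_p / TO_LOG10)  # log10 p-value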
Example #2
    def get_single_enrichment(self, gene_list, term, gene_universe=15000, gene_list_size=None):
        term_df = self.gene_term_df[self.gene_term_df[self.go_column_label] == term]
        n = len(term_df)
        x = sum(term_df[self.gene_column_label].isin(gene_list)) - 1  # matched genes minus 1, so logsf returns P(overlap >= matched)
        N = gene_list_size if gene_list_size else len(gene_list)

        return hypergeom.logsf(x, gene_universe, n, N)
Example #3
def _prob_hypergeo_fast(y_compute, name, X, M, n, N):
    """Compute hypergeometric Pvalue.

    Description
    -----------
    Suppose you have a lot of 100 floppy disks (M), and you know that 20 of them are defective (n).
    What is the prbability of drawing zero to 2 floppy disks (N=2), if you select 10 at random (N).
    P=hypergeom.sf(2,100,20,10)

    """
    P = np.nan
    logP = np.nan
    # M = len(yc)  # Population size: Total number of samples, eg total number of genes; 10000
    # n = np.sum(datac)  # Number of successes in population, known in pathway, eg 2000
    # N = np.sum(yc)  # sample size: Random variate, eg clustersize or groupsize, over expressed genes, eg 300
    # X = np.sum(np.logical_and(yc, datac.values)) - 1  # Note: the -1 is important because you want P < X rather than P <= X; otherwise you can get many false positives, e.g. when X = 1 and n = 1.

    # Do the hypergeo-test
    if y_compute and (X > 0):
        P = hypergeom.sf(X, M, n, N)
        logP = hypergeom.logsf(X, M, n, N)

    # Store
    out = {}
    out['category_label'] = name
    out['P'] = P
    out['logP'] = logP
    out['overlap_X'] = X
    out['popsize_M'] = M
    out['nr_succes_pop_n'] = n
    out['samplesize_N'] = N
    out['dtype'] = 'categorical'
    return (out)
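The floppy-disk scenario from the docstring, written out with assumed numbers; it also shows why the snippet subtracts 1 from the overlap before calling the survival function.

from scipy.stats import hypergeom

M, n, N = 100, 20, 10   # lot size, defective disks, disks drawn
x = 3                   # observed defective disks in the draw (made up)

# sf(k, M, n, N) is P(X > k), so passing x - 1 yields P(X >= x),
# the probability of an overlap at least as large as observed.
p_at_least = hypergeom.sf(x - 1, M, n, N)
p_more_than = hypergeom.sf(x, M, n, N)
print(p_at_least, p_more_than)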
Example #4
def calculate_enrichment(gene_data, go_to_genes, n=100):

    # get top n gene list
    top_sorted_genes = sorted(gene_data, key=lambda tup: tup[1], reverse=True)
    top_genes=top_sorted_genes[:n]
    top_gene_names = [name for name, score in top_genes]  # zip(*top_genes)[0] is not subscriptable in Python 3

    # get bottom n gene list
    bot_sorted_genes = sorted(gene_data, key=lambda tup: tup[1])
    bot_genes=bot_sorted_genes[:n]
    bot_gene_names = [name for name, score in bot_genes]

    tot_genes = len(gene_data)

    # score entry layout: [hits in GO term (x), total genes (M), list size (n), GO term genes (N), log p-value]
    # create score dictionary
    top_score_list = dict((k,[0, tot_genes, n, len(go_to_genes[k]), 0]) for k in go_to_genes.keys())
    bot_score_list = dict((k,[0, tot_genes, n, len(go_to_genes[k]), 0]) for k in go_to_genes.keys())

    # calculate top hits
    for g in top_gene_names:
        for goid in top_score_list:
            if g in go_to_genes[goid]:
                top_score_list[goid][0] += 1

    # calculate bottom hits
    for g in bot_gene_names:
        for goid in bot_score_list:
            if g in go_to_genes[goid]:
                bot_score_list[goid][0] += 1

    positive_enrichment_scores = []
    negative_enrichment_scores = []

    # calculate scores
    for goid in top_score_list:
        top_score_list[goid][4] = hypergeom.logsf(top_score_list[goid][0]-1, top_score_list[goid][1], top_score_list[goid][2], top_score_list[goid][3])
        positive_enrichment_scores.append((goid, top_score_list[goid][4]))
        bot_score_list[goid][4] = hypergeom.logsf(bot_score_list[goid][0]-1, bot_score_list[goid][1], bot_score_list[goid][2], bot_score_list[goid][3])
        negative_enrichment_scores.append((goid, bot_score_list[goid][4]))
            
    # sort by log p-value (logsf is most negative for the strongest enrichment)
    positive_enrichment_scores_sorted = sorted(positive_enrichment_scores, key=lambda tup: tup[1])
    negative_enrichment_scores_sorted = sorted(negative_enrichment_scores, key=lambda tup: tup[1])

    return positive_enrichment_scores_sorted,negative_enrichment_scores_sorted
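A hedged usage sketch for calculate_enrichment, showing the input shapes it appears to expect: gene_data as (gene, score) tuples and go_to_genes mapping a GO id to a set of genes. All data below are invented.

import random
from scipy.stats import hypergeom  # used inside calculate_enrichment

random.seed(0)
genes = ['gene{0}'.format(i) for i in range(500)]
gene_data = [(g, random.gauss(0, 1)) for g in genes]   # (gene, score) tuples
go_to_genes = {
    'GO:0000001': set(random.sample(genes, 40)),
    'GO:0000002': set(random.sample(genes, 60)),
}

pos, neg = calculate_enrichment(gene_data, go_to_genes, n=100)
print(pos[0], neg[0])  # most enriched GO term for top- and bottom-ranked genes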
Example #5
 def calc_enrich(self, path_file, all_sigs=True):
     #print('calc_enrich')
     hw_temp = self.hw_genes_all
     if (not all_sigs):
         hw_temp = self.hw_genes_all
     kegg = pandas.read_csv(path_file, header=None, sep='\t')
     temp_kegg_en = [1] * hw_temp.shape[1]
     for i in range(hw_temp.shape[1]):
         #print(i)
         path_en = []
         sig_genes = self.compendium.index[np.where(hw_temp[:, i])]
         for j in range(kegg.shape[0]):
             path_genes = kegg[2][j].split(';')
             x = len(list(set(sig_genes) & set(path_genes))) - 1
             n = self.weights.shape[0]
             k = len(sig_genes)
             m = len(path_genes)
             p = hypergeom.logsf(x, n, k, m)
             path_en.append(-p)
         #path_en_c = multipletests(path_en)[1]
         temp_kegg_en[i] = path_en
     kegg_df = pandas.DataFrame(temp_kegg_en, columns=kegg[0])
     return (kegg_df.replace([np.inf, -np.inf, 'nan'], 0))
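A guess at the path_file layout calc_enrich expects, inferred from how the columns are used above (pathway id in column 0, ';'-joined gene symbols in column 2); the file written here is invented.

import pandas

# Write a tiny tab-separated pathway file in the assumed three-column layout.
with open('toy_pathways.tsv', 'w') as fh:
    fh.write('hsa00010\tGlycolysis\tGENE1;GENE2;GENE3\n')
    fh.write('hsa00020\tTCA cycle\tGENE2;GENE4;GENE5\n')

kegg = pandas.read_csv('toy_pathways.tsv', header=None, sep='\t')
print(kegg[0].tolist())        # pathway ids
print(kegg[2][0].split(';'))   # gene list for the first pathway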
Example #6
def main(datamatrix_path, test_index, response_variable_name, valid_index,
         valid_fraction, feature_fraction, regularization_type,
         inverse_regularization_strength, intercept_scaling,
         pos_neg_weight_ratio, evaluation_statistic, save_weights, save_folder,
         datamatrix):

    print('loading datamatrix...', flush=False)
    if datamatrix is None or isinstance(datamatrix, str):
        dm = datasetIO.load_datamatrix(datamatrix_path)
    else:
        dm = datamatrix

    print('setting random seed with test_index {0!s}...'.format(test_index),
          flush=False)
    np.random.seed(test_index)

    print('getting bootstrap sample...', flush=False)
    all_indices = np.arange(dm.shape[0])
    boot_indices = np.random.choice(dm.shape[0], dm.shape[0], replace=True)
    test_indices = all_indices[~np.in1d(all_indices, boot_indices)]

    print('reserving out-of-bag samples as test set...', flush=False)
    Y = {
        'test': dm.rowmeta[response_variable_name][test_indices].astype('bool')
    }
    X = {'test': dm.matrix[test_indices, :]}

    print('setting random seed with valid_index {0!s}...'.format(valid_index),
          flush=False)
    np.random.seed(valid_index)

    print('splitting bootstrap sample into training and validation sets...',
          flush=False)
    if type(valid_fraction) == str and (valid_fraction.lower() == 'loo'
                                        or valid_fraction.lower() == 'loocv'):
        valid_fraction = 'loo'
        valid_indices = all_indices
        train_indices = all_indices
    else:
        valid_indices = np.random.choice(dm.shape[0],
                                         round(valid_fraction * dm.shape[0]),
                                         replace=False)
        train_indices = all_indices[~np.in1d(all_indices, valid_indices)]

    Y['train'] = dm.rowmeta[response_variable_name][boot_indices][
        train_indices].astype('bool')
    Y['valid'] = dm.rowmeta[response_variable_name][boot_indices][
        valid_indices].astype('bool')
    X['train'] = dm.matrix[boot_indices, :][train_indices, :]
    X['valid'] = dm.matrix[boot_indices, :][valid_indices, :]

    print('fitting and evaluating models...', flush=False)
    stages = ['validation', 'testing']
    data_subsets = ['fit', 'predict']
    performance_stats = [
        'auroc', 'auprc', 'brier', 'nll', 'tp', 'fn', 'tn', 'fp', 'ap', 'an',
        'pp', 'pn', 'n', 'tpr', 'fnr', 'tnr', 'fpr', 'ppv', 'fdr', 'npv',
        'fomr', 'acc', 'mcr', 'prev', 'plr', 'nlr', 'dor', 'drr', 'darr',
        'mrr', 'marr', 'mcc', 'fnlp', 'f1', 'f1_100', 'f1_50', 'f1_25',
        'f1_10', 'f1_5', 'f1_3', 'f1_2', 'f2', 'f3', 'f5', 'f10', 'f25', 'f50',
        'f100'
    ]
    if valid_fraction == 'loo':
        X.update({
            'validation': {
                'fit': X['train'],
                'predict': X['valid']
            },
            'testing': {
                'fit': X['train'],
                'predict': X['test']
            }
        })
        Y.update({
            'validation': {
                'fit': Y['train'],
                'predict': Y['valid']
            },
            'testing': {
                'fit': Y['train'],
                'predict': Y['test']
            }
        })
    else:
        X.update({
            'validation': {
                'fit': X['train'],
                'predict': X['valid']
            },
            'testing': {
                'fit': np.append(X['train'], X['valid'], 0),
                'predict': X['test']
            }
        })
        Y.update({
            'validation': {
                'fit': Y['train'],
                'predict': Y['valid']
            },
            'testing': {
                'fit': np.append(Y['train'], Y['valid']),
                'predict': Y['test']
            }
        })
    stat_subset = {}
    for stage in stages:
        print('working on {0} stage...'.format(stage), flush=False)

        if feature_fraction < 1:
            print('performing univariate feature selection...', flush=False)
            num_features = round(feature_fraction * dm.shape[1])
            test_stats, p_values = ttest_ind(
                X[stage]['fit'][Y[stage]['fit'], :],
                X[stage]['fit'][~Y[stage]['fit'], :],
                axis=0,
                equal_var=False,
                nan_policy='propagate')
            ranks = np.argsort(p_values)
            selected_indices = ranks[:num_features]
            selected_features = dm.columnlabels[selected_indices]
            if stage == 'testing':
                print('plotting univariate test statistics...', flush=False)
                plt.figure()
                plt.hist(test_stats, 50)
                plt.savefig(
                    '{0}/univariate_test_statistics.png'.format(save_folder),
                    transparent=True,
                    pad_inches=0,
                    dpi=100)
                plt.figure()
                plt.hist(p_values, 50)
                plt.savefig('{0}/univariate_pvalues.png'.format(save_folder),
                            transparent=True,
                            pad_inches=0,
                            dpi=100)
                plt.figure()
                plt.hist(-np.log10(p_values), 50)
                plt.savefig('{0}/univariate_nlps.png'.format(save_folder),
                            transparent=True,
                            pad_inches=0,
                            dpi=100)
        else:
            print('skipping univariate feature selection...', flush=False)
            selected_indices = np.arange(dm.shape[1], dtype='int64')
            selected_features = dm.columnlabels.copy()
        print('selected {0!s} features...'.format(selected_features.size),
              flush=False)

        print('calculating class weights...', flush=False)
        pos_weight = np.sqrt(pos_neg_weight_ratio) * (
            (Y[stage]['fit'].size) / 2 / (Y[stage]['fit'].sum())
        )  # (assign weight to class)*(adjust for unbalanced classes)
        neg_weight = (1 / pos_weight) * (
            (Y[stage]['fit'].size) / 2 / ((~Y[stage]['fit']).sum())
        )  # (assign weight to class)*(adjust for unbalanced classes)
        class_weight = {True: pos_weight, False: neg_weight}

        print('fitting model...', flush=False)
        logistic_regression_model = LogisticRegression(
            penalty=regularization_type,
            C=inverse_regularization_strength,
            intercept_scaling=intercept_scaling,
            class_weight=class_weight).fit(
                X[stage]['fit'][:, selected_indices], Y[stage]['fit'])

        if stage == 'testing':
            print('plotting feature weights...', flush=False)
            iter_feature = DataMatrix(
                rowname='iteration',
                rowlabels=np.array(
                    ['test{0!s}_valid{1!s}'.format(test_index, valid_index)],
                    dtype='object'),
                rowmeta={
                    'intercept': logistic_regression_model.intercept_,
                    'test_index': np.array([test_index], dtype='int64'),
                    'valid_index': np.array([valid_index], dtype='int64')
                },
                columnname=dm.columnname,
                columnlabels=dm.columnlabels.copy(),
                columnmeta=copy.deepcopy(dm.columnmeta),
                matrixname='feature_weights',
                matrix=np.zeros((1, dm.shape[1]), dtype='float64'))
            feature_idx = {f: i for i, f in enumerate(dm.columnlabels)}
            for feature, weight in zip(selected_features,
                                       logistic_regression_model.coef_[0, :]):
                iter_feature.matrix[0, feature_idx[feature]] = weight
            plt.figure()
            plt.hist(iter_feature.matrix[0, :], 50)
            plt.savefig('{0}/feature_weights.png'.format(save_folder),
                        transparent=True,
                        pad_inches=0,
                        dpi=100)
            if feature_fraction < 1:
                plt.figure()
                plt.hist(iter_feature.matrix[0, selected_indices], 50)
                plt.savefig(
                    '{0}/feature_weights_selected.png'.format(save_folder),
                    transparent=True,
                    pad_inches=0,
                    dpi=100)

            if save_weights:
                print('saving feature weights...', flush=False)
                datasetIO.save_datamatrix(
                    '{0}/iter_feature_datamatrix.txt.gz'.format(save_folder),
                    iter_feature)

        print('creating datamatrix for performance statistics...', flush=False)
        stat_subset[stage] = DataMatrix(
            rowname='performance_statistic',
            rowlabels=np.array(performance_stats, dtype='object'),
            rowmeta={},
            columnname='data_subset',
            columnlabels=np.array(data_subsets, dtype='object'),
            columnmeta={},
            matrixname='classifier_performance_on_data_subsets',
            matrix=np.zeros((len(performance_stats), len(data_subsets)),
                            dtype='float64'))

        for j, subset in enumerate(stat_subset[stage].columnlabels):
            print('evaluating performance on {0} subset...'.format(subset),
                  flush=False)
            if valid_fraction == 'loo' and stage == 'validation' and subset == 'predict':
                P_pred = np.zeros(X[stage][subset].shape[0], dtype='float64')
                # rename the LOO loop variables so the function arguments
                # test_index and valid_index are not shadowed inside this loop
                for loo_train_index, loo_test_index in LeaveOneOut().split(
                        X[stage][subset]):
                    logistic_regression_model = LogisticRegression(
                        penalty=regularization_type,
                        C=inverse_regularization_strength,
                        intercept_scaling=intercept_scaling,
                        class_weight=class_weight).fit(
                            X[stage]['fit'][loo_train_index, :][:, selected_indices],
                            Y[stage]['fit'][loo_train_index])
                    P_pred[loo_test_index] = logistic_regression_model.predict_proba(
                        X[stage][subset][loo_test_index, :][:, selected_indices]
                    )[:, logistic_regression_model.classes_ == 1][0][0]
            else:
                P_pred = logistic_regression_model.predict_proba(
                    X[stage][subset][:, selected_indices]
                )[:, logistic_regression_model.classes_ == 1]
            Y_pred = P_pred > 0.5

            auroc = roc_auc_score(Y[stage][subset], P_pred)
            auprc = average_precision_score(Y[stage][subset], P_pred)
            brier = brier_score_loss(Y[stage][subset], P_pred)
            nll = log_loss(Y[stage][subset], P_pred)

            tn, fp, fn, tp = confusion_matrix(Y[stage][subset], Y_pred).ravel()

            # incorporate a prior with effective sample size = n_eff, where prior represents random predictions
            n_eff = 1
            prevalence = (tp + fn) / (tn + fp + fn + tp)
            tp += n_eff * prevalence / 2
            fn += n_eff * prevalence / 2
            tn += n_eff * (1 - prevalence) / 2
            fp += n_eff * (1 - prevalence) / 2

            ap = tp + fn
            an = fp + tn
            pp = tp + fp
            pn = tn + fn
            n = tn + fp + fn + tp

            tpr = tp / ap  # sensitivity, recall
            fnr = fn / ap  # 1-tpr, 1-sensitivity, 1-recall
            tnr = tn / an  # specificity
            fpr = fp / an  # 1-tnr, 1-specificity

            ppv = tp / pp  # precision
            fdr = fp / pp  # 1-ppv, 1-precision
            npv = tn / pn
            fomr = fn / pn  # 1-npv

            acc = (tp + tn) / n
            mcr = (fp + fn) / n  # 1-acc
            prev = ap / n

            plr = (tp / fp) / (
                ap / an
            )  # tpr/fpr, sensitivity/(1-specificity), ratio of positives to negatives in positive predictions relative to ratio in whole sample, higher is better
            nlr = (fn / tn) / (
                ap / an
            )  # fnr/tnr, (1-sensitivity)/specificity, ratio of positives to negatives in negative predictions relative to ratio in whole sample, lower is better
            dor = (tp / fp) / (
                fn / tn
            )  # plr/nlr, ratio of positives to negatives in positive predictions, divided by ratio of positives to negatives in negative predictions
            drr = (tp / pp) / (
                fn / pn
            )  # ppv/fomr, relative risk or risk ratio, fraction of positives in positive predictions divided by fraction of positives in negative predictions
            darr = (tp / pp) - (
                fn / pn
            )  # ppv - fomr, absolute risk reduction, fraction of positives in positive predictions minus fraction of positives in negative predictions
            mrr = (tp / pp) / (
                ap / n
            )  # ppv/prev, modified (by me) relative risk or risk ratio, fraction of positives in positive predictions divided by fraction of positives in whole sample
            marr = (tp / pp) - (
                ap / n
            )  # ppv - prev, modified (by me) absolute risk reduction, fraction of positives in positive predictions minus fraction of positives in whole sample

            mcc = (tp * tn - fp * fn) / np.sqrt(
                (tp + fp) * (tp + fn) * (tn + fp) * (tn + fn))
            fnlp = -hypergeom.logsf(tp, n, ap, pp, loc=1) / np.log(10)

            precision = ppv
            recall = tpr
            f1 = (1 +
                  (1**2)) * precision * recall / ((1**2) * precision + recall)
            f1_100 = (1 + (1 / 100**2)) * precision * recall / (
                (1 / 100**2) * precision + recall)
            f1_50 = (1 + (1 / 50**2)) * precision * recall / (
                (1 / 50**2) * precision + recall)
            f1_25 = (1 + (1 / 25**2)) * precision * recall / (
                (1 / 25**2) * precision + recall)
            f1_10 = (1 + (1 / 10**2)) * precision * recall / (
                (1 / 10**2) * precision + recall)
            f1_5 = (1 + (1 / 5**2)) * precision * recall / (
                (1 / 5**2) * precision + recall)
            f1_3 = (1 + (1 / 3**2)) * precision * recall / (
                (1 / 3**2) * precision + recall)
            f1_2 = (1 + (1 / 2**2)) * precision * recall / (
                (1 / 2**2) * precision + recall)
            f2 = (1 +
                  (2**2)) * precision * recall / ((2**2) * precision + recall)
            f3 = (1 +
                  (3**2)) * precision * recall / ((3**2) * precision + recall)
            f5 = (1 +
                  (5**2)) * precision * recall / ((5**2) * precision + recall)
            f10 = (1 + (10**2)) * precision * recall / (
                (10**2) * precision + recall)
            f25 = (1 + (25**2)) * precision * recall / (
                (25**2) * precision + recall)
            f50 = (1 + (50**2)) * precision * recall / (
                (50**2) * precision + recall)
            f100 = (1 + (100**2)) * precision * recall / (
                (100**2) * precision + recall)

            stat_subset[stage].matrix[:, j] = [
                auroc, auprc, brier, nll, tp, fn, tn, fp, ap, an, pp, pn, n,
                tpr, fnr, tnr, fpr, ppv, fdr, npv, fomr, acc, mcr, prev, plr,
                nlr, dor, drr, darr, mrr, marr, mcc, fnlp, f1, f1_100, f1_50,
                f1_25, f1_10, f1_5, f1_3, f1_2, f2, f3, f5, f10, f25, f50, f100
            ]

        print('saving performance statistics...', flush=False)
        datasetIO.save_datamatrix(
            '{0}/stat_subset_datamatrix_{1}.txt.gz'.format(save_folder, stage),
            stat_subset[stage])

        print('printing performance statistics...', flush=False)
        print('\t'.join(['stage', stat_subset[stage].rowname] +
                        stat_subset[stage].columnlabels.tolist()),
              flush=False)
        for stat, vals in zip(stat_subset[stage].rowlabels,
                              stat_subset[stage].matrix):
            print('\t'.join([stage, stat] +
                            ['{0:1.3g}'.format(v) for v in vals]),
                  flush=False)

    print('saving evaluation statistic...', flush=False)
    objective = stat_subset['validation'].select(evaluation_statistic,
                                                 'predict')
    with open('{0}/output.json'.format(save_folder),
              mode='wt',
              encoding='utf-8',
              errors='surrogateescape') as fw:
        json.dump(objective, fw, indent=2)

    print('done logistic_regression.py', flush=False)
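A standalone look (invented counts) at the fnlp line above: with loc=1 the survival function is shifted by one, so hypergeom.logsf(tp, n, ap, pp, loc=1) equals log P(overlap >= tp) rather than log P(overlap > tp).

import numpy as np
from scipy.stats import hypergeom

tp, ap, pp, n = 40, 120, 150, 1000   # invented confusion-matrix margins

fnlp = -hypergeom.logsf(tp, n, ap, pp, loc=1) / np.log(10)
fnlp_alt = -hypergeom.logsf(tp - 1, n, ap, pp) / np.log(10)  # same value, shifting k by hand
print(fnlp, fnlp_alt)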
Example #7
def calc_f_table(x, M, n, N):
    return hypergeom.logsf(x, M, n, N) / math.log(10)
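A quick sanity check with made-up numbers that calc_f_table is just the base-10 log of the hypergeometric tail probability.

import math
from scipy.stats import hypergeom

x, M, n, N = 5, 1000, 50, 100
print(calc_f_table(x, M, n, N))              # log10 tail probability
print(math.log10(hypergeom.sf(x, M, n, N)))  # same value computed directly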
Example #8
def main(dictionaries, year, datestamp, min_score, universe, n_prior,
         min_count):

    print('begin calc_term-term_stats_from_termite.py')

    print('dictionaries: {0}, {1}'.format(dictionaries[0], dictionaries[1]))
    print('year: {0}'.format(year))
    print('datestamp: {0}'.format(datestamp))
    print('min_score: {0!s}'.format(min_score))
    print('universe: {0}'.format(universe))
    print('n_prior: {0!s}'.format(n_prior))
    print('min_count: {0!s}'.format(min_count))

    # load counts datamatrix
    # this file is generated by count_term-term_pmids_from_termite.py
    print('loading counts datamatrix...')
    row_dictionary = dictionaries[
        0]  # 'HUCELL', 'ANAT', 'INDICATION', 'HUCELLANAT', 'HUCELLANATINDICATION'
    column_dictionary = dictionaries[
        1]  # 'HUCELL', 'ANAT', 'INDICATION', 'HUCELLANAT', 'HUCELLANATINDICATION'
    counts_datamatrix_path = '{0}_{1}_datamatrix_pmidcounts_year_{2}_datestamp_{3}_minscore_{4!s}.pickle'.format(
        row_dictionary, column_dictionary, year, datestamp, min_score)
    term_term = datasetIO.load_datamatrix(counts_datamatrix_path)
    print('counts_datamatrix_path: {0}'.format(counts_datamatrix_path))
    print(term_term)

    # find term-term pairs with sufficient counts
    print('finding term-term pairs with sufficient counts...')
    I, J = (term_term.matrix >= min_count).nonzero()
    num_sufficient = I.size
    print('term-term pairs with at least {0!s} counts: {1!s}'.format(
        min_count, num_sufficient))

    # convert counts to float
    print('converting counts to float...')
    term_term.matrix = np.float64(term_term.matrix)
    term_term.updatedtypeattribute()
    for field, values in term_term.rowmeta.items():
        if values.dtype == np.int64:
            term_term.rowmeta[field] = np.float64(values)
    for field, values in term_term.columnmeta.items():
        if values.dtype == np.int64:
            term_term.columnmeta[field] = np.float64(values)

    # set universe size
    print('setting universe size...')
    if universe == 'intersectionunion' or universe == 'union':
        universe_size = term_term.rowmeta['all_count_{0}'.format(universe)][0]
    elif universe == 'medline':
        universe_size = 1e8  # 3e7
        term_term.rowmeta['term_count_medline'] = term_term.rowmeta[
            'term_count_union'].copy()
        term_term.columnmeta['term_count_medline'] = term_term.columnmeta[
            'term_count_union'].copy()
    elif universe == 'infinity':
        universe_size = 1e16
        term_term.rowmeta['term_count_infinity'] = term_term.rowmeta[
            'term_count_union'].copy()
        term_term.columnmeta['term_count_infinity'] = term_term.columnmeta[
            'term_count_union'].copy()
    else:
        raise ValueError('invalid universe')

    # create matrices for select association statistics
    print('creating matrices for select association statistics...')
    selstats = ['mcc', 'mmcc', 'cos', 'mi', 'nmi', 'iqr']
    statmats = {}
    for selstat in selstats:
        statmats[selstat] = np.zeros(term_term.shape, dtype='float64')

    # calculate association statistics and write to dataframe
    print('calculating association statistics and writing to dataframe...')
    dataframe_path = '{0}_{1}_dataframe_yr_{2}_ds_{3}_ms_{4!s}_uv_{5}_np_{6!s}_mc_{7!s}.txt.gz'.format(
        row_dictionary, column_dictionary, year, datestamp, min_score,
        universe, n_prior, min_count)
    rowmetalabels = ['term_id', 'term_name']
    rowmetaheaders = [
        '{0}_id'.format(row_dictionary), '{0}_name'.format(row_dictionary)
    ]
    columnmetalabels = ['term_id', 'term_name']
    columnmetaheaders = [
        '{0}_id'.format(column_dictionary),
        '{0}_name'.format(column_dictionary)
    ]
    statheaders = [
        'tp', 'fn', 'tn', 'fp', 'ap', 'an', 'pp', 'pn', 'n', 'tpr', 'fnr',
        'tnr', 'fpr', 'ppv', 'fdr', 'npv', 'fomr', 'acc', 'mcr', 'prev', 'plr',
        'nlr', 'dor', 'drr', 'darr', 'mrr', 'marr', 'f1', 'mcc', 'mmcc', 'cos',
        'fnlp', 'sig', 'lrr', 'lrr_se', 'lrr_lb95', 'lrr_ub95', 'drr_lb95',
        'drr_ub95', 'lor', 'lor_se', 'lor_lb95', 'lor_ub95', 'dor_lb95',
        'dor_ub95', 'mi', 'nmi', 'iqr'
    ]
    with gzip.open(dataframe_path,
                   mode='wt',
                   encoding='utf-8',
                   errors='surrogateescape') as fw:
        writelist = ['{0}_dictidname'.format(row_dictionary)
                     ] + rowmetaheaders + [
                         '{0}_dictidname'.format(column_dictionary)
                     ] + columnmetaheaders + statheaders
        fw.write('\t'.join(writelist) + '\n')
        for k, (i, j) in enumerate(zip(I, J)):
            if np.mod(k, 1000) == 0 or k + 1 == num_sufficient:
                print('working on term-term pair {0!s} of {1!s}...'.format(
                    k + 1, num_sufficient))

            # confusion matrix
            tp = term_term.matrix[i, j]
            fp = term_term.rowmeta['term_count_{0}'.format(universe)][i] - tp
            fn = term_term.columnmeta['term_count_{0}'.format(
                universe)][j] - tp
            tn = universe_size - (tp + fp + fn)

            # incorporate a random prior with effective sample size = n_prior,
            # where prior distribution conforms to empirical marginal distributions
            Rr = (tp + fp) / (fn + tn)  # ratio of rows of confusion matrix
            Rc = (tp + fn) / (fp + tn)  # ratio of columns of confusion matrix
            tp_prior = n_prior * Rc * Rr / (
                Rc * Rr + Rr + Rc + 1
            )  # solve for tp given constraints tp/fn=Rr, fp/tn=Rr, tp/fp=Rc, fn/tn=Rc, tp+fp+fn+tn=n_eff
            fp_prior = tp_prior / Rc
            fn_prior = tp_prior / Rr
            tn_prior = tp_prior / Rc / Rr
            tp += tp_prior
            fp += fp_prior
            fn += fn_prior
            tn += tn_prior

            ap = tp + fn
            an = fp + tn
            pp = tp + fp
            pn = tn + fn
            n = tn + fp + fn + tp

            tpr = tp / ap  # sensitivity, recall
            fnr = fn / ap  # 1-tpr, 1-sensitivity, 1-recall
            tnr = tn / an  # specificity
            fpr = fp / an  # 1-tnr, 1-specificity

            ppv = tp / pp  # precision
            fdr = fp / pp  # 1-ppv, 1-precision
            npv = tn / pn
            fomr = fn / pn  # 1-npv

            acc = (tp + tn) / n
            mcr = (fp + fn) / n  # 1-acc
            prev = ap / n

            plr = (tp / fp) / (
                ap / an
            )  # tpr/fpr, sensitivity/(1-specificity), ratio of positives to negatives in positive predictions relative to ratio in whole sample, higher is better
            nlr = (fn / tn) / (
                ap / an
            )  # fnr/tnr, (1-sensitivity)/specificity, ratio of positives to negatives in negative predictions relative to ratio in whole sample, lower is better
            dor = (tp / fp) / (
                fn / tn
            )  # plr/nlr, ratio of positives to negatives in positive predictions, divided by ratio of positives to negatives in negative predictions
            drr = (tp / pp) / (
                fn / pn
            )  # ppv/fomr, relative risk or risk ratio, fraction of positives in positive predictions divided by fraction of positives in negative predictions
            darr = (tp / pp) - (
                fn / pn
            )  # ppv - fomr, absolute risk reduction, fraction of positives in positive predictions minus fraction of positives in negative predictions
            mrr = (tp / pp) / (
                ap / n
            )  # ppv/prev, modified (by me) relative risk or risk ratio, fraction of positives in positive predictions divided by fraction of positives in whole sample
            marr = (tp / pp) - (
                ap / n
            )  # ppv - prev, modified (by me) absolute risk reduction, fraction of positives in positive predictions minus fraction of positives in whole sample

            f1 = (1 + (1**2)) * ppv * tpr / ((1**2) * ppv + tpr)
            mcc = (tp * tn - fp * fn) / np.sqrt(
                (tp + fp) * (tp + fn) * (tn + fp) * (tn + fn))
            mmcc = 1 - np.sqrt(
                (fp * fn) / ((tp + fp) * (tp + fn))
            )  # modified (by me), equivalent to 1 + mcc with tn forced to 0
            cos = tp / np.sqrt((tp + fp) * (tp + fn))  # ochiai
            fnlp = -hypergeom.logsf(tp, n, ap, pp, loc=1) / np.log(10)
            sig = fnlp > np.log10(term_term.size) - np.log10(0.05)

            lrr = np.log10(tp) - np.log10(tp + fp) - np.log10(fn) + np.log10(
                fn + tn)  # log10 of relative risk
            lrr_se = np.sqrt(
                fp / tp / (tp + fp) + tn / fn / (fn + tn)) / np.log(
                    10)  # standard error of log10 of relative risk
            lrr_lb95 = lrr - 1.96 * lrr_se
            lrr_ub95 = lrr + 1.96 * lrr_se
            drr_lb95 = 10**lrr_lb95
            drr_ub95 = 10**lrr_ub95

            lor = np.log10(tp) - np.log10(fp) - np.log10(fn) + np.log10(
                tn)  # log10 of odds ratio
            lor_se = np.sqrt(1 / tp + 1 / fp + 1 / fn + 1 / tn) / np.log(
                10)  # standard error of log10 of odds ratio
            lor_lb95 = lor - 1.96 * lor_se
            lor_ub95 = lor + 1.96 * lor_se
            dor_lb95 = 10**lor_lb95
            dor_ub95 = 10**lor_ub95

            mi, nmi, iqr = mutualinformation(
                tp, fp, fn, tn
            )  # mutual information, normalized mutual information, information quality ratio

            count_stats = [tp, fn, tn, fp, ap, an, pp, pn, n]
            other_stats = [
                tpr, fnr, tnr, fpr, ppv, fdr, npv, fomr, acc, mcr, prev, plr,
                nlr, dor, drr, darr, mrr, marr, f1, mcc, mmcc, cos, fnlp, sig,
                lrr, lrr_se, lrr_lb95, lrr_ub95, drr_lb95, drr_ub95, lor,
                lor_se, lor_lb95, lor_ub95, dor_lb95, dor_ub95, mi, nmi, iqr
            ]

            rowwritelist = [term_term.rowlabels[i]] + [
                term_term.rowmeta[l][i] if term_term.rowmeta[l].dtype
                == 'object' else str(term_term.rowmeta[l][i])
                for l in rowmetalabels
            ]
            columnwritelist = [term_term.columnlabels[j]] + [
                term_term.columnmeta[l][j] if term_term.columnmeta[l].dtype
                == 'object' else str(term_term.columnmeta[l][j])
                for l in columnmetalabels
            ]
            writelist = rowwritelist + columnwritelist + [
                str(s) for s in count_stats
            ] + ['{0:1.5g}'.format(s) for s in other_stats]
            fw.write('\t'.join(writelist) + '\n')

            statmats['mcc'][i, j] = mcc
            statmats['mmcc'][i, j] = mmcc
            statmats['cos'][i, j] = cos
            statmats['mi'][i, j] = mi
            statmats['nmi'][i, j] = nmi
            statmats['iqr'][i, j] = iqr

    # save matrices for select association statistics
    print('saving matrices for select association statistics...')
    for selstat in selstats:
        term_term.matrix = statmats[selstat]
        datasetIO.save_datamatrix(
            '{0}_{1}_datamatrix_{2}_yr_{3}_ds_{4}_ms_{5!s}_uv_{6}_np_{7!s}_mc_{8!s}.txt.gz'
            .format(row_dictionary, column_dictionary, selstat, year,
                    datestamp, min_score, universe, n_prior, min_count),
            term_term)
        datasetIO.save_datamatrix(
            '{0}_{1}_datamatrix_{2}_yr_{3}_ds_{4}_ms_{5!s}_uv_{6}_np_{7!s}_mc_{8!s}.pickle'
            .format(row_dictionary, column_dictionary, selstat, year,
                    datestamp, min_score, universe, n_prior, min_count),
            term_term)

    print('done calc_term-term_stats_from_termite.py')
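A small check (invented confusion-matrix counts) that the prior cell counts solved for above satisfy the stated constraints tp/fn = Rr, fp/tn = Rr, tp/fp = Rc, fn/tn = Rc and sum to n_prior.

import numpy as np

n_prior = 1.0
tp, fp, fn, tn = 30.0, 70.0, 20.0, 880.0   # invented counts
Rr = (tp + fp) / (fn + tn)                 # row ratio of the confusion matrix
Rc = (tp + fn) / (fp + tn)                 # column ratio of the confusion matrix

tp_prior = n_prior * Rc * Rr / (Rc * Rr + Rr + Rc + 1)
fp_prior = tp_prior / Rc
fn_prior = tp_prior / Rr
tn_prior = tp_prior / Rc / Rr

print(np.isclose(tp_prior + fp_prior + fn_prior + tn_prior, n_prior))            # True
print(np.isclose(tp_prior / fn_prior, Rr), np.isclose(fp_prior / tn_prior, Rr))  # True True
print(np.isclose(tp_prior / fp_prior, Rc), np.isclose(fn_prior / tn_prior, Rc))  # True True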
    def test_enrichment(self, gene_ontology):
        features_of_interest = gene_ontology.all_genes[:10]
        test_enrichment_df = gene_ontology.enrichment(features_of_interest)

        p_value_cutoff = 1000000
        min_feature_size = 3
        min_background_size = 5
        cross_reference = {}
        domains = gene_ontology.domains
        background = gene_ontology.all_genes
        n_all_genes = len(background)
        n_features_of_interest = len(features_of_interest)
        enrichment = defaultdict(dict)

        for go_term, go_genes in gene_ontology.ontology.items():
            if go_genes['domain'] not in domains:
                continue

            features_in_go = go_genes['genes'].intersection(
                features_of_interest)
            background_in_go = go_genes['genes'].intersection(background)
            too_few_features = len(features_in_go) < min_feature_size
            too_few_background = len(background_in_go) < min_background_size
            if too_few_features or too_few_background:
                continue

            # TODO D.R.Y. this
            # Survival function is more accurate on small p-values
            log_p_value = hypergeom.logsf(len(features_in_go), n_all_genes,
                                          len(background_in_go),
                                          n_features_of_interest)
            # p_value = 0 if p_value < 0 else p_value
            symbols = [cross_reference[f] if f in cross_reference else f for f
                       in features_in_go]
            enrichment['negative_log_p_value'][go_term] = -log_p_value
            enrichment['n_features_of_interest_in_go_term'][go_term] = len(
                features_in_go)
            enrichment['n_background_in_go_term'][go_term] = len(
                background_in_go)
            enrichment['n_features_total_in_go_term'][go_term] = len(
                go_genes['genes'])
            enrichment['features_of_interest_in_go_term'][
                go_term] = ','.join(features_in_go)
            enrichment['features_of_interest_in_go_term_gene_symbols'][
                go_term] = ','.join(symbols)
            enrichment['go_domain'][go_term] = go_genes['domain']
            enrichment['go_name'][go_term] = go_genes['name']
        enrichment_df = pd.DataFrame(enrichment)

        # TODO D.R.Y. this
        # Bonferroni correction
        enrichment_df['bonferonni_corrected_negative_log_p_value'] = \
            enrichment_df['negative_log_p_value'] \
            - np.log(enrichment_df.shape[0])
        ind = enrichment_df['bonferonni_corrected_negative_log_p_value'
                            ] < np.log(p_value_cutoff)
        enrichment_df = enrichment_df.loc[ind]
        true_enrichment_df = enrichment_df.sort_values(
            by=['negative_log_p_value'], ascending=False)

        pdt.assert_frame_equal(test_enrichment_df, true_enrichment_df)
    def _one_fit(self):
        if self.verbose:
            print("\nCreating synthetic doublets...")
        self._createDoublets()

        # Normalize combined augmented set
        if self.verbose:
            print("Normalizing...")
        if self.normalizer is not None:
            aug_counts = self.normalizer(
                sp_sparse.vstack((self._raw_counts, self._raw_synthetics)))
        else:
            # Follows doubletdetection.plot.normalize_counts, but uses memoized normed raw_counts
            synth_lib_size = np.sum(self._raw_synthetics, axis=1).A1
            aug_lib_size = np.concatenate([self._lib_size, synth_lib_size])
            normed_synths = self._raw_synthetics.copy()
            inplace_csr_row_normalize_l1(normed_synths)
            aug_counts = sp_sparse.vstack(
                (self._normed_raw_counts, normed_synths))
            aug_counts = np.log(aug_counts.A * np.median(aug_lib_size) + 0.1)

        self._norm_counts = aug_counts[:self._num_cells]
        self._synthetics = aug_counts[self._num_cells:]

        aug_counts = anndata.AnnData(aug_counts)
        aug_counts.obs["n_counts"] = aug_lib_size
        if self.standard_scaling is True:
            sc.pp.scale(aug_counts, max_value=15)

        if self.verbose:
            print("Running PCA...")
        sc.tl.pca(aug_counts,
                  n_comps=self.n_components,
                  random_state=self.random_state)
        if self.verbose:
            print("Clustering augmented data set...\n")
        if self.use_phenograph:
            f = io.StringIO()
            with redirect_stdout(f):
                fullcommunities, _, _ = phenograph.cluster(
                    aug_counts.obsm["X_pca"], **self.phenograph_parameters)
            out = f.getvalue()
            if self.verbose:
                print(out)
        else:
            sc.pp.neighbors(
                aug_counts,
                random_state=self.random_state,
                method="umap",
                n_neighbors=10,
            )
            sc.tl.louvain(aug_counts,
                          random_state=self.random_state,
                          resolution=4,
                          directed=False)
            fullcommunities = np.array(aug_counts.obs["louvain"], dtype=int)
        min_ID = min(fullcommunities)
        self.communities_ = fullcommunities[:self._num_cells]
        self.synth_communities_ = fullcommunities[self._num_cells:]
        community_sizes = [
            np.count_nonzero(fullcommunities == i)
            for i in np.unique(fullcommunities)
        ]
        if self.verbose:
            print("Found clusters [{0}, ... {2}], with sizes: {1}\n".format(
                min(fullcommunities), community_sizes, max(fullcommunities)))

        # Count number of fake doublets in each community and assign score
        # Number of synth/orig cells in each cluster.
        synth_cells_per_comm = collections.Counter(self.synth_communities_)
        orig_cells_per_comm = collections.Counter(self.communities_)
        community_IDs = orig_cells_per_comm.keys()
        community_scores = {
            i: float(synth_cells_per_comm[i]) /
            (synth_cells_per_comm[i] + orig_cells_per_comm[i])
            for i in community_IDs
        }
        scores = np.array([community_scores[i] for i in self.communities_])

        community_log_p_values = {
            i: hypergeom.logsf(
                synth_cells_per_comm[i],
                aug_counts.shape[0],
                self._synthetics.shape[0],
                synth_cells_per_comm[i] + orig_cells_per_comm[i],
            )
            for i in community_IDs
        }
        log_p_values = np.array(
            [community_log_p_values[i] for i in self.communities_])

        if min_ID < 0:
            scores[self.communities_ == -1] = np.nan
            log_p_values[self.communities_ == -1] = np.nan

        return scores, log_p_values
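A standalone sketch of the per-cluster hypergeometric test used for the doublet p-values above, with toy counts instead of the class attributes; note that logsf gives log P(X > k), so this is the log probability of exceeding the observed synthetic count.

from scipy.stats import hypergeom

total_cells = 3000        # original + synthetic cells in the augmented set (invented)
total_synth = 1000        # number of synthetic doublets (invented)
synth_in_cluster = 80     # synthetic cells in this cluster (invented)
orig_in_cluster = 40      # original cells in this cluster (invented)
cluster_size = synth_in_cluster + orig_in_cluster

log_p = hypergeom.logsf(synth_in_cluster, total_cells, total_synth, cluster_size)
print(log_p)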
def main(dictionaries, year, datestamp, min_score, universe, n_prior,
         min_count, association_statistic, reference_datamatrix_path,
         save_predictions):

    print('begin benchmark_term-term_stats_from_termite.py')

    print('dictionaries: {0}, {1}'.format(dictionaries[0], dictionaries[1]))
    print('year: {0}'.format(year))
    print('datestamp: {0}'.format(datestamp))
    print('min_score: {0!s}'.format(min_score))
    print('universe: {0}'.format(universe))
    print('n_prior: {0!s}'.format(n_prior))
    print('min_count: {0!s}'.format(min_count))
    print('association_statistic: {0}'.format(association_statistic))
    print('reference_datamatrix_path: {0}'.format(reference_datamatrix_path))
    print('save_predictions: {0!s}'.format(save_predictions))

    # create figures folder
    print('creating figures folder...')
    figures_folder = 'benchmark_figures'
    if not os.path.exists(figures_folder):
        os.mkdir(figures_folder)

    # load counts datamatrix
    # this file is generated by count_term-term_pmids_from_termite.py
    print('loading counts datamatrix...')
    row_dictionary = dictionaries[
        0]  # 'HUCELL', 'ANAT', 'INDICATION', 'HUCELLANAT', 'HUCELLANATINDICATION'
    column_dictionary = dictionaries[
        1]  # 'HUCELL', 'ANAT', 'INDICATION', 'HUCELLANAT', 'HUCELLANATINDICATION'
    counts_datamatrix_path = '{0}_{1}_datamatrix_pmidcounts_year_{2}_datestamp_{3}_minscore_{4!s}.pickle'.format(
        row_dictionary, column_dictionary, year, datestamp, min_score)
    term_term_counts_all = datasetIO.load_datamatrix(counts_datamatrix_path)
    print('counts_datamatrix_path: {0}'.format(counts_datamatrix_path))
    print(term_term_counts_all)

    # load association statistic datamatrix
    # this file is generated by calc_term-term_stats_from_termite.py
    print('loading association statistic datamatrix...')
    stats_datamatrix_path = '{0}_{1}_datamatrix_{2}_yr_{3}_ds_{4}_ms_{5!s}_uv_{6}_np_{7!s}_mc_{8!s}.pickle'.format(
        row_dictionary, column_dictionary, association_statistic, year,
        datestamp, min_score, universe, n_prior, min_count)
    term_term_stats_all = datasetIO.load_datamatrix(stats_datamatrix_path)
    print('stats_datamatrix_path: {0}'.format(stats_datamatrix_path))
    print(term_term_stats_all)

    # load reference datamatrix of positive and negative examples
    print('loading reference datamatrix of positive and negative examples...')
    term_term_ref = datasetIO.load_datamatrix(reference_datamatrix_path)
    print('reference_datamatrix_path: {0}'.format(reference_datamatrix_path))
    print(term_term_ref)

    # align datamatrices to reference
    print('aligning datamatrices to reference...')
    term_term_counts = term_term_counts_all.tolabels(
        rowlabels=term_term_ref.rowlabels.copy(),
        columnlabels=term_term_ref.columnlabels.copy())
    term_term_stats = term_term_stats_all.tolabels(
        rowlabels=term_term_ref.rowlabels.copy(),
        columnlabels=term_term_ref.columnlabels.copy())

    # find term-term pairs with sufficient counts
    print('finding term-term pairs with sufficient counts...')
    I, J = (term_term_counts.matrix >= min_count).nonzero()
    num_sufficient = I.size
    print('term-term pairs with at least {0!s} counts: {1!s}'.format(
        min_count, num_sufficient))

    # find row_term_dicts and column_term_dicts
    print('finding row_term_dicts and column_term_dicts')
    row_term_dicts = np.unique(term_term_stats.rowmeta['term_dict'])
    column_term_dicts = np.unique(term_term_stats.columnmeta['term_dict'])

    # calculate performance on reference examples and write to dataframe
    print(
        'calculating performance on reference examples and writing to dataframe...'
    )
    dataframe_path = 'benchmark_term-term_stats_dataframe.txt'
    metaheaders = [
        'row_dictionary', 'column_dictionary', 'year', 'datestamp',
        'min_score', 'universe', 'n_prior', 'min_count',
        'association_statistic', 'reference_datamatrix_path', 'row_term_dict',
        'column_term_dict'
    ]
    statheaders = [
        'tp', 'fn', 'tn', 'fp', 'ap', 'an', 'pp', 'pn', 'n', 'auroc', 'auprc',
        'tpr', 'fnr', 'tnr', 'fpr', 'ppv', 'fdr', 'npv', 'fomr', 'acc', 'mcr',
        'prev', 'plr', 'nlr', 'dor', 'drr', 'darr', 'mrr', 'marr', 'f1', 'mcc',
        'cos', 'fnlp', 'lrr', 'lrr_se', 'lrr_lb95', 'lrr_ub95', 'drr_lb95',
        'drr_ub95', 'lor', 'lor_se', 'lor_lb95', 'lor_ub95', 'dor_lb95',
        'dor_ub95', 'mi', 'nmi', 'iqr', 'min_value_association_statistic'
    ]
    with open(dataframe_path,
              mode='at',
              encoding='utf-8',
              errors='surrogateescape') as fw:
        writelist = metaheaders + statheaders
        fw.write('\t'.join(writelist) + '\n')
        for row_term_dict in row_term_dicts:
            row_hidxs = (term_term_stats.rowmeta['term_dict'] == row_term_dict
                         ).nonzero()[0]
            for column_term_dict in column_term_dicts:
                print('working on {0}-{1} associations...'.format(
                    row_term_dict, column_term_dict))

                # get scores and labels
                print('getting scores and labels...')
                column_hidxs = (term_term_stats.columnmeta['term_dict'] ==
                                column_term_dict).nonzero()[0]
                hit = np.logical_and(np.in1d(I, row_hidxs),
                                     np.in1d(J, column_hidxs))
                Y = term_term_ref.matrix[I[hit], J[hit]]
                X = (term_term_stats.matrix[I[hit], J[hit]]).reshape(-1, 1)
                X_prime = X.copy()
                if association_statistic == 'mcc':
                    X_prime = (X_prime + 1) / 2
                xpmin = (X_prime[X_prime > 0]).min() / 2
                xpmax = 1 - (1 - (X_prime[X_prime < 1]).max()) / 2
                X_prime[X_prime == 0] = xpmin
                X_prime[X_prime == 1] = xpmax
                logitX = np.log10(X_prime / (1 - X_prime))

                # save score histograms
                print('saving score histograms...')
                values = X.reshape(-1)
                title = 'uv_{0}_rd{1}_cd{2}, pos:{3:1.3g}, neg:{4:1.3g}'.format(
                    universe[:5], row_term_dict[:5], column_term_dict[:5],
                    np.median(values[Y]), np.median(values[~Y]))
                save_path = '{0}/{1}_{2}_hist_{3}_yr_{4}_ds_{5}_ms_{6!s}_uv_{7}_np_{8!s}_mc_{9!s}_rd_{10}_cd_{11}.png'.format(
                    figures_folder, row_dictionary, column_dictionary,
                    association_statistic, year, datestamp, min_score,
                    universe, n_prior, min_count, row_term_dict,
                    column_term_dict)
                densities, edges = plot_step_density(
                    {
                        'positive': ('-r', values[Y]),
                        'negative': (':b', values[~Y])
                    }, association_statistic, title, save_path, 'auto',
                    (values.min(), values.max()), False)
                save_path = '{0}/{1}_{2}_zoomhist_{3}_yr_{4}_ds_{5}_ms_{6!s}_uv_{7}_np_{8!s}_mc_{9!s}_rd_{10}_cd_{11}.png'.format(
                    figures_folder, row_dictionary, column_dictionary,
                    association_statistic, year, datestamp, min_score,
                    universe, n_prior, min_count, row_term_dict,
                    column_term_dict)
                densities, edges = plot_step_density(
                    {
                        'positive': ('-r', values[Y]),
                        'negative': (':b', values[~Y])
                    }, association_statistic, title, save_path, 'auto',
                    (values.min(), values.max()), False,
                    (np.percentile(values, 2.5), np.percentile(values, 97.5)))

                values = logitX.reshape(-1)
                title = 'uv_{0}_rd{1}_cd{2}, pos:{3:1.3g}, neg:{4:1.3g}'.format(
                    universe[:5], row_term_dict[:5], column_term_dict[:5],
                    np.median(values[Y]), np.median(values[~Y]))
                save_path = '{0}/{1}_{2}_hist_LOGIT{3}_yr_{4}_ds_{5}_ms_{6!s}_uv_{7}_np_{8!s}_mc_{9!s}_rd_{10}_cd_{11}.png'.format(
                    figures_folder, row_dictionary, column_dictionary,
                    association_statistic, year, datestamp, min_score,
                    universe, n_prior, min_count, row_term_dict,
                    column_term_dict)
                densities, edges = plot_step_density(
                    {
                        'positive': ('-r', values[Y]),
                        'negative': (':b', values[~Y])
                    }, 'logit ' + association_statistic, title, save_path,
                    'auto', (values.min(), values.max()), False)
                save_path = '{0}/{1}_{2}_zoomhist_LOGIT{3}_yr_{4}_ds_{5}_ms_{6!s}_uv_{7}_np_{8!s}_mc_{9!s}_rd_{10}_cd_{11}.png'.format(
                    figures_folder, row_dictionary, column_dictionary,
                    association_statistic, year, datestamp, min_score,
                    universe, n_prior, min_count, row_term_dict,
                    column_term_dict)
                densities, edges = plot_step_density(
                    {
                        'positive': ('-r', values[Y]),
                        'negative': (':b', values[~Y])
                    }, 'logit ' + association_statistic, title, save_path,
                    'auto', (values.min(), values.max()), False,
                    (np.percentile(values, 2.5), np.percentile(values, 97.5)))

                # fit logistic regression classifier
                print('fitting logistic regression classifier...')
                robust_scaler = RobustScaler().fit(logitX)
                Z = robust_scaler.transform(logitX)
                logistic_regression_model = LogisticRegression(
                    penalty='l2',
                    C=1e3,
                    intercept_scaling=1.0,
                    class_weight='balanced').fit(Z, Y)

                if logistic_regression_model.classes_[1] == 1:
                    decision_function = logistic_regression_model.decision_function(
                        Z)
                else:
                    decision_function = -logistic_regression_model.decision_function(
                        Z)
                Y_pred = decision_function > 0
                min_value_association_statistic = (X.reshape(-1)[Y_pred]).min()

                # save decision function and predicted probability histograms
                print(
                    'saving decision function and predicted probability histograms...'
                )
                values = decision_function.reshape(-1)
                title = 'uv_{0}_rd{1}_cd{2}, pos:{3:1.3g}, neg:{4:1.3g}'.format(
                    universe[:5], row_term_dict[:5], column_term_dict[:5],
                    np.median(values[Y]), np.median(values[~Y]))
                save_path = '{0}/{1}_{2}_hist_DF{3}_yr_{4}_ds_{5}_ms_{6!s}_uv_{7}_np_{8!s}_mc_{9!s}_rd_{10}_cd_{11}.png'.format(
                    figures_folder, row_dictionary, column_dictionary,
                    association_statistic, year, datestamp, min_score,
                    universe, n_prior, min_count, row_term_dict,
                    column_term_dict)
                densities, edges = plot_step_density(
                    {
                        'positive': ('-r', values[Y]),
                        'negative': (':b', values[~Y])
                    }, 'decision fun ' + association_statistic, title,
                    save_path, 'auto', (values.min(), values.max()), False)
                save_path = '{0}/{1}_{2}_zoomhist_DF{3}_yr_{4}_ds_{5}_ms_{6!s}_uv_{7}_np_{8!s}_mc_{9!s}_rd_{10}_cd_{11}.png'.format(
                    figures_folder, row_dictionary, column_dictionary,
                    association_statistic, year, datestamp, min_score,
                    universe, n_prior, min_count, row_term_dict,
                    column_term_dict)
                densities, edges = plot_step_density(
                    {
                        'positive': ('-r', values[Y]),
                        'negative': (':b', values[~Y])
                    }, 'decision fun ' + association_statistic, title,
                    save_path, 'auto', (values.min(), values.max()), False,
                    (np.percentile(values, 2.5), np.percentile(values, 97.5)))

                values = (1 / (1 + np.exp(-decision_function))).reshape(-1)
                title = 'uv_{0}_rd{1}_cd{2}, pos:{3:1.3g}, neg:{4:1.3g}'.format(
                    universe[:5], row_term_dict[:5], column_term_dict[:5],
                    np.median(values[Y]), np.median(values[~Y]))
                save_path = '{0}/{1}_{2}_hist_PP{3}_yr_{4}_ds_{5}_ms_{6!s}_uv_{7}_np_{8!s}_mc_{9!s}_rd_{10}_cd_{11}.png'.format(
                    figures_folder, row_dictionary, column_dictionary,
                    association_statistic, year, datestamp, min_score,
                    universe, n_prior, min_count, row_term_dict,
                    column_term_dict)
                densities, edges = plot_step_density(
                    {
                        'positive': ('-r', values[Y]),
                        'negative': (':b', values[~Y])
                    }, 'pred prob ' + association_statistic, title, save_path,
                    'auto', (0, 1), False)
                save_path = '{0}/{1}_{2}_zoomhist_PP{3}_yr_{4}_ds_{5}_ms_{6!s}_uv_{7}_np_{8!s}_mc_{9!s}_rd_{10}_cd_{11}.png'.format(
                    figures_folder, row_dictionary, column_dictionary,
                    association_statistic, year, datestamp, min_score,
                    universe, n_prior, min_count, row_term_dict,
                    column_term_dict)
                densities, edges = plot_step_density(
                    {
                        'positive': ('-r', values[Y]),
                        'negative': (':b', values[~Y])
                    }, 'pred prob ' + association_statistic, title, save_path,
                    'auto', (0, 1), False,
                    (np.percentile(values, 2.5), np.percentile(values, 97.5)))

                # compute roc and pr curves
                print('computing roc and pr curves...')
                fpr, tpr, thresholds = roc_curve(Y, decision_function)
                precision, recall, thresholds = precision_recall_curve(
                    Y, decision_function)

                auroc = roc_auc_score(Y, decision_function)
                auprc = average_precision_score(Y, decision_function)

                # save roc and pr curves
                print('saving roc and pr curves...')
                title = 'uv_{0}_as_{1}_rd{2}_cd{3}, auc:{4:1.3g}'.format(
                    universe[:5], association_statistic, row_term_dict[:5],
                    column_term_dict[:5], auprc)
                save_path = '{0}/{1}_{2}_prc_{3}_yr_{4}_ds_{5}_ms_{6!s}_uv_{7}_np_{8!s}_mc_{9!s}_rd_{10}_cd_{11}.png'.format(
                    figures_folder, row_dictionary, column_dictionary,
                    association_statistic, year, datestamp, min_score,
                    universe, n_prior, min_count, row_term_dict,
                    column_term_dict)
                fg, ax = plt.subplots(1, 1, figsize=(3, 2))
                ax.plot(recall, precision, '-k', linewidth=1)
                ax.set_position([0.55 / 3, 0.35 / 2, 2.1 / 3,
                                 1.3 / 2])  # left, bottom, width, height
                ax.set_title(title, fontsize=8)
                ax.set_ylabel('Precision', fontsize=8, labelpad=4)
                ax.set_xlabel('Recall', fontsize=8, labelpad=2)
                ax.set_ylim((0, 1))
                ax.set_xlim((0, 1))
                ax.tick_params(axis='both',
                               which='major',
                               bottom=True,
                               top=False,
                               left=True,
                               right=False,
                               labelbottom=True,
                               labeltop=False,
                               labelleft=True,
                               labelright=False,
                               labelsize=8)
                ax.ticklabel_format(axis='both',
                                    style='sci',
                                    scilimits=(-3, 3),
                                    fontsize=8)
                ax.yaxis.offsetText.set_fontsize(8)
                ax.xaxis.offsetText.set_fontsize(8)
                fg.savefig(save_path, transparent=True, pad_inches=0, dpi=300)
                plt.close()

                title = 'uv_{0}_as_{1}_rd{2}_cd{3}, auc:{4:1.3g}'.format(
                    universe[:5], association_statistic, row_term_dict[:5],
                    column_term_dict[:5], auroc)
                save_path = '{0}/{1}_{2}_roc_{3}_yr_{4}_ds_{5}_ms_{6!s}_uv_{7}_np_{8!s}_mc_{9!s}_rd_{10}_cd_{11}.png'.format(
                    figures_folder, row_dictionary, column_dictionary,
                    association_statistic, year, datestamp, min_score,
                    universe, n_prior, min_count, row_term_dict,
                    column_term_dict)
                fg, ax = plt.subplots(1, 1, figsize=(3, 2))
                ax.plot(fpr, tpr, '-k', linewidth=1)
                ax.set_position([0.55 / 3, 0.35 / 2, 2.1 / 3,
                                 1.3 / 2])  # left, bottom, width, height
                ax.set_title(title, fontsize=8)
                ax.set_ylabel('True positive rate', fontsize=8, labelpad=4)
                ax.set_xlabel('False positive rate', fontsize=8, labelpad=2)
                ax.set_ylim((0, 1))
                ax.set_xlim((0, 1))
                ax.tick_params(axis='both',
                               which='major',
                               bottom=True,
                               top=False,
                               left=True,
                               right=False,
                               labelbottom=True,
                               labeltop=False,
                               labelleft=True,
                               labelright=False,
                               labelsize=8)
                ax.ticklabel_format(axis='both',
                                    style='sci',
                                    scilimits=(-3, 3),
                                    fontsize=8)
                ax.yaxis.offsetText.set_fontsize(8)
                ax.xaxis.offsetText.set_fontsize(8)
                fg.savefig(save_path, transparent=True, pad_inches=0, dpi=300)
                plt.close()

                # save predictions for all term-term pairs
                if save_predictions:
                    print('saving predictions for all term-term pairs...')
                    predictions = {}
                    X_all = term_term_stats_all.matrix.reshape(-1, 1)
                    if association_statistic == 'mcc':
                        X_all = (X_all + 1) / 2
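                    # nudge exact 0s and 1s into the open interval (0, 1) so the
                    # logit transform below stays finite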
                    xamin = (X_all[X_all > 0]).min() / 2
                    xamax = 1 - (1 - (X_all[X_all < 1]).max()) / 2
                    X_all[X_all == 0] = xamin
                    X_all[X_all == 1] = xamax
                    logitX_all = np.log10(X_all / (1 - X_all))
                    Z_all = robust_scaler.transform(logitX_all)
                    if logistic_regression_model.classes_[1] == 1:
                        predictions[
                            'decision_function'] = logistic_regression_model.decision_function(
                                Z_all)
                    else:
                        predictions[
                            'decision_function'] = -logistic_regression_model.decision_function(
                                Z_all)
                    predictions['probability_positive'] = 1 / (
                        1 + np.exp(-predictions['decision_function']))
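                    # np.interp below requires increasing x-coordinates, so check
                    # that the thresholds from precision_recall_curve qualify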
                    if not np.all(np.diff(thresholds) > 0):
                        raise ValueError('thresholds not increasing')
                    predictions['precision'] = np.interp(
                        predictions['decision_function'], thresholds,
                        precision[:-1])
                    predictions['recall'] = np.interp(
                        predictions['decision_function'], thresholds,
                        recall[:-1])
                    I0, J0 = (term_term_counts_all.matrix <
                              min_count).nonzero()
                    IA, JA = (term_term_counts_all.matrix >=
                              min_count).nonzero()
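                    # I0/J0 index term pairs whose count is below min_count (their
                    # predictions are zeroed below); IA/JA index the pairs kept in
                    # the output dataframe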
                    new_stats = [
                        '{0}_dictidname'.format(row_dictionary),
                        '{0}_dictidname'.format(column_dictionary)
                    ]
                    new_stat_mat = np.concatenate(
                        (term_term_counts_all.rowlabels[IA].reshape(-1, 1),
                         term_term_counts_all.columnlabels[JA].reshape(-1, 1)),
                        1)
                    for stat, values in predictions.items():
                        term_term_stats_all.matrix = values.reshape(
                            term_term_stats_all.shape[0],
                            term_term_stats_all.shape[1])
                        term_term_stats_all.matrix[I0, J0] = 0
                        datasetIO.save_datamatrix(
                            '{0}_{1}_datamatrix_{2}_yr_{3}_ds_{4}_ms_{5!s}_uv_{6}_np_{7!s}_mc_{8!s}_as_{9}_rd_{10}_cd_{11}.txt.gz'
                            .format(row_dictionary, column_dictionary, stat,
                                    year, datestamp, min_score, universe,
                                    n_prior, min_count, association_statistic,
                                    row_term_dict, column_term_dict),
                            term_term_stats_all)
                        datasetIO.save_datamatrix(
                            '{0}_{1}_datamatrix_{2}_yr_{3}_ds_{4}_ms_{5!s}_uv_{6}_np_{7!s}_mc_{8!s}_as_{9}_rd_{10}_cd_{11}.pickle'
                            .format(row_dictionary, column_dictionary, stat,
                                    year, datestamp, min_score, universe,
                                    n_prior, min_count, association_statistic,
                                    row_term_dict, column_term_dict),
                            term_term_stats_all)
                        new_stats.append(stat)
                        new_stat_mat = np.append(
                            new_stat_mat,
                            (term_term_stats_all.matrix[IA,
                                                        JA]).reshape(-1, 1), 1)
                    new_df = pd.DataFrame(data=new_stat_mat, columns=new_stats)
                    dataframe_path = '{0}_{1}_dataframe_yr_{2}_ds_{3}_ms_{4!s}_uv_{5}_np_{6!s}_mc_{7!s}.txt.gz'.format(
                        row_dictionary, column_dictionary, year, datestamp,
                        min_score, universe, n_prior, min_count)
                    joined_dataframe_path = '{0}_{1}_dataframe_yr_{2}_ds_{3}_ms_{4!s}_uv_{5}_np_{6!s}_mc_{7!s}_as_{8}_rd_{9}_cd_{10}.txt.gz'.format(
                        row_dictionary, column_dictionary, year, datestamp,
                        min_score, universe, n_prior, min_count,
                        association_statistic, row_term_dict, column_term_dict)
                    df = pd.read_table(dataframe_path,
                                       compression='gzip',
                                       index_col=False)
                    joined_df = df.set_index(new_stats[:2]).join(
                        new_df.set_index(new_stats[:2]))
                    joined_df.sort_values(by=association_statistic,
                                          ascending=False,
                                          inplace=True)
                    joined_df.to_csv(joined_dataframe_path,
                                     sep='\t',
                                     compression='gzip')

                # compute classifier performance statistics
                # note, these are in-sample statistics
                # we are not worried about overfitting
                # because we only have one feature
                # and we are not trying to build a rigorous ML model
                # we are simply trying to answer the question,
                # given a reference set of positive and negative examples,
                # which association statistic ranks term-term pairs the best?
                print('computing classifier performance statistics...')
                tn, fp, fn, tp = confusion_matrix(Y, Y_pred).ravel()

                # incorporate a random prior with effective sample size = n_prior
                prevalence = (tp + fn) / (tn + fp + fn + tp)
                tp += n_prior * prevalence / 2
                fn += n_prior * prevalence / 2
                tn += n_prior * (1 - prevalence) / 2
                fp += n_prior * (1 - prevalence) / 2

                ap = tp + fn
                an = fp + tn
                pp = tp + fp
                pn = tn + fn
                n = tn + fp + fn + tp

                tpr = tp / ap  # sensitivity, recall
                fnr = fn / ap  # 1-tpr, 1-sensitivity, 1-recall
                tnr = tn / an  # specificity
                fpr = fp / an  # 1-tnr, 1-specificity

                ppv = tp / pp  # precision
                fdr = fp / pp  # 1-ppv, 1-precision
                npv = tn / pn
                fomr = fn / pn  # 1-npv

                acc = (tp + tn) / n
                mcr = (fp + fn) / n  # 1-acc
                prev = ap / n

                plr = (tp / fp) / (
                    ap / an
                )  # tpr/fpr, sensitivity/(1-specificity), ratio of positives to negatives in positive predictions relative to ratio in whole sample, higher is better
                nlr = (fn / tn) / (
                    ap / an
                )  # fnr/tnr, (1-sensitivity)/specificity, ratio of positives to negatives in negative predictions relative to ratio in whole sample, lower is better
                dor = (tp / fp) / (
                    fn / tn
                )  # plr/nlr, ratio of positives to negatives in positive predictions, divided by ratio of positives to negatives in negative predictions
                drr = (tp / pp) / (
                    fn / pn
                )  # ppv/fomr, relative risk or risk ratio, fraction of positives in positive predictions divided by fraction of positives in negative predictions
                darr = (tp / pp) - (
                    fn / pn
                )  # ppv - fomr, absolute risk reduction, fraction of positives in positive predictions minus fraction of positives in negative predictions
                mrr = (tp / pp) / (
                    ap / n
                )  # ppv/prev, modified (by me) relative risk or risk ratio, fraction of positives in positive predictions divided by fraction of positives in whole sample
                marr = (tp / pp) - (
                    ap / n
                )  # ppv - prev, modified (by me) absolute risk reduction, fraction of positives in positive predictions minus fraction of positives in whole sample

                f1 = (1 + (1**2)) * ppv * tpr / ((1**2) * ppv + tpr)
                mcc = (tp * tn - fp * fn) / np.sqrt(
                    (tp + fp) * (tp + fn) * (tn + fp) * (tn + fn))
                cos = tp / np.sqrt((tp + fp) * (tp + fn))  # ochiai
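                # negative log10 hypergeometric p-value of the overlap; loc=1
                # shifts the distribution so that logsf(tp, n, ap, pp, loc=1)
                # equals logsf(tp - 1, n, ap, pp), i.e. the tail P(X >= tp)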
                fnlp = -hypergeom.logsf(tp, n, ap, pp, loc=1) / np.log(10)

                lrr = np.log10(tp) - np.log10(tp + fp) - np.log10(
                    fn) + np.log10(fn + tn)  # log10 of relative risk
                lrr_se = np.sqrt(
                    fp / tp / (tp + fp) + tn / fn / (fn + tn)) / np.log(
                        10)  # standard error of log10 of relative risk
                lrr_lb95 = lrr - 1.96 * lrr_se
                lrr_ub95 = lrr + 1.96 * lrr_se
                drr_lb95 = 10**lrr_lb95
                drr_ub95 = 10**lrr_ub95

                lor = np.log10(tp) - np.log10(fp) - np.log10(fn) + np.log10(
                    tn)  # log10 of odds ratio
                lor_se = np.sqrt(1 / tp + 1 / fp + 1 / fn + 1 / tn) / np.log(
                    10)  # standard error of log10 of odds ratio
                lor_lb95 = lor - 1.96 * lor_se
                lor_ub95 = lor + 1.96 * lor_se
                dor_lb95 = 10**lor_lb95
                dor_ub95 = 10**lor_ub95

                mi, nmi, iqr = mutualinformation(
                    tp, fp, fn, tn
                )  # mutual information, normalized mutual information, information quality ratio

                # write to dataframe
                print('writing to dataframe...')
                count_stats = [tp, fn, tn, fp, ap, an, pp, pn, n]
                other_stats = [
                    auroc, auprc, tpr, fnr, tnr, fpr, ppv, fdr, npv, fomr, acc,
                    mcr, prev, plr, nlr, dor, drr, darr, mrr, marr, f1, mcc,
                    cos, fnlp, lrr, lrr_se, lrr_lb95, lrr_ub95, drr_lb95,
                    drr_ub95, lor, lor_se, lor_lb95, lor_ub95, dor_lb95,
                    dor_ub95, mi, nmi, iqr, min_value_association_statistic
                ]

                writelist = [
                    row_dictionary, column_dictionary, year, datestamp,
                    str(min_score), universe,
                    str(n_prior),
                    str(min_count), association_statistic,
                    reference_datamatrix_path, row_term_dict, column_term_dict
                ]
                writelist += [str(s) for s in count_stats]
                writelist += ['{0:1.5g}'.format(s) for s in other_stats]
                fw.write('\t'.join(writelist) + '\n')

    print('done benchmark_term-term_stats_from_termite.py')
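The loc=1 trick used for fnlp above is simply a shifted survival function: hypergeom.logsf(k, M, n, N, loc=1) equals hypergeom.logsf(k - 1, M, n, N), the log of P(X >= k) rather than P(X > k). A minimal sketch checking this identity with made-up confusion-matrix totals:

import numpy as np
from scipy.stats import hypergeom

# made-up confusion-matrix totals, for illustration only
tp, n, ap, pp = 40, 1000, 100, 120

fnlp_loc = -hypergeom.logsf(tp, n, ap, pp, loc=1) / np.log(10)
fnlp_shift = -hypergeom.logsf(tp - 1, n, ap, pp) / np.log(10)
print(fnlp_loc, fnlp_shift)  # the two -log10 p-values agree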
    def enrichment(self, features_of_interest, background=None,
                   p_value_cutoff=1000000, cross_reference=None,
                   min_feature_size=3, min_background_size=5,
                   domain=None):
        """Bonferroni-corrected hypergeometric p-values of GO enrichment

        Calculates hypergeometric enrichment of the features of interest,
        in each GO category.

        Parameters
        ----------
        features_of_interest : list-like
            List of features. Must match the identifiers in the ontology
            database exactly, i.e. if your ontology database is ENSEMBL ids,
            then you can only provide those and not common names like "RBFOX2"
        background : list-like, optional
            Background genes to use. It is best to use a relevant background
            such as all expressed genes. If None, defaults to all genes.
        p_value_cutoff : float, optional
            Maximum accepted Bonferroni-corrected p-value
        cross_reference : dict-like, optional
            A mapping of gene ids to gene symbols, e.g. a pandas Series of
            ENSEMBL genes e.g. ENSG00000139675 to gene symbols e.g HNRNPA1L2
        min_feature_size : int, optional
            Minimum number of features of interest overlapping in a GO Term,
            to calculate enrichment
        min_background_size : int, optional
            Minimum number of features in the background overlapping a GO Term
        domain : str or list, optional
            Only calculate GO enrichment for a particular GO category or
            subset of categories. Valid domains:
            'biological_process', 'molecular_function', 'cellular_component'

        Returns
        -------
        enrichment_df : pandas.DataFrame
            A (n_go_categories, columns) DataFrame of the enrichment scores

        Raises
        ------
        ValueError
            If features of interest and background do not overlap,
            or invalid GO domains are given
        """
        cross_reference = {} if cross_reference is None else cross_reference
        background = self.all_genes if background is None else background
        if len(set(background) & set(features_of_interest)) == 0:
            raise ValueError('Features of interest and background do not '
                             'overlap! Not calculating GO enrichment')
        if len(set(features_of_interest) & set(self.all_genes)) == 0:
            raise ValueError('Features of interest do not overlap with GO term '
                             'gene ids. Not calculating GO enrichment.')
        domains = self.domains
        valid_domains = ",".join("'{}'".format(x) for x in self.domains)

        # TODO more elegant type check
        if isinstance(domain, str):
            if domain not in self.domains:
                raise ValueError(
                    "'{}' is not a valid GO domain. "
                    "Only {} are acceptable".format(domain, valid_domains))
            domains = frozenset([domain])
        elif isinstance(domain, Iterable):
            if len(set(domain) & self.domains) == 0:
                raise ValueError(
                    "'{}' are not valid GO domains. "
                    "Only {} are acceptable".format(
                        ",".join("'{}'".format(x) for x in domain),
                        valid_domains))
            domains = frozenset(domain)

        n_all_genes = len(background)
        n_features_of_interest = len(features_of_interest)
        enrichment = defaultdict(dict)

        for go_term, go_genes in self.ontology.items():
            if go_genes['domain'] not in domains:
                continue

            features_in_go = go_genes['genes'].intersection(
                features_of_interest)
            background_in_go = go_genes['genes'].intersection(background)
            too_few_features = len(features_in_go) < min_feature_size
            too_few_background = len(background_in_go) < min_background_size
            if too_few_features or too_few_background:
                continue

            # Survival function is more accurate on small p-values
            log_p_value = hypergeom.logsf(len(features_in_go), n_all_genes,
                                          len(background_in_go),
                                          n_features_of_interest)
            # p_value = 0 if p_value < 0 else p_value
            symbols = [cross_reference[f] if f in cross_reference else f for f
                       in features_in_go]
            enrichment['negative_log_p_value'][go_term] = -log_p_value
            enrichment['n_features_of_interest_in_go_term'][go_term] = len(
                features_in_go)
            enrichment['n_background_in_go_term'][go_term] = len(
                background_in_go)
            enrichment['n_features_total_in_go_term'][go_term] = len(
                go_genes['genes'])
            enrichment['features_of_interest_in_go_term'][
                go_term] = ','.join(features_in_go)
            enrichment['features_of_interest_in_go_term_gene_symbols'][
                go_term] = ','.join(symbols)
            enrichment['go_domain'][go_term] = go_genes['domain']
            enrichment['go_name'][go_term] = go_genes['name']
        enrichment_df = pd.DataFrame(enrichment)

        if enrichment_df.empty:
            warnings.warn('No GO categories enriched in provided features')
            return

        # Bonferroni correction (negative log p-values are in natural-log
        # units, matching hypergeom.logsf)
        enrichment_df['bonferroni_corrected_negative_log_p_value'] = \
            enrichment_df['negative_log_p_value'] \
            - np.log(enrichment_df.shape[0])
        # keep GO terms whose corrected p-value is at most p_value_cutoff,
        # i.e. -log(p_corrected) >= -log(cutoff)
        ind = enrichment_df['bonferroni_corrected_negative_log_p_value'
                            ] >= -np.log(p_value_cutoff)
        enrichment_df = enrichment_df.loc[ind]
        enrichment_df = enrichment_df.sort_values(
            by='negative_log_p_value', ascending=False)

        return enrichment_df
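Stripped of the bookkeeping, each GO term in the loop above reduces to one survival-function call. A self-contained toy version of that calculation (every gene identifier below is invented):

import numpy as np
from scipy.stats import hypergeom

background = {'g{0}'.format(i) for i in range(60)}      # 60 background genes
go_term_genes = {'g{0}'.format(i) for i in range(12)}   # 12 annotated to the term
features_of_interest = {'g0', 'g1', 'g2', 'g3', 'g4', 'g40', 'g41', 'g42'}

features_in_go = go_term_genes & features_of_interest
background_in_go = go_term_genes & background

# the same call made inside enrichment() above
log_p = hypergeom.logsf(len(features_in_go), len(background),
                        len(background_in_go), len(features_of_interest))
print(-log_p)               # negative natural-log p-value, as stored in the dataframe
print(-log_p / np.log(10))  # the same quantity in -log10 units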
Exemple #14
0
def enrichment_significance(term_row):
    return LOG10_FACTOR * hypergeom.logsf(term_row['hit_count']-1, term_row['universe'],
                                          term_row['term_count'], term_row['list_size'])
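A hypothetical call, assuming the module defines LOG10_FACTOR as 1 / np.log(10) (the constant is not shown in this excerpt) and that term_row is a dict or pandas Series with the four fields used above:

term_row = {'hit_count': 20, 'universe': 15000,
            'term_count': 300, 'list_size': 400}
# under the LOG10_FACTOR assumption, this is log10 of P(hits >= 20)
print(enrichment_significance(term_row))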
    def _one_fit(self):
        print("\nCreating synthetic doublets...")
        self._createDoublets()

        # Normalize combined augmented set
        print("Normalizing...")
        if self.normalizer is not None:
            aug_counts = self.normalizer(
                np.append(self._raw_counts, self._raw_synthetics, axis=0))
        else:
            # Follows doubletdetection.plot.normalize_counts, but uses memoized normed raw_counts
            synth_lib_size = np.sum(self._raw_synthetics, axis=1)
            aug_lib_size = np.concatenate([self._lib_size, synth_lib_size])
            normed_synths = self._raw_synthetics / synth_lib_size[:,
                                                                  np.newaxis]
            aug_counts = np.concatenate(
                [self._normed_raw_counts, normed_synths], axis=0)
            aug_counts = np.log(aug_counts * np.median(aug_lib_size) + 0.1)

        self._norm_counts = aug_counts[:self._num_cells]
        self._synthetics = aug_counts[self._num_cells:]

        print("Running PCA...")
        # Get phenograph results
        pca = PCA(n_components=self.n_components,
                  random_state=self.random_state)
        reduced_counts = pca.fit_transform(aug_counts)
        print("Clustering augmented data set with Phenograph...\n")
        fullcommunities, _, _ = phenograph.cluster(
            reduced_counts, **self.phenograph_parameters)
        min_ID = min(fullcommunities)
        self.communities_ = fullcommunities[:self._num_cells]
        self.synth_communities_ = fullcommunities[self._num_cells:]
        community_sizes = [
            np.count_nonzero(fullcommunities == i)
            for i in np.unique(fullcommunities)
        ]
        print("Found communities [{0}, ... {2}], with sizes: {1}\n".format(
            min(fullcommunities), community_sizes, max(fullcommunities)))

        # Count number of fake doublets in each community and assign score
        # Number of synth/orig cells in each cluster.
        synth_cells_per_comm = collections.Counter(self.synth_communities_)
        orig_cells_per_comm = collections.Counter(self.communities_)
        community_IDs = orig_cells_per_comm.keys()
        community_scores = {
            i: float(synth_cells_per_comm[i]) /
            (synth_cells_per_comm[i] + orig_cells_per_comm[i])
            for i in community_IDs
        }
        scores = np.array([community_scores[i] for i in self.communities_])

        community_log_p_values = {
            i:
            hypergeom.logsf(synth_cells_per_comm[i], aug_counts.shape[0],
                            self._synthetics.shape[0],
                            synth_cells_per_comm[i] + orig_cells_per_comm[i])
            for i in community_IDs
        }
        log_p_values = np.array(
            [community_log_p_values[i] for i in self.communities_])

        if min_ID < 0:
            scores[self.communities_ == -1] = np.nan
            log_p_values[self.communities_ == -1] = np.nan

        return scores, log_p_values
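The community score above is complemented by a hypergeometric tail test: how unlikely is the observed number of synthetic doublets in a community, given the overall fraction of synthetics in the augmented data set? A toy calculation with invented counts:

import numpy as np
from scipy.stats import hypergeom

total_cells = 1250      # 1000 real cells + 250 synthetic doublets, clustered together
total_synth = 250
comm_size = 80          # size of one Phenograph community
synth_in_comm = 60      # synthetic doublets that landed in it

# same call as in _one_fit above
log_p = hypergeom.logsf(synth_in_comm, total_cells, total_synth, comm_size)
print(log_p / np.log(10))  # strongly negative log10 p-value: doublet-enriched community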