Code example #1
File: GO.py  Project: wangdi2014/cgat
def pairwiseGOEnrichment(results_per_genelist, labels, test_ontology, go2info,
                         options):
    '''compute pairwise enrichment between sets.

    The purpose of this method is to find categories that are
    differentially enriched between a pair of gene lists.

    Each shared category is compared with Fisher's exact test on a
    2x2 contingency table of foreground counts.

    The assumption is that the background set is the same for all
    gene lists.

    The workflow is thus::

       for each combination of two gene lists:
           for each GO category shared by both lists:
               get counts in foreground, total counts of foreground
               run Fisher's exact test on the 2x2 table
               save the P-value
           apply FDR correction - output significant differences.
    '''

    dicts = [dict(x) for x in results_per_genelist]

    PairResult = collections.namedtuple(
        "PairResult",
        "goid set1 set2 counts1 total1 pvalue1 qvalue1 counts2 total2 pvalue2 qvalue2 pvalue qvalue description"
    )

    outfile = getFileName(options,
                          go=test_ontology,
                          section='summary',
                          set="pairs")

    outfile.write(
        "set1\tset2\ttotal1\ttotal2\tshared\tskipped\ttested\tsignificant\tinsignificant\n"
    )

    results = []

    # number of unordered pairs of gene lists to be compared
    total = len(dicts) * (len(dicts) - 1) // 2

    iteration = 0

    min_observed_counts = options.pairs_min_observed_counts

    # iterate over all unordered pairs of gene lists; the indices x and y
    # must stay aligned with ``labels``, so the lists are not re-sorted here
    for x, genelist1 in enumerate(dicts):

        x_go_categories = set(genelist1.keys())
        for y, genelist2 in enumerate(dicts[:x]):

            iteration += 1
            if iteration % 10 == 0:
                E.info("iteration: %i/%i (%5.2f%%)" %
                       (iteration, total, 100.0 * iteration / total))

            y_go_categories = set(genelist2.keys())

            shared = x_go_categories.intersection(y_go_categories)

            c = E.Counter()

            for category in shared:
                c.shared += 1
                xx = genelist1[category]
                yy = genelist2[category]

                # discard all tests with few observations in the observed
                # counts
                if xx.mSampleCountsCategory < min_observed_counts and yy.mSampleCountsCategory < min_observed_counts:
                    c.skipped += 1
                    continue

                observed = (xx.mSampleCountsCategory, yy.mSampleCountsCategory)

                aa, bb, cc, dd = \
                    (xx.mSampleCountsCategory,
                     yy.mSampleCountsCategory,
                     xx.mSampleCountsTotal - xx.mSampleCountsCategory,
                     yy.mSampleCountsTotal - yy.mSampleCountsCategory)

                if cc == dd == 0:
                    c.skipped += 1
                    continue

                c.tested += 1

                # fisher_exact returns (odds ratio, two-sided P-value)
                oddsratio, pvalue = scipy.stats.fisher_exact(
                    numpy.array(((aa, bb), (cc, dd))))

                if pvalue < 0.05:
                    c.significant_pvalue += 1
                else:
                    c.insignificant_pvalue += 1

                results.append(
                    PairResult._make(
                        (category, labels[x], labels[y],
                         xx.mSampleCountsCategory, xx.mSampleCountsTotal,
                         xx.mPValue, xx.mQValue, yy.mSampleCountsCategory,
                         yy.mSampleCountsTotal, yy.mPValue, yy.mQValue, pvalue,
                         1.0, go2info[category].mDescription)))

            outfile.write("\t".join(
                map(str, (labels[x], labels[y], len(x_go_categories),
                          len(y_go_categories), c.shared, c.skipped, c.tested,
                          c.significant_pvalue, c.insignificant_pvalue))) + "\n")
    if options.output_filename_pattern:
        outfile.close()

    if options.fdr:
        pvalues = [x.pvalue for x in results]

        if options.qvalue_method == "storey":

            # compute fdr via Storey's method
            try:
                fdr_data = Stats.doFDR(pvalues)

            except ValueError as msg:
                E.warn("failure in q-value computation: %s" % msg)
                E.warn("reverting to Bonferroni correction")
                method = "bonf"
                fdr_data = Stats.FDRResult()
                l = float(len(pvalues))
                fdr_data.mQValues = [min(1.0, x * l) for x in pvalues]

            qvalues = fdr_data.mQValues
        else:
            qvalues = R['p.adjust'](pvalues, method=options.qvalue_method)

        # update qvalues
        results = [x._replace(qvalue=y) for x, y in zip(results, qvalues)]

    outfile = getFileName(options,
                          go=test_ontology,
                          section='pairs',
                          set="pairs")

    outfile.write("\t".join(PairResult._fields) + "\n")
    for result in results:
        outfile.write("\t".join(map(str, result)) + "\n")

    if options.output_filename_pattern:
        outfile.close()
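
For each shared category, the loop above builds a 2x2 contingency table from the foreground counts of the two gene lists, scores it with Fisher's exact test, and later falls back to a Bonferroni-style correction if Storey's q-value computation fails. Below is a minimal, self-contained sketch of that core comparison; all counts and the number of tests are invented for illustration:

import numpy
import scipy.stats

# hypothetical counts for one GO category in two gene lists
counts1, total1 = 12, 200   # gene list 1: in category, total foreground
counts2, total2 = 3, 180    # gene list 2: in category, total foreground

# rows: in category / not in category, columns: list 1 / list 2
table = numpy.array([[counts1, counts2],
                     [total1 - counts1, total2 - counts2]])

# fisher_exact returns (odds ratio, two-sided P-value)
oddsratio, pvalue = scipy.stats.fisher_exact(table)

# Bonferroni-style fallback used above: q = min(1, p * number of tests)
n_tests = 500               # hypothetical number of tested categories
qvalue = min(1.0, pvalue * n_tests)
print(oddsratio, pvalue, qvalue)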
Code example #2
File: GO.py  Project: wangdi2014/cgat
def computeFDRs(go_results, foreground, background, options, test_ontology,
                gene2go, go2info):
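    """Compute per-category FDR estimates for ``go_results``.

    Depending on ``options.qvalue_method``, q-values are computed with
    Storey's method (falling back to a Bonferroni correction on failure),
    with an empirical FDR based on sampled gene lists, or with R's
    ``p.adjust``.

    Returns a dictionary mapping each category to an ``(fdr, a, b)`` tuple,
    the samples (``None`` unless the empirical method is used), and the
    method that was actually applied.
    """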

    pairs = sorted(go_results.mResults.items())

    E.info("calculating the FDRs using method `%s`" % options.qvalue_method)

    samples = None

    observed_min_pvalues = [
        min(x[1].mProbabilityOverRepresentation,
            x[1].mProbabilityUnderRepresentation) for x in pairs
    ]

    fdrs = {}

    method = options.qvalue_method

    if options.qvalue_method == "storey":

        # compute fdr via Storey's method
        try:
            fdr_data = Stats.doFDR(observed_min_pvalues)

        except ValueError as msg:
            E.warn("failure in q-value computation: %s" % msg)
            E.warn("reverting to Bonferroni correction")
            method = "bonf"
            fdr_data = Stats.FDRResult()
            l = float(len(observed_min_pvalues))
            fdr_data.mQValues = [min(1.0, x * l) for x in observed_min_pvalues]

        for pair, qvalue in zip(pairs, fdr_data.mQValues):
            fdrs[pair[0]] = (qvalue, 1.0, 1.0)

    elif options.qvalue_method == "empirical":
        assert options.sample > 0, "requiring a sample size of > 0"

        #######################################################################
        # sampling
        # for each GO-category:
        # get maximum and minimum counts in x samples -> calculate minimum/maximum significance
        # get average and stdev counts in x samples -> calculate z-scores for
        # test set
        samples, simulation_min_pvalues = getSamples(gene2go, foreground,
                                                     background, options,
                                                     test_ontology, go2info)

        # compute P-values from sampling
        observed_min_pvalues.sort()
        observed_min_pvalues = numpy.array(observed_min_pvalues)

        sample_size = options.sample

        for k, v in pairs:

            if k in samples:
                s = samples[k]
            else:
                raise KeyError("category %s not in samples" % k)

            # calculate values for z-score
            if s.mStddev > 0:
                zscore = abs(float(v.mSampleCountsCategory) -
                             s.mMean) / s.mStddev
            else:
                zscore = 0.0

            #############################################################
            # FDR:
            # For each p-Value p at node n:
            #   a = average number of nodes in each simulation run with P-Value < p
            #           this can be obtained from the array of all p-values and all nodes
            #           simply divided by the number of samples.
            #      aka: expfpos=experimental false positive rate
            #   b = number of nodes in observed data, that have a P-Value of less than p.
            #      aka: pos=positives in observed data
            #   fdr = a/b
            pvalue = v.mPValue

            # calculate values for FDR:
            # nfdr = number of entries with P-Value better than node.
            a = 0
            while a < len(simulation_min_pvalues) and \
                    simulation_min_pvalues[a] < pvalue:
                a += 1
            a = float(a) / float(sample_size)
            b = 0
            while b < len(observed_min_pvalues) and \
                    observed_min_pvalues[b] < pvalue:
                b += 1

            if b > 0:
                fdr = min(1.0, float(a) / float(b))
            else:
                fdr = 1.0

            fdrs[k] = (fdr, a, b)
    else:
        qvalues = R['p.adjust'](observed_min_pvalues,
                                method=options.qvalue_method)
        fdr_data = Stats.FDRResult()
        fdr_data.mQValues = list(qvalues)
        for pair, qvalue in zip(pairs, fdr_data.mQValues):
            fdrs[pair[0]] = (qvalue, 1.0, 1.0)

    return fdrs, samples, method
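
The empirical branch above estimates, for each observed P-value p, a = the average number of categories per simulated gene list with a P-value below p, and b = the number of observed categories with a P-value below p, and reports fdr = a / b capped at 1. Below is a small sketch of the same calculation using bisect on sorted P-value arrays; all values and the sample size are invented for illustration:

import bisect

# hypothetical sorted minimum P-values pooled over all simulation runs,
# the sorted observed minimum P-values, and the number of simulation runs
simulation_min_pvalues = sorted([0.001, 0.02, 0.03, 0.2, 0.4, 0.6])
observed_min_pvalues = sorted([0.0005, 0.01, 0.04, 0.3])
sample_size = 2

def empirical_fdr(pvalue):
    # a: expected number of false positives per simulation run with P < pvalue
    a = bisect.bisect_left(simulation_min_pvalues, pvalue) / float(sample_size)
    # b: number of observed categories with P < pvalue
    b = bisect.bisect_left(observed_min_pvalues, pvalue)
    return min(1.0, a / b) if b > 0 else 1.0

for p in observed_min_pvalues:
    print(p, empirical_fdr(p))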