Exemple #1
0
    study, pop = read_geneset(study_fn, pop_fn, compare=args.compare)
    assoc = read_associations(assoc_fn)

    methods = ["bonferroni", "sidak", "holm"]
    if args.fdr:
        methods.append("fdr")

    starttime = time.clock()
    # obo_dag = GODag(obo_file=args.obo)
    obo_dag = read_data()
    endtime = time.clock()
    f = open('E:/time.txt', 'w')
    f.write(str(endtime - starttime))
    f.close()
    print(str(endtime - starttime))

    # save_data(obo_dag)
    # obo_dag=read_data()
    g = GOEnrichmentStudy(pop,
                          assoc,
                          obo_dag,
                          alpha=args.alpha,
                          study=study,
                          methods=methods)
    endtime = time.clock()
    f = open('E:/time1.txt', 'w')
    f.write(str(endtime - starttime))
    f.close()
    # print (endtime - starttime)
    g.print_summary(min_ratio=min_ratio, indent=args.indent, pval=args.pval)
Exemple #2
0
    p.add_option('--fdr', dest='fdr', default=False,
                action='store_true',
                help="calculate the false discovery rate (alternative to the Bonferroni correction)")
    p.add_option('--indent', dest='indent', default=False,
                action='store_true', help="indent GO terms")

    (opts, args) = p.parse_args()
    bad = check_bad_args(args)
    if bad:
        print bad
        sys.exit(p.print_help())

    alpha = float(opts.alpha) if opts.alpha else 0.05

    min_ratio = opts.ratio
    if not min_ratio is None:
        assert 1 <= min_ratio <= 2

    study_fn, pop_fn, assoc_fn = args
    study, pop = read_geneset(study_fn, pop_fn, compare=opts.compare)
    assoc = read_associations(assoc_fn)

    methods=["bonferroni", "sidak", "holm"]
    if opts.fdr:
        methods.append("fdr")

    obo_dag = GODag(obo_file="gene_ontology.1_2.obo")
    g = GOEnrichmentStudy(pop, assoc, obo_dag, alpha=alpha, study=study, methods=methods)
    g.print_summary(min_ratio=min_ratio, indent=opts.indent)

                 action='store_true',
                 help="Calculate the false discovery rate (alt. to the "
                 "Bonferroni but slower)")
    p.add_option('--indent', dest='indent', default=False,
                 action='store_true', help="indent GO terms")

    (opts, args) = p.parse_args()
    bad = check_bad_args(args)
    if bad:
        print bad
        sys.exit(p.print_help())

    min_ratio = opts.ratio
    if min_ratio is not None:
        assert 1 <= min_ratio <= 2

    assert 0 < opts.alpha < 1, "Test-wise alpha must fall between (0, 1)"

    study_fn, pop_fn, assoc_fn = args
    study, pop = read_geneset(study_fn, pop_fn, compare=opts.compare)
    assoc = read_associations(assoc_fn)

    methods = ["bonferroni", "sidak", "holm"]
    if opts.fdr:
        methods.append("fdr")

    obo_dag = GODag(obo_file="gene_ontology.1_2.obo")
    g = GOEnrichmentStudy(pop, assoc, obo_dag, alpha=opts.alpha,
                          study=study, methods=methods)
    g.print_summary(min_ratio=min_ratio, indent=opts.indent, pval=opts.pval)
Exemple #4
0
    study, pop = read_geneset(study_fn, pop_fn, compare=args.compare)
    print("Study: {0} vs. Population {1}".format(len(study), len(pop)), file=sys.stderr)

    if not args.compare:  # sanity check
        if len(pop) < len(study):
            exit("\nERROR: The study file contains more elements than the population file. "
                 "Please check that the study file is a subset of the population file.\n")
        # check the fraction of genomic ids that overlap between study
        # and population
        overlap = float(len(study & pop)) / len(study)
        if 0.7 < overlap < 0.95:
            sys.stderr.write("\nWARNING: only {} fraction of genes/proteins in study are found in "
                             "the population  background.\n\n".format(overlap))
        if overlap <= 0.7:
            exit("\nERROR: only {} of genes/proteins in the study are found in the "
                 "background population. Please check.\n".format(overlap))

    assoc = read_associations(assoc_fn)

    methods = ["bonferroni", "sidak", "holm"]
    if args.fdr:
        methods.append("fdr")

    obo_dag = GODag(obo_file=args.obo)
    propagate_counts = not args.no_propagate_counts
    g = GOEnrichmentStudy(pop, assoc, obo_dag,
                          propagate_counts=propagate_counts,
                          alpha=args.alpha,
                          study=study, methods=methods)
    g.print_summary(min_ratio=min_ratio, indent=args.indent, pval=args.pval)
Exemple #5
0
def FindEnrichment(obo, ontology, gi_info, gis):
    """Runs a GO enrichment analysis for the genes behind a list of gis.

    Args:
        obo: Value passed to ``GODag`` as ``obo_file``.
        ontology: Mapping with an ``'associations'`` entry (gene id -> iterable
            of GO term numbers; keys are converted with ``int``) and a
            ``'population'`` entry (iterable of gene ids).
        gi_info: Object whose ``find`` method is called with a
            ``{'gis': {'$in': [...]}}`` filter and yields mappings that carry
            a ``'gis'`` list and, optionally, a ``'gene_id'``.
        gis: String of integer gis. Groups are separated by ``|`` and the gis
            inside a group by ``,``.

    Returns:
        ``{'results': [...]}`` where every entry is a dict with the keys
        ``id``, ``term``, ``study_ratio``, ``population_ratio``, ``fold``,
        ``pval``, ``all_genes`` and ``genes``, ordered from best to worst
        combined fold/size rank.
    """
    # Parses the list of gis into lists of gi groups, and creates a mapping from
    # gi to group.
    groups = [[int(gi) for gi in group.split(',')] for group in gis.split('|')]
    gi_to_group = {gi: i for i, group in enumerate(groups) for gi in group}
    # Creates a gi to index map for sorting purposes.
    gis_index = {
        g: i
        for i, g in enumerate(gi for group in groups for gi in group)
    }
    # A real list (not a dict view) so that the query filter is valid on both
    # Python 2 and Python 3; the set is built once for the membership filter
    # applied to every record below.
    queried_gis = list(gis_index)
    queried_gi_set = set(queried_gis)
    # Creates a gi to gene id mapping. Every gi listed in a matching document
    # is mapped, including gis that were not part of the query.
    gi_to_gene = {
        gi: d['gene_id']
        for d in gi_info.find({'gis': {'$in': queried_gis}})
        if 'gene_id' in d and d['gene_id']
        for gi in d['gis']
    }
    # Creates a gene to gis list mapping.
    gene_to_gis = defaultdict(set)
    for gi, gene in gi_to_gene.items():
        gene_to_gis[gene].add(gi)
    genes = set(gi_to_gene.values())

    # Parameters for goatools:

    # Test-wise alpha for multiple testing
    alpha = 0.05
    # Family-wise alpha (whole experiment), only keep records whose Bonferroni
    # p-value is not greater than this value.
    pval = 0.05
    # only show values where the difference between study and population ratios is
    # greater than this. useful for excluding GO categories with small differences
    # but containing large numbers of genes. should be a value between 1 and 2.
    min_ratio = None

    # Modifies the associations dictionary to become consistent with the actual term
    # ids and gene ids.
    associations = {
        int(k): set('GO:%s' % str(go).zfill(7) for go in v)
        for k, v in ontology['associations'].items()
    }
    population = set(ontology['population'])
    study = genes

    methods = ['bonferroni']  # Other methods: sidak, holm, fdr
    obo_dag = GODag(obo_file=obo)

    # Performs the enrichment analysis
    g = GOEnrichmentStudy(population,
                          associations,
                          obo_dag,
                          alpha=alpha,
                          study=study,
                          methods=methods)

    # Creates a mapping from GO term to gene ids. This is done after the analysis
    # since the analysis modifies the associations dictionary to include parent
    # terms.
    reverse_associations = defaultdict(set)
    for gene, terms in associations.items():
        for go in terms:
            reverse_associations[go].add(gene)

    def input_position(gi):
        # Position of the gi in the flattened input string.
        return gis_index[gi]

    # Inserts each record into the final json array of results.
    results = []
    for record in g.results:
        # This is done by default in goatools when print to standard output.
        record.update_remaining_fields(min_ratio=min_ratio)
        if record.p_bonferroni > pval or not record.is_ratio_different:
            continue
        # Only returns enriched records.
        if record.enrichment != 'e':
            continue
        result = {}
        result['id'] = record.id
        level = obo_dag[result['id']].level
        # Filters by GO term depth to avoid GO terms that are too general.
        if 0 <= level < 2:
            continue
        # Fills in remaining fields.
        result['term'] = record.description
        study_ratio = _EvaluateFraction(record.ratio_in_study)
        population_ratio = _EvaluateFraction(record.ratio_in_pop)
        result['study_ratio'] = '%.4f' % study_ratio
        result['population_ratio'] = '%.4f' % population_ratio
        result['fold'] = study_ratio / population_ratio
        result['pval'] = '%.3g' % record.p_bonferroni

        # Uses the reverse associations dictionary to retrieve the list of gis that
        # matched the term.
        matched_genes = reverse_associations[result['id']] & genes
        matched_gis = set(gi for gene in matched_genes
                          for gi in gene_to_gis[gene])
        matched_gis &= queried_gi_set
        # Descending input order: when the per-group dict below is filled, the
        # last write wins, so the earliest gi of each group is the one kept.
        matched_gis = sorted(matched_gis, key=input_position, reverse=True)
        result['all_genes'] = matched_gis
        # A representative set of genes is also returned in which only one gi per
        # group is returned.
        matched_groups = {gi_to_group[gi]: gi for gi in matched_gis}
        result['genes'] = sorted(matched_groups.values(), key=input_position)
        results.append(result)

    # Results are sorted based on how well the fold change between the study and
    # population ratios compares to other results, and how the number of
    # representative genes compares to other results.
    # Rank = first position of the value in the ascending list (ties share the
    # lowest rank); precomputed so the sort key is a plain dictionary lookup.
    fold_rank = {}
    for i, fold in enumerate(sorted(r['fold'] for r in results)):
        fold_rank.setdefault(fold, i)
    length_rank = {}
    for i, length in enumerate(sorted(len(r['genes']) for r in results)):
        length_rank.setdefault(length, i)
    results.sort(
        key=lambda r:
        (fold_rank[r['fold']] + length_rank[len(r['genes'])]) / 2.0,
        reverse=True)
    return {'results': results}
Exemple #6
0
            '$in': list(forward_loci)
        }}, {'_id': True})
    ]


# Example driver: builds a study set and a population set from two
# DTASelect-filter result files and runs a GO enrichment study on them.

# One of Ana's Sample
study_indexDB = '/home/gstupp/01_2015_mass_spec/H1_11082014/1108_Gly1_2014_12_15_15_29205/dtaselect_results_sfp0.01_p2/DTASelect-filter.txt'
study_ps = build_proteins_from_peptides.main(study_indexDB)
# Flatten the per-protein 'forward_loci' lists into one set of loci.
study_loci = set(chain(*[x['forward_loci'] for x in study_ps]))
study = setup_study_pop(study_loci)

# One of Sandip's microbiome samples
pop_indexDB = '/home/gstupp/01_2015_mass_spec/120314_SC_sampleH1sol_HCD35/DTASelect-filter.txt'
pop_ps = build_proteins_from_peptides.main(pop_indexDB)
pop_loci = set(chain(*[x['forward_loci'] for x in pop_ps]))
# NOTE(review): `study_loci and pop_loci` is a boolean expression, not a set
# operation: it evaluates to `pop_loci` when `study_loci` is non-empty and to
# the (empty) `study_loci` otherwise. If the population is meant to contain
# the study loci as well, `study_loci | pop_loci` is probably what was
# intended - confirm before relying on the results.
pop = setup_study_pop(study_loci and pop_loci)

# set up hash -> GO matching
# NOTE(review): same `and` expression as above; see the note there.
assoc = setup_association(study_loci and pop_loci)

obo_dag = GODag(obo_file=os.path.expanduser("~/go/go-basic.obo"))

# NOTE(review): `study_sub` is not used below (the full `study` is passed to
# GOEnrichmentStudy) - presumably left over from testing on a subset.
study_sub = study[:1000]
g = GOEnrichmentStudy(pop,
                      assoc,
                      obo_dag,
                      alpha=0.05,
                      study=study,
                      methods=["fdr"])
# No ratio / p-value filtering and no indentation of the printed GO terms.
g.print_summary(min_ratio=None, indent=False, pval=None)
Exemple #7
0
def check_enrichment(study_fn,
                     pop_fn,
                     assoc_fn,
                     print_summary=False,
                     save_summary=True,
                     savepath=None,
                     obo_dag=None):
    """Run a GO enrichment study on the given files and report the results.

    The file names come from the function arguments, but the tuning options
    (``--alpha``, ``--pval``, ``--compare``, ``--ratio``, ``--fdr``,
    ``--indent``) are still parsed from the process command line via
    ``optparse``; any positional command-line arguments are ignored.

    Args:
        study_fn: Path of the study gene list.
        pop_fn: Path of the population gene list.
        assoc_fn: Path of the gene -> GO term association file.
        print_summary: When true, print the summary through
            ``GOEnrichmentStudy.print_summary``.
        save_summary: When true, write the results with
            ``GOEnrichmentStudy.wr_tsv``.
        savepath: Destination of the saved summary. Defaults to the study
            file's name prefixed with ``enrichment_`` in the same directory.
        obo_dag: Already loaded GO DAG; when ``None`` one is built from
            ``go-basic.obo``.

    Raises:
        AssertionError: If ``--ratio`` is outside [1, 2] or ``--alpha`` is
            outside (0, 1).
        SystemExit: With status 1 if ``check_bad_args`` rejects the file
            names.
    """
    p = optparse.OptionParser(__doc__)

    p.add_option('--alpha',
                 default=0.05,
                 type="float",
                 help="Test-wise alpha for multiple testing "
                 "[default: %default]")
    p.add_option('--pval',
                 default=None,
                 type="float",
                 help="Family-wise alpha (whole experiment), only print out "
                 "Bonferroni p-value is less than this value. "
                 "[default: %default]")
    p.add_option('--compare',
                 dest='compare',
                 default=False,
                 action='store_true',
                 help="the population file as a comparison group. if this "
                 "flag is specified, the population is used as the study "
                 "plus the `population/comparison`")
    p.add_option('--ratio',
                 dest='ratio',
                 type='float',
                 default=None,
                 help="only show values where the difference between study "
                 "and population ratios is greater than this. useful for "
                 "excluding GO categories with small differences, but "
                 "containing large numbers of genes. should be a value "
                 "between 1 and 2. ")
    p.add_option('--fdr',
                 dest='fdr',
                 default=False,
                 action='store_true',
                 help="Calculate the false discovery rate (alt. to the "
                 "Bonferroni but slower)")
    p.add_option('--indent',
                 dest='indent',
                 default=False,
                 action='store_true',
                 help="indent GO terms")

    # Only the options are taken from the command line; the positional
    # arguments are replaced by the file names passed to this function.
    (opts, args) = p.parse_args()
    args = [study_fn, pop_fn, assoc_fn]
    bad = check_bad_args(args)
    if bad:
        print(bad)
        # print_help() returns None, so passing its result to sys.exit()
        # would report success; exit with an explicit failure status instead.
        p.print_help()
        sys.exit(1)

    min_ratio = opts.ratio
    if min_ratio is not None:
        assert 1 <= min_ratio <= 2

    assert 0 < opts.alpha < 1, "Test-wise alpha must fall between (0, 1)"

    study, pop = read_geneset(study_fn, pop_fn, compare=opts.compare)
    assoc = read_associations(assoc_fn)
    methods = ["bonferroni", "sidak", "holm"]
    if opts.fdr:
        methods.append("fdr")
    if obo_dag is None:
        obo_file = "go-basic.obo"
        obo_dag = GODag(obo_file=obo_file)
    g = GOEnrichmentStudy(pop,
                          assoc,
                          obo_dag,
                          alpha=opts.alpha,
                          methods=methods)

    results = g.run_study(study)

    if print_summary:
        g.print_summary(results,
                        min_ratio=min_ratio,
                        indent=opts.indent,
                        pval=opts.pval)

    if save_summary:
        if savepath is None:
            # Prefix only the final path component. Splitting at the last "/"
            # (instead of str.replace on the basename) leaves directory names
            # that happen to contain the basename untouched.
            directory, slash, basename = study_fn.rpartition("/")
            savepath = directory + slash + "enrichment_" + basename
        g.wr_tsv(savepath, results)
Exemple #8
0
def term_enrichment(pop_genes,
                    gene_sets,
                    obo_path,
                    assoc_path,
                    folder,
                    condition,
                    regenerate=False,
                    test_sig=True,
                    **kwargs):
    """Compute (or load cached) GO enrichment tables for several gene sets.

    For every entry of ``gene_sets`` the gene list is written to
    ``{folder}/go_enrich/{condition}_{name}_list.txt`` and the enrichment
    table is cached next to it as ``..._enrich.txt``. A cached table is reused
    unless ``regenerate`` is true or the table cannot be read.

    Args:
        pop_genes: Iterable of population gene ids.
        gene_sets: Mapping of gene-set name -> iterable of gene ids.
        obo_path: Path of the OBO file passed to ``GODag``.
        assoc_path: Path of the association file passed to
            ``read_associations``.
        folder: Base output folder.
        condition: Label used in the output file names.
        regenerate: When true, ignore cached tables and rerun every study.
        test_sig: When true, print for every gene class with at least three
            terms the median term depth, the overall median and the result of
            ``d_ks_test`` against all depths.
        **kwargs: Forwarded to ``GOEnrichmentStudy``; ``alpha`` defaults to
            0.05 and ``methods`` to bonferroni/sidak/holm. ``alpha`` is also
            the Bonferroni cut-off applied to each table.

    Returns:
        The per-class term tables returned by ``all_subsets``, concatenated
        with the class names as the outer index level.
    """
    kwargs.setdefault('alpha', 0.05)
    kwargs.setdefault('methods', ["bonferroni", "sidak", "holm"])

    def build_study():
        # Setup goatools enrichment
        assoc = read_associations(assoc_path)
        go_dag = GODag(obo_file=obo_path)
        return GOEnrichmentStudy(set(pop_genes), assoc, go_dag, **kwargs)

    # Built up front when regenerating; otherwise only on the first cache
    # miss, so a missing or unreadable cache file no longer ends in a
    # NameError.
    study = build_study() if regenerate else None

    go_enrich = OrderedDict()

    for gc, genes in gene_sets.items():
        # Write the gene list to a file
        out_path = '{}/go_enrich/{}_{}_list.txt'.format(folder, condition, gc)
        write_gene_list(genes, out_path)

        # Built directly rather than with str.replace('list', 'enrich') so
        # that folder, condition or gene-set names containing "list" are left
        # alone.
        enrich_path = '{}/go_enrich/{}_{}_enrich.txt'.format(
            folder, condition, gc)

        enrich = None
        if not regenerate:
            try:
                enrich = pd.read_csv(enrich_path, sep='\t', index_col=0)
            except (FileNotFoundError, ValueError):
                # No usable cached table: fall through and recompute it.
                enrich = None

        if enrich is None:
            if study is None:
                study = build_study()
            records = study.run_study(frozenset(genes))
            study.wr_tsv(enrich_path, records)
            enrich = pd.read_csv(enrich_path, sep='\t', index_col=0)

        go_enrich[gc] = enrich[enrich.p_bonferroni < kwargs['alpha']]

    # Only the per-class term tables are needed here; the sizes are ignored.
    _, go_terms = all_subsets(go_enrich)
    all_terms = pd.concat(go_terms.values())
    all_depths = all_terms['depth']
    all_median = np.median(all_depths)

    if test_sig:
        for gene_class, terms in go_terms.items():
            d = terms['depth'].values
            if len(d) < 3:
                print(gene_class, ' Skipped')
                continue

            # Choose the test direction from how the class median compares
            # with the overall median.
            t_med = np.median(d)
            if t_med > all_median:
                alternative = 'less'
            elif t_med < all_median:
                alternative = 'greater'
            else:
                alternative = 'two.sided'
            ks_p = d_ks_test(d, all_depths, alternative=alternative)
            print(gene_class, t_med, all_median, ks_p, sep='\t')

    return pd.concat(go_terms.values(), keys=go_terms.keys())