study, pop = read_geneset(study_fn, pop_fn, compare=args.compare) assoc = read_associations(assoc_fn) methods = ["bonferroni", "sidak", "holm"] if args.fdr: methods.append("fdr") starttime = time.clock() # obo_dag = GODag(obo_file=args.obo) obo_dag = read_data() endtime = time.clock() f = open('E:/time.txt', 'w') f.write(str(endtime - starttime)) f.close() print(str(endtime - starttime)) # save_data(obo_dag) # obo_dag=read_data() g = GOEnrichmentStudy(pop, assoc, obo_dag, alpha=args.alpha, study=study, methods=methods) endtime = time.clock() f = open('E:/time1.txt', 'w') f.write(str(endtime - starttime)) f.close() # print (endtime - starttime) g.print_summary(min_ratio=min_ratio, indent=args.indent, pval=args.pval)
p.add_option('--fdr', dest='fdr', default=False, action='store_true', help="calculate the false discovery rate (alternative to the Bonferroni correction)") p.add_option('--indent', dest='indent', default=False, action='store_true', help="indent GO terms") (opts, args) = p.parse_args() bad = check_bad_args(args) if bad: print bad sys.exit(p.print_help()) alpha = float(opts.alpha) if opts.alpha else 0.05 min_ratio = opts.ratio if not min_ratio is None: assert 1 <= min_ratio <= 2 study_fn, pop_fn, assoc_fn = args study, pop = read_geneset(study_fn, pop_fn, compare=opts.compare) assoc = read_associations(assoc_fn) methods=["bonferroni", "sidak", "holm"] if opts.fdr: methods.append("fdr") obo_dag = GODag(obo_file="gene_ontology.1_2.obo") g = GOEnrichmentStudy(pop, assoc, obo_dag, alpha=alpha, study=study, methods=methods) g.print_summary(min_ratio=min_ratio, indent=opts.indent)
action='store_true', help="Calculate the false discovery rate (alt. to the " "Bonferroni but slower)") p.add_option('--indent', dest='indent', default=False, action='store_true', help="indent GO terms") (opts, args) = p.parse_args() bad = check_bad_args(args) if bad: print bad sys.exit(p.print_help()) min_ratio = opts.ratio if min_ratio is not None: assert 1 <= min_ratio <= 2 assert 0 < opts.alpha < 1, "Test-wise alpha must fall between (0, 1)" study_fn, pop_fn, assoc_fn = args study, pop = read_geneset(study_fn, pop_fn, compare=opts.compare) assoc = read_associations(assoc_fn) methods = ["bonferroni", "sidak", "holm"] if opts.fdr: methods.append("fdr") obo_dag = GODag(obo_file="gene_ontology.1_2.obo") g = GOEnrichmentStudy(pop, assoc, obo_dag, alpha=opts.alpha, study=study, methods=methods) g.print_summary(min_ratio=min_ratio, indent=opts.indent, pval=opts.pval)
study, pop = read_geneset(study_fn, pop_fn, compare=args.compare) print("Study: {0} vs. Population {1}".format(len(study), len(pop)), file=sys.stderr) if not args.compare: # sanity check if len(pop) < len(study): exit("\nERROR: The study file contains more elements than the population file. " "Please check that the study file is a subset of the population file.\n") # check the fraction of genomic ids that overlap between study # and population overlap = float(len(study & pop)) / len(study) if 0.7 < overlap < 0.95: sys.stderr.write("\nWARNING: only {} fraction of genes/proteins in study are found in " "the population background.\n\n".format(overlap)) if overlap <= 0.7: exit("\nERROR: only {} of genes/proteins in the study are found in the " "background population. Please check.\n".format(overlap)) assoc = read_associations(assoc_fn) methods = ["bonferroni", "sidak", "holm"] if args.fdr: methods.append("fdr") obo_dag = GODag(obo_file=args.obo) propagate_counts = not args.no_propagate_counts g = GOEnrichmentStudy(pop, assoc, obo_dag, propagate_counts=propagate_counts, alpha=args.alpha, study=study, methods=methods) g.print_summary(min_ratio=min_ratio, indent=args.indent, pval=args.pval)
def FindEnrichment(obo, ontology, gi_info, gis):
    """Run a GO enrichment analysis for a set of gis and rank the enriched terms.

    Args:
        obo: Value handed to ``GODag(obo_file=...)``.
        ontology: Mapping with an ``'associations'`` entry (key convertible
            to ``int`` -> iterable of GO numbers, which are rendered as
            ``GO:NNNNNNN``) and a ``'population'`` entry (iterable used as
            the background population).
        gi_info: Object exposing ``find(query)`` that yields dict-like
            documents with a ``'gis'`` list and, optionally, a ``'gene_id'``.
            NOTE(review): looks like a MongoDB collection -- confirm against
            the caller.
        gis: String of gi groups. Groups are separated by ``'|'`` and the gis
            inside a group by ``','`` (for example ``"1,2|3"``).

    Returns:
        ``{'results': [...]}``. Each entry is a dict with the keys ``id``,
        ``term``, ``study_ratio``, ``population_ratio`` (both formatted with
        ``'%.4f'``), ``fold`` (study ratio divided by population ratio),
        ``pval`` (Bonferroni p-value formatted with ``'%.3g'``), ``all_genes``
        (every matching gi, in reverse input order) and ``genes`` (one
        representative gi per input group, in input order). Entries are
        ordered, best first, by the mean of their rank by fold and their rank
        by number of representative gis.
    """
    # Parses the list of gis into lists of gi groups, and creates a mapping
    # from gi to group.
    groups = [[int(gi) for gi in group.split(',')] for group in gis.split('|')]
    gi_to_group = {gi: i for i, group in enumerate(groups) for gi in group}

    # Creates a gi to index map for sorting purposes.
    gis_index = {
        gi: i for i, gi in enumerate(gi for group in groups for gi in group)
    }

    # A real list (not a dict view) so the query value is a plain sequence on
    # both Python 2 and Python 3; the set is reused for every result below.
    queried_gis = list(gis_index)
    queried_gi_set = set(queried_gis)

    # Creates a gi to gene id mapping. Documents without a usable gene id are
    # ignored.
    gi_to_gene = {
        gi: d['gene_id']
        for d in gi_info.find({'gis': {'$in': queried_gis}})
        if 'gene_id' in d and d['gene_id']
        for gi in d['gis']
    }

    # Creates a gene to gis list mapping.
    gene_to_gis = defaultdict(set)
    for gi, gene in gi_to_gene.items():
        gene_to_gis[gene].add(gi)
    genes = set(gi_to_gene.values())

    # Parameters for goatools:
    # Test-wise alpha for multiple testing.
    alpha = 0.05
    # Family-wise alpha (whole experiment): only records whose Bonferroni
    # p-value does not exceed this value are returned.
    pval = 0.05
    # Only show values where the difference between study and population
    # ratios is greater than this. Useful for excluding GO categories with
    # small differences but containing large numbers of genes. Should be a
    # value between 1 and 2, or None to disable.
    min_ratio = None

    # Modifies the associations dictionary to become consistent with the
    # actual term ids and gene ids.
    associations = {
        int(k): set('GO:%s' % str(go).zfill(7) for go in v)
        for k, v in ontology['associations'].items()
    }
    population = set(ontology['population'])
    study = genes
    methods = ['bonferroni']  # Other methods: sidak, holm, fdr
    obo_dag = GODag(obo_file=obo)

    # Performs the enrichment analysis.
    g = GOEnrichmentStudy(population, associations, obo_dag, alpha=alpha,
                          study=study, methods=methods)

    # Creates a mapping from GO term to gene ids. This is done after the
    # analysis since the analysis modifies the associations dictionary to
    # include parent terms.
    reverse_associations = defaultdict(set)
    for gene, terms in associations.items():
        for go in terms:
            reverse_associations[go].add(gene)

    # Inserts each record into the final json array of results.
    results = []
    for record in g.results:
        # This is done by default in goatools when printing to standard
        # output.
        record.update_remaining_fields(min_ratio=min_ratio)
        if record.p_bonferroni > pval or not record.is_ratio_different:
            continue
        # Only returns enriched records.
        if record.enrichment != 'e':
            continue

        # Filters by GO term depth to avoid GO terms that are too general.
        # Negative levels are let through.
        level = obo_dag[record.id].level
        if 0 <= level < 2:
            continue

        # Fills in the remaining fields.
        study_ratio = _EvaluateFraction(record.ratio_in_study)
        population_ratio = _EvaluateFraction(record.ratio_in_pop)
        result = {
            'id': record.id,
            'term': record.description,
            'study_ratio': '%.4f' % study_ratio,
            'population_ratio': '%.4f' % population_ratio,
            'fold': study_ratio / population_ratio,
            'pval': '%.3g' % record.p_bonferroni,
        }

        # Uses the reverse associations dictionary to retrieve the list of
        # gis that matched the term, restricted to the gis that were asked
        # for (a document may list additional gis).
        matched_genes = reverse_associations[record.id] & genes
        matched_gis = set(
            gi for gene in matched_genes for gi in gene_to_gis[gene])
        matched_gis &= queried_gi_set
        all_matched = sorted(
            matched_gis, key=lambda gi: gis_index[gi], reverse=True)
        result['all_genes'] = all_matched

        # A representative set of genes is also returned in which only one gi
        # per group is returned. Because all_matched is in reverse input
        # order, the gi listed first in the input overwrites the others and
        # becomes the representative of its group.
        matched_groups = {gi_to_group[gi]: gi for gi in all_matched}
        result['genes'] = sorted(
            matched_groups.values(), key=lambda gi: gis_index[gi])
        results.append(result)

    # Results are sorted based on how well the fold change between the study
    # and population ratios compares to other results, and how the number of
    # representative genes compares to other results.
    def _first_positions(sorted_values):
        # Maps each value to the index of its first occurrence, i.e. what
        # list.index() would return, without a linear scan per lookup.
        positions = {}
        for position, value in enumerate(sorted_values):
            positions.setdefault(value, position)
        return positions

    fold_rank = _first_positions(sorted(r['fold'] for r in results))
    size_rank = _first_positions(sorted(len(r['genes']) for r in results))
    results.sort(
        key=lambda r: (fold_rank[r['fold']] + size_rank[len(r['genes'])]) / 2.0,
        reverse=True)
    return {'results': results}
'$in': list(forward_loci) }}, {'_id': True}) ] # One of Ana's Sample study_indexDB = '/home/gstupp/01_2015_mass_spec/H1_11082014/1108_Gly1_2014_12_15_15_29205/dtaselect_results_sfp0.01_p2/DTASelect-filter.txt' study_ps = build_proteins_from_peptides.main(study_indexDB) study_loci = set(chain(*[x['forward_loci'] for x in study_ps])) study = setup_study_pop(study_loci) # One of Sandip's microbiome samples pop_indexDB = '/home/gstupp/01_2015_mass_spec/120314_SC_sampleH1sol_HCD35/DTASelect-filter.txt' pop_ps = build_proteins_from_peptides.main(pop_indexDB) pop_loci = set(chain(*[x['forward_loci'] for x in pop_ps])) pop = setup_study_pop(study_loci and pop_loci) # set up hash -> GO matching assoc = setup_association(study_loci and pop_loci) obo_dag = GODag(obo_file=os.path.expanduser("~/go/go-basic.obo")) study_sub = study[:1000] g = GOEnrichmentStudy(pop, assoc, obo_dag, alpha=0.05, study=study, methods=["fdr"]) g.print_summary(min_ratio=None, indent=False, pval=None)
def check_enrichment(study_fn, pop_fn, assoc_fn, print_summary=False,
                     save_summary=True, savepath=None, obo_dag=None):
    """Run a GO enrichment study on the given files and report the results.

    Tuning options (``--alpha``, ``--pval``, ``--compare``, ``--ratio``,
    ``--fdr``, ``--indent``) are still read from the process command line via
    ``optparse``; any positional command-line arguments are discarded and
    replaced by ``study_fn``, ``pop_fn`` and ``assoc_fn``.

    Args:
        study_fn: Study gene list, passed to ``read_geneset``.
        pop_fn: Population gene list, passed to ``read_geneset``.
        assoc_fn: Association file, passed to ``read_associations``.
        print_summary: When true, print the results with ``print_summary``.
        save_summary: When true, write the results with ``wr_tsv``.
        savepath: Destination for the TSV. Defaults to ``study_fn`` with
            ``"enrichment_"`` prepended to its final ``/``-separated
            component.
        obo_dag: Pre-built GO DAG. When ``None``, ``"go-basic.obo"`` is loaded
            through ``GODag``.

    Raises:
        SystemExit: With status 1 when ``check_bad_args`` reports a problem.
        AssertionError: When ``--ratio`` is outside [1, 2] or ``--alpha`` is
            outside (0, 1).
    """
    p = optparse.OptionParser(__doc__)

    p.add_option('--alpha', default=0.05, type="float",
                 help="Test-wise alpha for multiple testing "
                 "[default: %default]")
    p.add_option('--pval', default=None, type="float",
                 help="Family-wise alpha (whole experiment), only print out "
                 "Bonferroni p-value is less than this value. "
                 "[default: %default]")
    p.add_option('--compare', dest='compare', default=False,
                 action='store_true',
                 help="the population file as a comparison group. if this "
                 "flag is specified, the population is used as the study "
                 "plus the `population/comparison`")
    p.add_option('--ratio', dest='ratio', type='float', default=None,
                 help="only show values where the difference between study "
                 "and population ratios is greater than this. useful for "
                 "excluding GO categories with small differences, but "
                 "containing large numbers of genes. should be a value "
                 "between 1 and 2. ")
    p.add_option('--fdr', dest='fdr', default=False,
                 action='store_true',
                 help="Calculate the false discovery rate (alt. to the "
                 "Bonferroni but slower)")
    p.add_option('--indent', dest='indent', default=False,
                 action='store_true', help="indent GO terms")

    # Only the options are taken from the command line; the file names come
    # from the function arguments.
    (opts, args) = p.parse_args()
    args = [study_fn, pop_fn, assoc_fn]

    bad = check_bad_args(args)
    if bad:
        print(bad)
        # print_help() returns None, so passing it to sys.exit() would report
        # success; exit with an explicit failure status instead.
        p.print_help()
        sys.exit(1)

    min_ratio = opts.ratio
    if min_ratio is not None:
        assert 1 <= min_ratio <= 2

    assert 0 < opts.alpha < 1, "Test-wise alpha must fall between (0, 1)"

    study_fn, pop_fn, assoc_fn = args
    study, pop = read_geneset(study_fn, pop_fn, compare=opts.compare)
    assoc = read_associations(assoc_fn)

    methods = ["bonferroni", "sidak", "holm"]
    if opts.fdr:
        methods.append("fdr")

    if obo_dag is None:
        obo_file = "go-basic.obo"
        obo_dag = GODag(obo_file=obo_file)

    g = GOEnrichmentStudy(pop, assoc, obo_dag, alpha=opts.alpha,
                          methods=methods)
    results = g.run_study(study)

    if print_summary:
        g.print_summary(results, min_ratio=min_ratio, indent=opts.indent,
                        pval=opts.pval)

    if save_summary:
        if savepath is None:
            # Prefix only the last path component. str.replace() on the whole
            # path would also rewrite directories that share the file's name.
            directory, separator, basename = study_fn.rpartition("/")
            savepath = directory + separator + "enrichment_" + basename
        g.wr_tsv(savepath, results)
def term_enrichment(pop_genes, gene_sets, obo_path, assoc_path, folder,
                    condition, regenerate=False, test_sig=True, **kwargs):
    """Compute (or load cached) GO enrichment tables for several gene sets.

    For every entry of ``gene_sets`` the gene list is written to
    ``<folder>/go_enrich/<condition>_<name>_list.txt`` and the enrichment
    table is read from, or written to,
    ``<folder>/go_enrich/<condition>_<name>_enrich.txt``.

    Args:
        pop_genes: Iterable of genes forming the background population.
        gene_sets: Mapping of gene-set name -> iterable of genes.
        obo_path: Value handed to ``GODag(obo_file=...)``.
        assoc_path: Value handed to ``read_associations``.
        folder: Base directory used to build the output paths.
        condition: Label embedded in the output file names.
        regenerate: When true, always rerun the enrichment and overwrite the
            cached table. When false, the cached table is used if it can be
            read, and the enrichment is run only as a fallback.
        test_sig: When true, print a per-subset comparison of term depths
            against the depths of all terms (via ``d_ks_test``).
        **kwargs: Forwarded to ``GOEnrichmentStudy``. ``alpha`` defaults to
            0.05 and ``methods`` to ``["bonferroni", "sidak", "holm"]``;
            ``alpha`` is also the Bonferroni cut-off applied to each table.

    Returns:
        The frames in the second value returned by ``all_subsets``,
        concatenated with their keys as the outer index level.
    """
    kwargs.setdefault('alpha', 0.05)
    kwargs.setdefault('methods', ["bonferroni", "sidak", "holm"])

    def _build_study():
        # Loading the associations and the DAG is the expensive part, so it
        # is done at most once per call.
        assoc = read_associations(assoc_path)
        go_dag = GODag(obo_file=obo_path)
        pop = set(pop_genes)
        return GOEnrichmentStudy(pop, assoc, go_dag, **kwargs)

    # Built eagerly when a rerun is certain. Otherwise it is built lazily on
    # the first cache miss, so a missing cache file no longer fails on an
    # unbound study object.
    study = _build_study() if regenerate else None

    go_enrich = OrderedDict()
    for gc, genes in gene_sets.items():
        # Write the gene list to a file.
        out_path = '{}/go_enrich/{}_{}_list.txt'.format(folder, condition, gc)
        write_gene_list(genes, out_path)

        # Built directly rather than via out_path.replace('list', 'enrich'),
        # which would also rewrite any 'list' inside folder, condition or gc.
        enrich_path = '{}/go_enrich/{}_{}_enrich.txt'.format(
            folder, condition, gc)

        enrich = None
        if not regenerate:
            try:
                enrich = pd.read_csv(enrich_path, sep='\t', index_col=0)
            except (FileNotFoundError, ValueError):
                # Missing or unreadable cache: fall through and recompute.
                enrich = None

        if enrich is None:
            if study is None:
                study = _build_study()
            study_results = study.run_study(frozenset(genes))
            study.wr_tsv(enrich_path, study_results)
            enrich = pd.read_csv(enrich_path, sep='\t', index_col=0)

        # Keep only the terms significant after Bonferroni correction.
        enrich = enrich[enrich.p_bonferroni < kwargs['alpha']]
        go_enrich[gc] = enrich

    # Only the per-subset term tables are used; the sizes are ignored.
    _go_sizes, go_terms = all_subsets(go_enrich)

    all_terms = pd.concat(list(go_terms.values()))
    all_depths = all_terms['depth']
    all_median = np.median(all_depths)

    if test_sig:
        for gene_class, terms in go_terms.items():
            d = terms['depth'].values
            if len(d) < 3:
                # Too few terms to compare against the overall distribution.
                print(gene_class, ' Skipped')
                continue

            # The alternative is chosen from where the subset's median depth
            # lies relative to the overall median.
            t_med = np.median(d)
            if t_med > all_median:
                alternative = 'less'
            elif t_med < all_median:
                alternative = 'greater'
            else:
                alternative = 'two.sided'
            ks_p = d_ks_test(d, all_depths, alternative=alternative)
            print(gene_class, t_med, all_median, ks_p, sep='\t')

    return pd.concat(list(go_terms.values()), keys=list(go_terms.keys()))