def fromCounts(filename):
    '''build annotator results from a tab-separated table with counts.

    The file must start with the header line
    ``track<TAB>annotation<TAB>observed<TAB>counts``. Every following line
    holds one track/annotation pair; the ``counts`` column is a
    comma-separated list of sampled counts.

    Returns a list of :class:`Engine.AnnotatorResult` objects, one per
    data line, with ``counter`` set to "na".

    Raises ValueError if the header line does not match.
    '''
    annotator_results = []
    with IOTools.openFile(filename, "r") as infile:
        E.info("loading data")

        header = infile.readline()
        if not header == "track\tannotation\tobserved\tcounts\n":
            # report the file name rather than the repr of the file object
            raise ValueError("%s not a counts file: got %s" %
                             (filename, header))

        for line in infile:
            # rstrip instead of line[:-1]: a final line without a trailing
            # newline must not lose its last character.
            track, annotation, observed, counts = \
                line.rstrip("\n").split("\t")
            # numpy.float was only an alias of the builtin float and has
            # been removed from NumPy; float64 is the same dtype.
            samples = numpy.array(
                list(map(float, counts.split(","))),
                dtype=numpy.float64)
            observed = float(observed)
            annotator_results.append(Engine.AnnotatorResult(
                track=track,
                annotation=annotation,
                counter="na",
                observed=observed,
                samples=samples))

    return annotator_results
def readFromBedOld(filenames, name="track"):
    '''read Segment Lists from one or more bed files.

    Segment lists are grouped by *track* and *contig*.

    filenames
       iterable of bed file names.

    name
       how the grouping key of a bed entry is obtained: ``"track"``
       uses the ``name`` entry of the track (``bed.mTrack["name"]``),
       ``"name"`` uses the first optional field (``bed.mFields[0]``).
       If the lookup raises a TypeError the key "default" is used.

    Returns a two-level defaultdict, key -> contig ->
    :class:`Engine.SegmentList`.

    Raises ValueError if *name* is neither "track" nor "name".
    '''
    segment_lists = collections.defaultdict(
        lambda: collections.defaultdict(Engine.SegmentList))

    if name == "track":
        def get_key(bed):
            return bed.mTrack["name"]
    elif name == "name":
        def get_key(bed):
            return bed.mFields[0]
    else:
        raise ValueError("unknown name: '%s'" % name)

    for filename in filenames:
        # context manager so that every input file is closed again
        with IOTools.openFile(filename, "r") as infile:
            for bed in Bed.iterator(infile):
                try:
                    key = get_key(bed)
                except TypeError:
                    # lookup target is not subscriptable
                    key = "default"
                segment_lists[key][bed.contig].add(bed.start, bed.end)

    return segment_lists
def readDescriptions(options):
    '''read descriptions from tab separated file.

    The file name is taken from ``options.input_filename_descriptions``.
    Lines starting with ``#`` are skipped. The first remaining line is
    the header; on every later line the first column is the key and the
    remaining columns are its descriptions.

    Returns a tuple ``(description_header, descriptions,
    description_width)``: the header columns (without the key column), a
    dictionary mapping key to its list of descriptions and the number
    of description columns. If no file name is set, ``([], {}, 0)`` is
    returned.

    Raises AssertionError if the number of columns is inconsistent.
    '''
    description_header, descriptions, description_width = [], {}, 0

    if options.input_filename_descriptions:
        E.info("reading descriptions from %s" %
               options.input_filename_descriptions)

        with IOTools.openFile(options.input_filename_descriptions) as inf:
            first = True
            for line in inf:
                if line.startswith("#"):
                    continue
                # rstrip instead of line[:-1]: a final line without a
                # trailing newline must not lose its last character.
                data = line.rstrip("\n").split("\t")

                if description_width:
                    assert len(data) - 1 == description_width, \
                        "inconsistent number of descriptions in %s" %\
                        options.input_filename_descriptions
                else:
                    # width is fixed by the first line that has
                    # at least one description column
                    description_width = len(data) - 1

                if first:
                    description_header = data[1:]
                    first = False
                else:
                    descriptions[data[0]] = data[1:]

        assert len(description_header) == description_width, \
            "number of descriptions (%i) inconsistent with header (%s) in %s" % \
            (description_width, len(description_header),
             options.input_filename_descriptions)

    return description_header, descriptions, description_width
def readDescriptions(options):
    '''read descriptions from tab separated file.

    The file name is taken from ``options.input_filename_descriptions``.
    Lines starting with ``#`` are skipped. The first remaining line is
    the header; on every later line the first column is the key and the
    remaining columns are its descriptions.

    Returns a tuple ``(description_header, descriptions,
    description_width)``: the header columns (without the key column), a
    dictionary mapping key to its list of descriptions and the number
    of description columns. If no file name is set, ``([], {}, 0)`` is
    returned.

    Raises AssertionError if the number of columns is inconsistent.
    '''
    description_header, descriptions, description_width = [], {}, 0

    if options.input_filename_descriptions:
        E.info("reading descriptions from %s" %
               options.input_filename_descriptions)

        with IOTools.openFile(options.input_filename_descriptions) as inf:
            first = True
            for line in inf:
                if line.startswith("#"):
                    continue
                # rstrip instead of line[:-1]: a final line without a
                # trailing newline must not lose its last character.
                data = line.rstrip("\n").split("\t")

                if description_width:
                    assert len(data) - 1 == description_width, \
                        "inconsistent number of descriptions in %s" % \
                        options.input_filename_descriptions
                else:
                    # width is fixed by the first line that has
                    # at least one description column
                    description_width = len(data) - 1

                if first:
                    description_header = data[1:]
                    first = False
                else:
                    descriptions[data[0]] = data[1:]

        assert len(description_header) == description_width, \
            "number of descriptions (%i) inconsistent with header (%s) in %s" % \
            (description_width, len(description_header),
             options.input_filename_descriptions)

    return description_header, descriptions, description_width
def readAnnotatorResults(filename):
    '''load annotator results from a tab-separated results table.

    Lines starting with ``#`` or with ``track`` are skipped; every other
    line is parsed with ``gat.DummyAnnotatorResult._fromLine``.

    Returns the list of parsed results in file order.
    '''
    skipped_prefixes = ("#", "track")
    with IOTools.openFile(filename, "r") as infile:
        return [gat.DummyAnnotatorResult._fromLine(row)
                for row in infile
                if not row.startswith(skipped_prefixes)]
def run(segments, annotations, workspace, sampler, counters,
        workspace_generator, **kwargs):
    '''run an enrichment analysis.

    segments: an IntervalCollection
    workspace: an IntervalCollection
    annotations: an IntervalCollection

    sampler
       sampler object handed on to the conditional/unconditional
       sampler.

    counters
       sequence of counters; each one is passed to
       ``Engine.computeCounts`` and must provide a ``name`` attribute.

    workspace_generator
       callable ``(segments, annotations, workspace)`` returning the
       triple actually used for a track/annotation pair. Its
       ``is_conditional`` attribute selects the sampler class.

    kwargs recognized are:

    cache
       filename of cache

    num_samples
       number of samples to compute

    output_counts_pattern
       output counts to filename

    output_samples_pattern
       if given, output samples to these files, one per segment

    sample_files
       if given, read samples from these files.

    fdr
       method to compute qvalues

    outfiles
       dictionary of optional additional output files.

    pseudo_count
       pseudo_count to add to observed and expected values

    reference
       data with reference observed and expected values.

    num_threads
       passed on to the sampler (default 0).

    Returns a list of :class:`Engine.AnnotatorResultExtended`.

    Raises ValueError if ``sample_files`` is given without
    ``output_samples_pattern``.
    '''

    # get arguments
    num_samples = kwargs.get("num_samples", 10000)
    cache = kwargs.get("cache", None)
    output_counts_pattern = kwargs.get("output_counts_pattern", None)
    sample_files = kwargs.get("sample_files", [])
    pseudo_count = kwargs.get("pseudo_count", 1.0)
    reference = kwargs.get("reference", None)
    output_samples_pattern = kwargs.get("output_samples_pattern", None)
    outfiles = kwargs.get("outfiles", {})
    num_threads = kwargs.get("num_threads", 0)

    ##################################################
    # computing summary metrics for segments
    if "segment_metrics" in outfiles:
        E.info("computing summary metrics for segments")
        outfile = outfiles["segment_metrics"]
        outfile.write("track\tsection\tmetric\t%s\n" %
                      "\t".join(Stats.Summary().getHeaders()))
        for track in segments.tracks:
            IO.outputMetrics(outfile,
                             segments[track],
                             workspace,
                             track,
                             'segments',
                             )
        E.info("wrote summary metrics for segments to %s" % str(outfile))

    ##################################################
    # collect observed counts from segments
    E.info("collecting observed counts")
    observed_counts = []
    for counter in counters:
        observed_counts.append(Engine.computeCounts(
            counter=counter,
            aggregator=sum,
            segments=segments,
            annotations=annotations,
            workspace=workspace,
            workspace_generator=workspace_generator))

    ##################################################
    # sample and collect counts
    E.info("starting sampling")

    if cache:
        E.info("samples are cached in %s" % cache)
        samples = Engine.SamplesCached(filename=cache)
    elif sample_files:
        if not output_samples_pattern:
            raise ValueError(
                "require output_samples_pattern if loading samples from files")
        # build regex. str.replace is used instead of re.sub because
        # "\S" inside an re.sub replacement template is rejected as a
        # bad escape by current versions of the re module.
        regex = re.compile(output_samples_pattern.replace("%s", r"(\S+)"))
        E.info("loading samples from %i files" % len(sample_files))
        samples = Engine.SamplesFile(
            filenames=sample_files,
            regex=regex)
    else:
        samples = Engine.Samples()

    sampled_counts = {}

    counts = E.Counter()

    ntracks = len(segments.tracks)

    for ntrack, track in enumerate(segments.tracks):

        segs = segments[track]

        E.info("sampling: %s: %i/%i" % (track, ntrack + 1, ntracks))

        if output_samples_pattern and not sample_files:
            # literal substitution: the track name must not be
            # interpreted as a regular expression replacement template
            filename = output_samples_pattern.replace("%s", track)
            E.debug("saving samples to %s" % filename)
            dirname = os.path.dirname(filename)
            if dirname and not os.path.exists(dirname):
                os.makedirs(dirname)
            if filename.endswith(".gz"):
                samples_outfile = gzip.open(filename, "w")
            else:
                samples_outfile = open(filename, "w")
        else:
            samples_outfile = None

        if workspace_generator.is_conditional:
            sampler_class = ConditionalSampler
        else:
            sampler_class = UnconditionalSampler

        try:
            outer_sampler = sampler_class(num_samples,
                                          samples,
                                          samples_outfile,
                                          sampler,
                                          workspace_generator,
                                          counters,
                                          outfiles,
                                          num_threads=num_threads)

            counts_per_track = outer_sampler.sample(
                track, counts, counters, segs, annotations, workspace,
                outfiles)
        finally:
            # close the per-track samples file in every case, also for
            # skipped (empty) tracks and when sampling fails
            if samples_outfile:
                samples_outfile.close()

        # skip empty tracks
        if counts_per_track is None:
            continue

        sampled_counts[track] = counts_per_track

        # old code, refactor into loop to save samples
        # (disabled block kept as found)
        if 0:
            E.info("sampling stats: %s" % str(counts))
            if track not in samples:
                E.warn("no samples for track %s" % track)
                continue

            # clean up samples
            del samples[track]

    E.info("sampling finished")

    ##################################################
    # build annotator results
    E.info("computing PValue statistics")

    annotator_results = list()
    # counter_id indexes the per-counter list stored for each track
    for counter_id, (counter, observed_count) in enumerate(
            zip(counters, observed_counts)):
        for track, r in observed_count.items():
            for annotation, observed in r.items():
                temp_segs, temp_annos, temp_workspace = workspace_generator(
                    segments[track],
                    annotations[annotation],
                    workspace)

                # ignore empty results
                if temp_workspace.sum() == 0:
                    continue

                # if reference is given, p-value will indicate difference
                # The test that track and annotation are present is done
                # elsewhere
                if reference:
                    ref = reference[track][annotation]
                else:
                    ref = None

                annotator_results.append(Engine.AnnotatorResultExtended(
                    track=track,
                    annotation=annotation,
                    counter=counter.name,
                    observed=observed,
                    samples=sampled_counts[track][counter_id][annotation],
                    track_segments=temp_segs,
                    annotation_segments=temp_annos,
                    workspace=temp_workspace,
                    reference=ref,
                    pseudo_count=pseudo_count))

    ##################################################
    # dump (large) table with counts
    if output_counts_pattern:
        for counter in counters:
            name = counter.name
            filename = output_counts_pattern.replace("%s", name)

            E.info("writing counts to %s" % filename)
            output = [x for x in annotator_results if x.counter == name]
            # context manager so that the table is flushed and closed
            with IOTools.openFile(filename, "w") as outfile:
                outfile.write("track\tannotation\tobserved\tcounts\n")
                for o in output:
                    outfile.write("%s\t%s\t%i\t%s\n" %
                                  (o.track, o.annotation,
                                   o.observed,
                                   ",".join(["%i" % x for x in o.samples])))

    return annotator_results
def _writeLocked(target, lock, writer):
    '''call ``writer(handle)`` for an output target.

    Without a lock, *target* is used as the handle. With a lock, *target*
    is a file name that is opened in append mode while the lock is held;
    handle and lock are released even if *writer* raises.
    '''
    if not lock:
        writer(target)
        return

    lock.acquire()
    try:
        outf = IOTools.openFile(target, "a")
        try:
            writer(outf)
        finally:
            outf.close()
    finally:
        lock.release()


def computeSample(args):
    '''compute a single sample.

    *args* is the tuple ``(workdata, samples_outfile, metrics_outfile,
    lock)`` where *workdata* is ``(track, sample_id, sampler, segs,
    annotations, contig_annotations, workspace, contig_workspace,
    counters)``.

    If *lock* is set, *samples_outfile* and *metrics_outfile* are file
    names that are opened in append mode while the lock is held;
    otherwise they are used directly as output handles. Either may be
    empty to disable the corresponding output.

    Returns a list with one dictionary per counter, mapping annotation
    to the count summed over all contigs of the sample.
    '''

    workdata, samples_outfile, metrics_outfile, lock = args
    (track,
     sample_id,
     sampler,
     segs,
     annotations,
     contig_annotations,
     workspace,
     contig_workspace,
     counters) = workdata

    counts = E.Counter()

    sample_id = str(sample_id)

    if samples_outfile:
        # The header has to go to the opened handle: with a lock,
        # samples_outfile is only a file name and has no write method.
        _writeLocked(
            samples_outfile, lock,
            lambda outf: outf.write("track name=%s\n" % sample_id))

    sample = Engine.IntervalDictionary()

    for isochore in list(segs.keys()):

        counts.pairs += 1

        # skip empty isochores
        if workspace[isochore].isEmpty or segs[isochore].isEmpty:
            counts.skipped += 1
            continue

        counts.sampled += 1
        r = sampler.sample(segs[isochore], workspace[isochore])

        # TODO : activate
        # self.outputSampleStats( sample_id, isochore, r )

        sample.add(isochore, r)

        # save sample
        if samples_outfile:
            def write_intervals(outf, isochore=isochore, intervals=r):
                for start, end in intervals:
                    outf.write("%s\t%i\t%i\n" % (isochore, start, end))

            _writeLocked(samples_outfile, lock, write_intervals)

    # re-combine isochores
    # adjacent intervals are merged.
    sample.fromIsochores()

    if metrics_outfile:
        _writeLocked(
            metrics_outfile, lock,
            lambda outf: IO.outputMetrics(
                outf, sample, workspace, track, sample_id))

    counts_per_track = [collections.defaultdict(float) for x in counters]
    # compute counts for each counter
    for counter_id, counter in enumerate(counters):
        # TODO: choose aggregator
        for annotation in annotations.tracks:
            counts_per_track[counter_id][annotation] = sum(
                counter(sample[contig],
                        contig_annotations[annotation][contig],
                        contig_workspace[contig])
                for contig in list(sample.keys()))

    return counts_per_track
def expandGlobs(infiles):
    '''expand every glob pattern in *infiles*.

    Each pattern is expanded with ``glob.glob`` and the per-pattern
    lists are combined with ``IOTools.flatten``.
    '''
    matches_per_pattern = []
    for pattern in infiles:
        matches_per_pattern.append(glob.glob(pattern))
    return IOTools.flatten(matches_per_pattern)
def outputResults(results, options, header, description_header,
                  description_width, descriptions, format_observed="%i"):
    '''compute FDR and output results.

    Q-values are computed from the p-values of all *results* and stored
    on each result. One table is written per counter: to
    ``options.stdout`` if there is a single counter, otherwise to the
    file obtained by substituting the counter name into
    ``options.output_tables_pattern``. Rows are ordered according to
    ``options.output_order``; if *descriptions* is non-empty, the
    descriptions of the annotation are appended to each row.

    Raises ValueError for an unknown ``options.output_order``.
    '''

    pvalues = [entry.pvalue for entry in results]

    # compute global fdr
    E.info("computing FDR statistics")
    qvalues = Engine.getQValues(pvalues,
                                method=options.qvalue_method,
                                vlambda=options.qvalue_lambda,
                                pi0_method=options.qvalue_pi0_method)

    try:
        results = [entry._replace(qvalue=q)
                   for entry, q in zip(results, qvalues)]
    except AttributeError:
        # not a namedtuple
        is_tuple = False
        for entry, q in zip(results, qvalues):
            entry.qvalue = q
            entry.format_observed = format_observed
    else:
        is_tuple = True

    # sort key for every supported output order
    sort_keys = {
        "track": lambda x: (x.track, x.annotation),
        "observed": lambda x: x.observed,
        "annotation": lambda x: (x.annotation, x.track),
        "fold": lambda x: x.fold,
        "pvalue": lambda x: x.pvalue,
        "qvalue": lambda x: x.qvalue,
    }

    counters = {entry.counter for entry in results}
    single_counter = len(counters) == 1

    for counter in counters:

        if single_counter:
            outfile, output = options.stdout, results
        else:
            outfilename = re.sub("%s", counter, options.output_tables_pattern)
            E.info("output for counter %s goes to outfile %s" %
                   (counter, outfilename))
            outfile = IOTools.openFile(outfilename, "w")
            output = [entry for entry in results if entry.counter == counter]

        columns = list(header) + list(description_header)
        outfile.write("\t".join(columns) + "\n")

        sort_key = sort_keys.get(options.output_order)
        if sort_key is None:
            raise ValueError("unknown sort order %s" % options.output_order)
        output.sort(key=sort_key)

        for result in output:
            row = "\t".join(map(str, result)) if is_tuple else str(result)
            outfile.write(row)

            if descriptions:
                try:
                    outfile.write(
                        "\t" + "\t".join(descriptions[result.annotation]))
                except KeyError:
                    # no description known: pad with empty columns
                    outfile.write("\t" + "\t".join([""] * description_width))
            outfile.write("\n")

        if outfile != options.stdout:
            outfile.close()
def outputResults(results, options, header, description_header,
                  description_width, descriptions, format_observed="%i"):
    '''compute FDR and output results.

    Q-values are derived from the p-values of all *results* via
    ``GatEngine.getQValues`` and attached to every result. For each
    counter a table is written, either to ``options.stdout`` (single
    counter) or to the file named by ``options.output_tables_pattern``
    with the counter name substituted. ``options.output_order`` selects
    the row order. If *descriptions* is non-empty, each row is extended
    by the descriptions of its annotation.

    Raises ValueError for an unknown ``options.output_order``.
    '''

    pvalues = [res.pvalue for res in results]

    # compute global fdr
    E.info("computing FDR statistics")
    qvalues = GatEngine.getQValues(
        pvalues,
        method=options.qvalue_method,
        vlambda=options.qvalue_lambda,
        pi0_method=options.qvalue_pi0_method)

    is_tuple = True
    try:
        results = [res._replace(qvalue=qval)
                   for res, qval in zip(results, qvalues)]
    except AttributeError:
        # not a namedtuple
        for res, qval in zip(results, qvalues):
            res.qvalue = qval
            res.format_observed = format_observed
        is_tuple = False

    # row ordering functions by name of the output order
    orderings = {
        "track": lambda x: (x.track, x.annotation),
        "observed": lambda x: x.observed,
        "annotation": lambda x: (x.annotation, x.track),
        "fold": lambda x: x.fold,
        "pvalue": lambda x: x.pvalue,
        "qvalue": lambda x: x.qvalue,
    }

    counters = {res.counter for res in results}

    for counter in counters:

        if len(counters) != 1:
            outfilename = re.sub("%s", counter, options.output_tables_pattern)
            E.info("output for counter %s goes to outfile %s" %
                   (counter, outfilename))
            outfile = IOTools.openFile(outfilename, "w")
            output = [res for res in results if res.counter == counter]
        else:
            outfile = options.stdout
            output = results

        title = "\t".join(list(header) + list(description_header))
        outfile.write(title + "\n")

        ordering = orderings.get(options.output_order)
        if ordering is None:
            raise ValueError("unknown sort order %s" % options.output_order)
        output.sort(key=ordering)

        for result in output:
            if is_tuple:
                line = "\t".join(map(str, result))
            else:
                line = str(result)
            outfile.write(line)

            if descriptions:
                try:
                    outfile.write(
                        "\t" + "\t".join(descriptions[result.annotation]))
                except KeyError:
                    # annotation without description: emit empty columns
                    outfile.write("\t" + "\t".join([""] * description_width))
            outfile.write("\n")

        if outfile != options.stdout:
            outfile.close()
def main(argv):
    '''script entry point: gene set enrichment of segments.

    Reads segments and workspaces, a bed file with gene locations and a
    table mapping genes to annotations. For every segment track and
    every annotation, the genes overlapped by the segments are counted
    and a hypergeometric p-value is computed. Results are written with
    ``IO.outputResults``.

    argv
       command line arguments; ``sys.argv`` is used if empty.
    '''

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = optparse.OptionParser(version="%prog version: $Id: script_template.py 2871 2010-03-03 10:20:44Z andreas $",
                                   usage=globals()["__doc__"])

    parser.add_option("-a", "--gene-file", "--annotations", dest="annotation_files", type="string", action="append",
                      help="filename with annotations - here, location of genes [default=%default].")

    parser.add_option("-s", "--segment-file", "--segments", dest="segment_files", type="string", action="append",
                      help="filename with segments. Also accepts a glob in parentheses [default=%default].")

    parser.add_option("-w", "--workspace-file", "--workspace", dest="workspace_files", type="string", action="append",
                      help="filename with workspace segments. Also accepts a glob in parentheses [default=%default].")

    parser.add_option("-g", "--number-of-genes", dest="number_of_genes", type="int",
                      help="total number of genes [default=%default]")

    parser.add_option("-m", "--annotation-file", dest="annotation_file", type="string",
                      help="filename mapping genes to annotations [default=%default]")

    parser.add_option("-o", "--order", dest="output_order", type="choice",
                      choices=(
                          "track", "annotation", "fold", "pvalue", "qvalue"),
                      help="order results in output by fold, track, etc. [default=%default].")

    parser.add_option("-q", "--qvalue-method", dest="qvalue_method", type="choice",
                      choices=(
                          "storey", "BH", "bonferroni", "holm", "hommel", "hochberg", "BY", "none"),
                      help="method to perform multiple testing correction by controlling the fdr [default=%default].")

    parser.add_option("--qvalue-lambda", dest="qvalue_lambda", type="float",
                      help="fdr computation: lambda [default=%default].")

    parser.add_option("--qvalue-pi0-method", dest="qvalue_pi0_method", type="choice",
                      choices=("smoother", "bootstrap"),
                      help="fdr computation: method for estimating pi0 [default=%default].")

    parser.add_option("--descriptions", dest="input_filename_descriptions", type="string",
                      help="filename mapping annotation terms to descriptions. "
                      " if given, the output table will contain additional columns "
                      " [default=%default]")

    parser.add_option("--ignore-segment-tracks", dest="ignore_segment_tracks", action="store_true",
                      help="ignore segment tracks - all segments belong to one track [default=%default]")

    parser.add_option("--enable-split-tracks", dest="enable_split_tracks", action="store_true",
                      help="permit the same track to be in multiple files [default=%default]")

    parser.add_option("--output-bed", dest="output_bed", type="choice", action="append",
                      choices=("all",
                               "annotations", "segments",
                               "workspaces", "isochores",
                               "overlap"),
                      help="output bed files [default=%default].")

    parser.add_option("--output-stats", dest="output_stats", type="choice", action="append",
                      choices=("all",
                               "annotations", "segments",
                               "workspaces", "isochores",
                               "overlap"),
                      help="output overlap summary stats [default=%default].")

    parser.set_defaults(
        annotation_files=[],
        segment_files=[],
        workspace_files=[],
        sample_files=[],
        annotation_file=None,
        num_samples=1000,
        nbuckets=100000,
        bucket_size=1,
        counter="nucleotide-overlap",
        output_stats=[],
        output_bed=[],
        output_filename_counts=None,
        output_order="fold",
        cache=None,
        input_filename_counts=None,
        input_filename_results=None,
        pvalue_method="empirical",
        output_plots_pattern=None,
        output_samples_pattern=None,
        qvalue_method="storey",
        qvalue_lambda=None,
        qvalue_pi0_method="smoother",
        sampler="annotator",
        ignore_segment_tracks=False,
        input_filename_descriptions=None,
        conditional="unconditional",
        conditional_extension=None,
        conditional_expansion=None,
        restrict_workspace=False,
        enable_split_tracks=False,
        shift_expansion=2.0,
        shift_extension=0,
        overlap_mode="midpoint",
        number_of_genes=None,
    )

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv, add_output_options=True)

    tstart = time.time()

    # load segments
    options.segment_files = IO.expandGlobs(options.segment_files)
    options.annotation_files = IO.expandGlobs(options.annotation_files)
    options.workspace_files = IO.expandGlobs(options.workspace_files)

    # read one or more segment files
    segments = IO.readSegmentList("segments", options.segment_files, options)
    if options.ignore_segment_tracks:
        segments.merge(delete=True)
        E.info("merged all segments into one track with %i segments" %
               len(segments))

    if len(segments) > 1000:
        raise ValueError(
            "too many (%i) segment files - use track definitions or --ignore-segment-tracks" % len(segments))

    # load workspace
    workspaces = IO.readSegmentList(
        "workspaces", options.workspace_files, options, options.enable_split_tracks)

    # intersect workspaces to build a single workspace
    E.info("collapsing workspaces")
    workspaces.collapse()

    # use merged workspace only, discard others
    workspaces.restrict("collapsed")
    workspace = workspaces["collapsed"]

    E.info("intervals loaded in %i seconds" % (time.time() - tstart))

    ############################################
    # load table mapping a gene id to annotations
    with IOTools.openFile(options.annotation_file) as inf:
        gene2annotations = IOTools.readMultiMap(inf, has_header=True)
    annotations = set([y for x in gene2annotations.values() for y in x])
    # argument order matches the message: annotations first, then genes
    E.info("loaded %i annotations for %i genes" %
           (len(annotations), len(gene2annotations)))

    ############################################
    # load bed file with gene coordinates
    assert len(options.annotation_files) == 1
    indexed_genes = collections.defaultdict(Intersecter)
    total_genes = 0
    # number of genes per contig
    contig2ngenes = collections.defaultdict(int)
    # compute number of genes with a particular annotation
    # per contig
    annotation2ngenes = collections.defaultdict(int)
    with IOTools.openFile(options.annotation_files[0]) as inf:
        for line in inf:
            if line.startswith("#"):
                continue
            contig, start, end, gene_id = line[:-1].split("\t")[:4]
            indexed_genes[contig].add_interval(
                Interval(int(start), int(end), gene_id))
            contig2ngenes[contig] += 1
            total_genes += 1
            try:
                for annotation in gene2annotations[gene_id]:
                    annotation2ngenes[annotation] += 1
            except KeyError:
                # gene without annotations
                pass
    E.info("indexed locations for %i contigs" % len(indexed_genes))

    ############################################
    description_header, descriptions, description_width = IO.readDescriptions(
        options)

    ############################################
    # compute results
    E.info("computing counts")

    results = []
    # iterate over segments
    for segment, segmentdict in segments.iteritems():

        # genes hit by segments per annotation
        genes_hit_by_segments_with_annotations = collections.defaultdict(int)

        # genes hit by segments
        # NOTE(review): a gene overlapped by several intervals is counted
        # once per overlap - confirm that this is intended.
        genes_hit_by_segments = 0

        for contig, ss in segmentdict.iteritems():
            for start, end in ss:
                overlapping_genes = list(
                    indexed_genes[contig].find(start, end))
                genes_hit_by_segments += len(overlapping_genes)
                for x in overlapping_genes:
                    gene_id = x.value
                    try:
                        for annotation in gene2annotations[gene_id]:
                            genes_hit_by_segments_with_annotations[
                                annotation] += 1
                    except KeyError:
                        # gene without annotations
                        pass

        # N = number of genes in genome
        N = total_genes
        # n = number of genes selected by segments
        n = genes_hit_by_segments

        for annotation in annotations:
            # K = number of genes carrying annotation
            K = annotation2ngenes[annotation]
            # k = number of genes selected by segments and with annotation
            k = genes_hit_by_segments_with_annotations[annotation]

            if n == 0 or N == 0 or K == 0:
                expected = 0
                fold = 1.0
                pvalue = 1.0
            else:
                expected = float(n * K) / N
                fold = k / expected
                # P(X >= k) for the hypergeometric distribution
                pvalue = scipy.stats.hypergeom.sf(k - 1, N, K, n)

            # the q-value is set to 1.0 here and filled in on output
            r = GENESET_RESULT._make((
                segment, annotation, N, K, n, k, expected, fold, pvalue, 1.0))

            results.append(r)

    IO.outputResults(results,
                     options,
                     GENESET_RESULT._fields,
                     description_header,
                     description_width,
                     descriptions)

    E.Stop()