def fromCounts(filename):
    '''build annotator results from a tab-separated table with counts.

    The file must begin with the exact header line
    ``track<tab>annotation<tab>observed<tab>counts`` followed by one row
    per (track, annotation) pair, where *counts* is a comma-separated
    list of sampled counts.

    :param filename: path to the counts table (may be compressed,
        anything IOTools.openFile understands).
    :return: list of Engine.AnnotatorResult objects with counter "na".
    :raises ValueError: if the header line does not match.
    '''
    annotator_results = []

    with IOTools.openFile(filename, "r") as infile:
        E.info("loading data")

        header = infile.readline()
        if not header == "track\tannotation\tobserved\tcounts\n":
            # BUG FIX: report the filename, not the file object repr
            raise ValueError("%s not a counts file: got %s" %
                             (filename, header))

        for line in infile:
            track, annotation, observed, counts = line[:-1].split("\t")
            # BUG FIX: numpy.float was deprecated in NumPy 1.20 and
            # removed in 1.24; the builtin float is the documented
            # drop-in replacement (same float64 dtype).
            samples = numpy.array(
                list(map(float, counts.split(","))), dtype=float)
            observed = float(observed)
            annotator_results.append(Engine.AnnotatorResult(
                track=track,
                annotation=annotation,
                counter="na",
                observed=observed,
                samples=samples))

    return annotator_results
def readSegmentList(label, filenames, enable_split_tracks=False, ignore_tracks=False):
    """Load one or more :term:`bed` files into an IntervalCollection.

    Arguments
    ---------
    label : string
        Name given to the returned IntervalCollection.
    filenames : list
        Filenames of the segment files to load.
    enable_split_tracks : bool
        If True, a track may be split across multiple files.
    ignore_tracks : bool
        If True, track information inside the files is ignored.

    Returns
    -------
    segments : IntervalCollection
        The loaded segment collection.
    """
    nfiles = len(filenames)
    collection = Engine.IntervalCollection(name=label)

    E.info("%s: reading tracks from %i files" % (label, nfiles))
    collection.load(filenames,
                    allow_multiple=enable_split_tracks,
                    ignore_tracks=ignore_tracks)
    E.info("%s: read %i tracks from %i files" %
           (label, len(collection), nfiles))

    return collection
def setUp(self):
    """Prepare a single-track analysis fixture.

    Builds default command-line options, points them at the fixture
    files, loads segments/annotations/workspace, and sets up the
    sampler, counter, workspace generator and reference data used by
    the tests.
    """
    # default options, exactly as the command line parser would produce
    options, args = gat.buildParser().parse_args([])
    options.segment_files = self.filename_segments
    options.annotation_files = self.filename_annotations
    options.workspace_files = self.filename_workspace

    self.segments, self.annotations, workspaces, isochores = \
        gat.IO.buildSegments(options)
    self.workspace = gat.IO.applyIsochores(
        self.segments, self.annotations, workspaces, options, isochores)

    self.sampler = Engine.SamplerAnnotator(bucket_size=1, nbuckets=100000)
    self.counters = [Engine.CounterNucleotideOverlap()]
    self.workspace_generator = Engine.UnconditionalWorkspace()

    # pre-computed expected output to compare results against
    self.reference_data = gat.IO.readAnnotatorResults(
        'data/output_single.tsv')
def run(segments, annotations, workspace, sampler, counters, workspace_generator, **kwargs):
    '''run an enrichment analysis.

    segments: an IntervalCollection
    workspace: an IntervalCollection
    annotations: an IntervalCollection

    kwargs recognized are:

    cache
       filename of cache

    num_samples
       number of samples to compute

    output_counts_pattern
       output counts to filename

    output_samples_pattern
       if given, output samples to these files, one per segment

    sample_files
       if given, read samples from these files.

    fdr
       method to compute qvalues

    outfiles
       dictionary of optional additional output files.

    pseudo_count
       pseudo_count to add to observed and expected values

    reference
       data with reference observed and expected values.
    '''

    # get arguments
    num_samples = kwargs.get("num_samples", 10000)
    cache = kwargs.get("cache", None)
    output_counts_pattern = kwargs.get("output_counts_pattern", None)
    sample_files = kwargs.get("sample_files", [])
    pseudo_count = kwargs.get("pseudo_count", 1.0)
    reference = kwargs.get("reference", None)
    output_samples_pattern = kwargs.get("output_samples_pattern", None)
    outfiles = kwargs.get("outfiles", {})
    num_threads = kwargs.get("num_threads", 0)

    ##################################################
    ##################################################
    ##################################################
    # computing summary metrics for segments
    if "segment_metrics" in outfiles:
        E.info("computing summary metrics for segments")
        outfile = outfiles["segment_metrics"]
        outfile.write("track\tsection\tmetric\t%s\n" %
                      "\t".join(Stats.Summary().getHeaders()))
        for track in segments.tracks:
            IO.outputMetrics(outfile,
                             segments[track],
                             workspace,
                             track,
                             'segments',
                             )
        E.info("wrote summary metrics for segments to %s" % str(outfile))

    ##################################################
    ##################################################
    ##################################################
    # collect observed counts from segments
    # one counts dictionary per counter, in the same order as *counters*
    E.info("collecting observed counts")
    observed_counts = []
    for counter in counters:
        observed_counts.append(Engine.computeCounts(
            counter=counter,
            aggregator=sum,
            segments=segments,
            annotations=annotations,
            workspace=workspace,
            workspace_generator=workspace_generator))

    ##################################################
    ##################################################
    ##################################################
    # sample and collect counts
    ##################################################
    E.info("starting sampling")

    # choose the sample container: cached on disk, pre-computed from
    # files, or freshly sampled in memory
    if cache:
        E.info("samples are cached in %s" % cache)
        samples = Engine.SamplesCached(filename=cache)
    elif sample_files:
        if not output_samples_pattern:
            raise ValueError(
                "require output_samples_pattern if loading samples from files")
        # build regex to recover the track name from each sample filename
        # NOTE(review): "(\S+)" is not a raw string; works today but
        # emits a DeprecationWarning on newer Pythons — consider r"(\S+)"
        regex = re.compile(re.sub("%s", "(\S+)", output_samples_pattern))
        E.info("loading samples from %i files" % len(sample_files))
        samples = Engine.SamplesFile(
            filenames=sample_files,
            regex=regex)
    else:
        samples = Engine.Samples()

    sampled_counts = {}

    counts = E.Counter()

    ntracks = len(segments.tracks)

    for ntrack, track in enumerate(segments.tracks):

        segs = segments[track]

        E.info("sampling: %s: %i/%i" % (track, ntrack + 1, ntracks))

        # open a per-track output file for the samples, unless samples
        # are being read from files instead of generated
        if output_samples_pattern and not sample_files:
            filename = re.sub("%s", track, output_samples_pattern)

            E.debug("saving samples to %s" % filename)
            dirname = os.path.dirname(filename)
            if dirname and not os.path.exists(dirname):
                os.makedirs(dirname)
            if filename.endswith(".gz"):
                # NOTE(review): gzip.open(..., "w") is binary mode under
                # Python 3; downstream str writes would need "wt" — confirm
                samples_outfile = gzip.open(filename, "w")
            else:
                samples_outfile = open(filename, "w")
        else:
            samples_outfile = None

        # conditional workspaces need per-annotation resampling; the
        # outer sampler encapsulates that difference
        if workspace_generator.is_conditional:
            outer_sampler = ConditionalSampler(num_samples,
                                               samples,
                                               samples_outfile,
                                               sampler,
                                               workspace_generator,
                                               counters,
                                               outfiles,
                                               num_threads=num_threads)
        else:
            outer_sampler = UnconditionalSampler(num_samples,
                                                 samples,
                                                 samples_outfile,
                                                 sampler,
                                                 workspace_generator,
                                                 counters,
                                                 outfiles,
                                                 num_threads=num_threads)

        counts_per_track = outer_sampler.sample(
            track, counts, counters, segs, annotations, workspace, outfiles)

        # skip empty tracks
        if counts_per_track is None:
            continue

        if samples_outfile:
            samples_outfile.close()

        sampled_counts[track] = counts_per_track

        # old code, refactor into loop to save samples
        # (dead branch, kept for reference)
        if 0:
            E.info("sampling stats: %s" % str(counts))
            if track not in samples:
                E.warn("no samples for track %s" % track)
                continue

            # clean up samples
            del samples[track]

    E.info("sampling finished")

    # build annotator results
    E.info("computing PValue statistics")

    annotator_results = list()
    counter_id = 0
    for counter, observed_count in zip(counters, observed_counts):
        for track, r in observed_count.items():
            for annotation, observed in r.items():
                # restrict segments/annotations/workspace to the
                # (track, annotation) specific workspace
                temp_segs, temp_annos, temp_workspace = workspace_generator(
                    segments[track],
                    annotations[annotation],
                    workspace)

                # ignore empty results
                if temp_workspace.sum() == 0:
                    continue

                # if reference is given, p-value will indicate difference
                # The test that track and annotation are present is done
                # elsewhere
                if reference:
                    ref = reference[track][annotation]
                else:
                    ref = None

                annotator_results.append(Engine.AnnotatorResultExtended(
                    track=track,
                    annotation=annotation,
                    counter=counter.name,
                    observed=observed,
                    samples=sampled_counts[track][counter_id][annotation],
                    track_segments=temp_segs,
                    annotation_segments=temp_annos,
                    workspace=temp_workspace,
                    reference=ref,
                    pseudo_count=pseudo_count))
        counter_id += 1

    # dump (large) table with counts
    if output_counts_pattern:
        for counter in counters:
            name = counter.name
            filename = re.sub("%s", name, output_counts_pattern)

            E.info("writing counts to %s" % filename)
            output = [x for x in annotator_results if x.counter == name]
            outfile = IOTools.openFile(filename, "w")
            outfile.write("track\tannotation\tobserved\tcounts\n")

            for o in output:
                outfile.write("%s\t%s\t%i\t%s\n" %
                              (o.track, o.annotation, o.observed,
                               ",".join(["%i" % x for x in o.samples])))

    return annotator_results
def computeSample(args):
    '''compute a single sample.

    Worker function, suitable for use in a multiprocessing pool.

    :param args: tuple of (workdata, samples_outfile, metrics_outfile,
        lock).  *workdata* bundles track, sample id, sampler, segments,
        annotations (global and per-contig), workspace (global and
        per-contig) and the list of counters.  When *lock* is given,
        the two outfile arguments are filenames that are re-opened in
        append mode under the lock; otherwise they are open file
        handles (or None).
    :return: list with one dict per counter mapping annotation ->
        aggregated count over all contigs of this sample.
    '''

    workdata, samples_outfile, metrics_outfile, lock = args

    (track, sample_id, sampler,
     segs, annotations, contig_annotations,
     workspace, contig_workspace,
     counters) = workdata

    # E.debug("track=%s, sample=%s - started" % (track, str(sample_id)))

    counts = E.Counter()

    sample_id = str(sample_id)

    outf_samples = samples_outfile

    if samples_outfile:
        if lock:
            lock.acquire()
            outf_samples = IOTools.openFile(samples_outfile, "a")

        # BUG FIX: the track header must go through outf_samples.  With a
        # lock, samples_outfile is a *filename string* (re-opened above),
        # so writing to it directly raised AttributeError.
        outf_samples.write("track name=%s\n" % sample_id)

        if lock:
            outf_samples.close()
            lock.release()

    sample = Engine.IntervalDictionary()

    for isochore in list(segs.keys()):

        counts.pairs += 1

        # skip empty isochores
        if workspace[isochore].isEmpty or segs[isochore].isEmpty:
            counts.skipped += 1
            continue

        counts.sampled += 1
        r = sampler.sample(segs[isochore], workspace[isochore])

        # TODO : activate
        # self.outputSampleStats( sample_id, isochore, r )

        sample.add(isochore, r)

        # save sample
        if samples_outfile:
            if lock:
                lock.acquire()
                outf_samples = IOTools.openFile(samples_outfile, "a")

            for start, end in r:
                outf_samples.write("%s\t%i\t%i\n" % (isochore, start, end))

            if lock:
                outf_samples.close()
                lock.release()

    # re-combine isochores
    # adjacent intervals are merged.
    sample.fromIsochores()

    if metrics_outfile:
        if lock:
            lock.acquire()
            outf = IOTools.openFile(metrics_outfile, "a")
        else:
            outf = metrics_outfile

        IO.outputMetrics(outf, sample, workspace, track, sample_id)

        if lock:
            outf.close()
            lock.release()

    counts_per_track = [collections.defaultdict(float) for x in counters]
    # compute counts for each counter
    for counter_id, counter in enumerate(counters):
        # TODO: choose aggregator
        for annotation in annotations.tracks:
            counts_per_track[counter_id][annotation] = sum([
                counter(sample[contig],
                        contig_annotations[annotation][contig],
                        contig_workspace[contig])
                for contig in list(sample.keys())])

    # E.debug("track=%s, sample=%s - completed" % (track, str(sample_id)))

    return counts_per_track
def buildSegments(options):
    '''load segments, annotations and workspace from parameters
    defined in *options*.

    The workspace will be split by isochores.

    :return: tuple of (segments, annotations, workspaces, isochores);
        *isochores* is None unless isochore files were given.
    '''

    options.segment_files = expandGlobs(options.segment_files)
    options.annotation_files = expandGlobs(options.annotation_files)
    options.workspace_files = expandGlobs(options.workspace_files)
    options.sample_files = expandGlobs(options.sample_files)

    ##################################################
    # arguments sanity check
    if not options.segment_files:
        raise ValueError("please specify at least one segment file")
    if not options.annotation_files:
        raise ValueError("please specify at least one annotation file")
    if not options.workspace_files:
        raise ValueError("please specify at least one workspace file")

    # read one or more segment files
    segments = readSegmentList("segments",
                               options.segment_files,
                               ignore_tracks=options.ignore_segment_tracks)
    segments.normalize()

    if segments.sum() == 0:
        E.critical("no segments in input file - run aborted")
        raise ValueError("segments file is empty - run aborted")

    if len(segments) > 1000:
        raise ValueError("too many (%i) segment files - use track definitions "
                         "or --ignore-segment-tracks" % len(segments))

    annotations = readSegmentList(
        "annotations", options.annotation_files,
        enable_split_tracks=options.enable_split_tracks,
        ignore_tracks=options.annotations_label is not None)

    if options.annotations_label is not None:
        annotations.setName(options.annotations_label)

    if options.annotations_to_points:
        annotations.toPositions(options.annotations_to_points)

    if options.overlapping_annotations:
        # only sort, do not merge
        annotations.sort()
    else:
        annotations.normalize()

    # BUG FIX: the previous call passed *options* positionally into
    # enable_split_tracks (always truthy) and the split-tracks flag into
    # ignore_tracks; pass the flag by keyword to match readSegmentList's
    # signature.
    workspaces = readSegmentList(
        "workspaces", options.workspace_files,
        enable_split_tracks=options.enable_split_tracks)
    workspaces.normalize()

    # intersect workspaces to build a single workspace
    E.info("collapsing workspaces")
    dumpStats(workspaces, "stats_workspaces_input", options)
    workspaces.collapse()
    dumpStats(workspaces, "stats_workspaces_collapsed", options)

    # use merged workspace only, discard others
    workspaces.restrict("collapsed")

    # build isochores or intersect annotations/segments with workspace
    if options.isochore_files:

        # read one or more isochore files
        isochores = Engine.IntervalCollection(name="isochores")
        E.info("%s: reading isochores from %i files" %
               ("isochores", len(options.isochore_files)))
        isochores.load(options.isochore_files)
        dumpStats(isochores, "stats_isochores_raw", options)

        # merge isochores and check if consistent (fully normalized)
        isochores.sort()

        # check that there are no overlapping segments within isochores
        isochores.check()

        # TODO: flag is_normalized not properly set
        isochores.normalize()

        # check that there are no overlapping segments between isochores

        # truncate isochores to workspace
        # crucial if isochores are larger than workspace.
        isochores.intersect(workspaces["collapsed"])

    else:
        isochores = None

    return segments, annotations, workspaces, isochores
def outputResults(results, options, header, description_header, description_width, descriptions, format_observed="%i"):
    '''compute FDR and output results.

    Q-values are computed globally over *results*, then one table per
    counter is written: to stdout if there is a single counter,
    otherwise to files derived from options.output_tables_pattern.
    Each row is the result fields followed by optional description
    columns, ordered according to options.output_order.
    '''

    pvalues = [x.pvalue for x in results]

    ##################################################
    ##################################################
    ##################################################
    # compute global fdr
    ##################################################
    E.info("computing FDR statistics")
    qvalues = Engine.getQValues(pvalues,
                                method=options.qvalue_method,
                                vlambda=options.qvalue_lambda,
                                pi0_method=options.qvalue_pi0_method)

    # attach q-values to the results: namedtuples are immutable and are
    # rebuilt via _replace; other result objects are annotated in place
    # (only the mutable path records format_observed)
    try:
        results = [x._replace(qvalue=qvalue)
                   for x, qvalue in zip(results, qvalues)]
        is_tuple = True
    except AttributeError:
        # not a namedtuple
        for x, qvalue in zip(results, qvalues):
            x.qvalue = qvalue
            x.format_observed = format_observed
        is_tuple = False

    counters = set([x.counter for x in results])

    for counter in counters:

        if len(counters) == 1:
            # single counter: write everything to stdout
            outfile = options.stdout
            output = results
        else:
            # one output table per counter
            outfilename = re.sub("%s", counter, options.output_tables_pattern)
            E.info("output for counter %s goes to outfile %s" %
                   (counter, outfilename))
            outfile = IOTools.openFile(outfilename, "w")
            output = [x for x in results if x.counter == counter]

        outfile.write("\t".join(list(header) + list(description_header)) + "\n")

        # sort in place according to the requested output order
        if options.output_order == "track":
            output.sort(key=lambda x: (x.track, x.annotation))
        elif options.output_order == "observed":
            output.sort(key=lambda x: x.observed)
        elif options.output_order == "annotation":
            output.sort(key=lambda x: (x.annotation, x.track))
        elif options.output_order == "fold":
            output.sort(key=lambda x: x.fold)
        elif options.output_order == "pvalue":
            output.sort(key=lambda x: x.pvalue)
        elif options.output_order == "qvalue":
            output.sort(key=lambda x: x.qvalue)
        else:
            raise ValueError("unknown sort order %s" % options.output_order)

        for result in output:
            if is_tuple:
                outfile.write("\t".join(map(str, result)))
            else:
                # relies on the result object's __str__ for formatting
                outfile.write(str(result))

            if descriptions:
                # append description columns; pad with empties when the
                # annotation has no description entry
                try:
                    outfile.write(
                        "\t" + "\t".join(descriptions[result.annotation]))
                except KeyError:
                    outfile.write("\t" + "\t".join([""] * description_width))

            outfile.write("\n")

        # do not close the shared stdout handle
        if outfile != options.stdout:
            outfile.close()
def outputResults(results, options, header, description_header, description_width, descriptions, format_observed="%i"):
    '''compute FDR and output results.

    Computes q-values globally over *results* and writes one table per
    counter (stdout for a single counter, otherwise files named via
    options.output_tables_pattern).
    '''
    # NOTE(review): this module defines outputResults twice with
    # identical behavior; the later definition shadows the earlier one.

    ##################################################
    # compute global fdr
    ##################################################
    E.info("computing FDR statistics")
    qvalues = Engine.getQValues([r.pvalue for r in results],
                                method=options.qvalue_method,
                                vlambda=options.qvalue_lambda,
                                pi0_method=options.qvalue_pi0_method)

    try:
        # namedtuple results are immutable: rebuild with qvalue set
        results = [r._replace(qvalue=q)
                   for r, q in zip(results, qvalues)]
        is_tuple = True
    except AttributeError:
        # mutable result objects: annotate in place
        for r, q in zip(results, qvalues):
            r.qvalue = q
            r.format_observed = format_observed
        is_tuple = False

    # one key function per supported output order
    sort_keys = {
        "track": lambda r: (r.track, r.annotation),
        "observed": lambda r: r.observed,
        "annotation": lambda r: (r.annotation, r.track),
        "fold": lambda r: r.fold,
        "pvalue": lambda r: r.pvalue,
        "qvalue": lambda r: r.qvalue,
    }

    all_counters = set(r.counter for r in results)

    for counter in all_counters:

        if len(all_counters) == 1:
            # single counter: everything goes to stdout
            outfile = options.stdout
            table = results
        else:
            # one table per counter
            outfilename = re.sub("%s", counter,
                                 options.output_tables_pattern)
            E.info("output for counter %s goes to outfile %s" %
                   (counter, outfilename))
            outfile = IOTools.openFile(outfilename, "w")
            table = [r for r in results if r.counter == counter]

        outfile.write(
            "\t".join(list(header) + list(description_header)) + "\n")

        try:
            table.sort(key=sort_keys[options.output_order])
        except KeyError:
            raise ValueError("unknown sort order %s" % options.output_order)

        for result in table:
            if is_tuple:
                outfile.write("\t".join(map(str, result)))
            else:
                outfile.write(str(result))

            if descriptions:
                # append description columns, padding when missing
                try:
                    outfile.write(
                        "\t" + "\t".join(descriptions[result.annotation]))
                except KeyError:
                    outfile.write(
                        "\t" + "\t".join([""] * description_width))

            outfile.write("\n")

        # never close the shared stdout handle
        if outfile != options.stdout:
            outfile.close()
def fromSegments(options, args):
    '''run analysis from segment files.

    This is the most common use case.

    Loads segments/annotations/workspace, builds the sampler, counters
    and workspace generator from *options*, validates any reference
    data, and delegates the sampling run to gat.run.

    :return: list of annotator results from gat.run.
    '''

    tstart = time.time()

    # build segments
    segments, annotations, workspaces, isochores = IO.buildSegments(options)

    E.info("intervals loaded in %i seconds" % (time.time() - tstart))

    # open various additional output files
    outfiles = {}
    for section in ("sample",
                    "segment_metrics",
                    "sample_metrics",
                    ):
        # BUG FIX: the regex was matched against the literal string
        # "section" instead of the loop variable, so pattern-based
        # selection of output sections never worked.
        if section in options.output_stats or \
           "all" in options.output_stats or \
           len([x for x in options.output_stats
                if re.search(x, section)]) > 0:
            outfiles[section] = E.openOutputFile(section)

    if 'sample_metrics' in outfiles:
        outfiles['sample_metrics'].write(
            "track\tsection\tmetric\t%s\n" %
            "\t".join(Stats.Summary().getHeaders()))

    # filter segments by workspace
    workspace = IO.applyIsochores(
        segments, annotations, workspaces, options, isochores,
        truncate_segments_to_workspace=options.truncate_segments_to_workspace,
        truncate_workspace_to_annotations=options.
        truncate_workspace_to_annotations,
        restrict_workspace=options.restrict_workspace)

    # check memory requirements
    # previous algorithm: memory requirements if all samples are stored
    # counts = segments.countsPerTrack()
    # max_counts = max(counts.values())
    # memory = 8 * 2 * options.num_samples * max_counts * len(workspace)

    # initialize sampler
    if options.sampler == "annotator":
        sampler = Engine.SamplerAnnotator(bucket_size=options.bucket_size,
                                          nbuckets=options.nbuckets)
    elif options.sampler == "shift":
        sampler = Engine.SamplerShift(radius=options.shift_expansion,
                                      extension=options.shift_extension)
    elif options.sampler == "segments":
        sampler = Engine.SamplerSegments()
    elif options.sampler == "local-permutation":
        sampler = Engine.SamplerLocalPermutation()
    elif options.sampler == "global-permutation":
        sampler = Engine.SamplerGlobalPermutation()
    elif options.sampler == "brute-force":
        sampler = Engine.SamplerBruteForce()
    elif options.sampler == "uniform":
        sampler = Engine.SamplerUniform()
    else:
        # BUG FIX: an unknown sampler previously fell through silently
        # and caused a NameError when *sampler* was used below; fail
        # early, mirroring the counter check.
        raise ValueError("unknown sampler '%s'" % options.sampler)

    # initialize counter
    counters = []
    for counter in options.counters:
        if counter == "nucleotide-overlap":
            counters.append(Engine.CounterNucleotideOverlap())
        elif counter == "nucleotide-density":
            counters.append(Engine.CounterNucleotideDensity())
        elif counter == "segment-overlap":
            counters.append(Engine.CounterSegmentOverlap())
        elif counter == "annotation-overlap":
            counters.append(Engine.CounterAnnotationOverlap())
        elif counter == "segment-midoverlap":
            counters.append(Engine.CounterSegmentMidpointOverlap())
        elif counter == "annotation-midoverlap":
            counters.append(Engine.CounterAnnotationMidpointOverlap())
        else:
            raise ValueError("unknown counter '%s'" % counter)

    # initialize workspace generator
    if options.conditional == "unconditional":
        workspace_generator = Engine.UnconditionalWorkspace()
    elif options.conditional == "cooccurance":
        workspace_generator = Engine.ConditionalWorkspaceCooccurance()
    elif options.conditional == "annotation-centered":
        if options.conditional_expansion is None:
            raise ValueError(
                "please specify either --conditional-expansion or "
                "--conditional-extension")
        workspace_generator = Engine.ConditionalWorkspaceAnnotationCentered(
            options.conditional_extension,
            options.conditional_expansion)
    elif options.conditional == "segment-centered":
        if options.conditional_expansion is None:
            raise ValueError(
                "please specify either --conditional-expansion or "
                "--conditional-extension")
        workspace_generator = Engine.ConditionalWorkspaceSegmentCentered(
            options.conditional_extension,
            options.conditional_expansion)
    else:
        raise ValueError("unknown conditional workspace '%s'" %
                         options.conditional)

    # check if reference is complete
    if options.reference:
        for track in segments.tracks:
            if track not in options.reference:
                raise ValueError("missing track '%s' in reference" % track)
            r = options.reference[track]
            for annotation in annotations.tracks:
                if annotation not in r:
                    raise ValueError(
                        "missing annotation '%s' in annotations for "
                        "track='%s'" % (annotation, track))

    # compute
    annotator_results = gat.run(
        segments,
        annotations,
        workspace,
        sampler,
        counters,
        workspace_generator=workspace_generator,
        num_samples=options.num_samples,
        cache=options.cache,
        outfiles=outfiles,
        output_counts_pattern=options.output_counts_pattern,
        output_samples_pattern=options.output_samples_pattern,
        sample_files=options.sample_files,
        conditional=options.conditional,
        conditional_extension=options.conditional_extension,
        reference=options.reference,
        pseudo_count=options.pseudo_count,
        num_threads=options.num_threads)

    return annotator_results
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv:
        argv = sys.argv

    parser = gat.buildParser(usage=globals()["__doc__"])

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv, add_output_options=True)

    ##################################################
    description_header, descriptions, description_width = \
        IO.readDescriptions(options)

    ##################################################
    size_pos, size_segment = SegmentList.getSegmentSize()
    E.debug("sizes: pos=%i segment=%i, max_coord=%i" %
            (size_pos, size_segment, 2 ** (8 * size_pos)))

    ##################################################
    # set default counter
    if not options.counters:
        options.counters.append("nucleotide-overlap")

    ##################################################
    # every given output pattern must carry a %s placeholder
    for attribute in ("output_tables_pattern",
                      "output_samples_pattern",
                      "output_counts_pattern"):
        pattern = getattr(options, attribute)
        if pattern is not None and "%s" not in pattern:
            raise ValueError(
                "%s should contain at least one '%%s'" % attribute)

    if options.random_seed is not None:
        # initialize python random number generator
        random.seed(options.random_seed)
        # initialize numpy random number generator
        numpy.random.seed(options.random_seed)

    ##################################################
    # read fold changes that results should be compared with
    if options.null == "default":
        options.reference = None
    else:
        if not os.path.exists(options.null):
            raise OSError("file %s not found" % options.null)
        E.info("reading reference results from %s" % options.null)
        options.reference = IO.readAnnotatorResults(options.null)

    if options.input_filename_counts:
        # use pre-computed counts
        annotator_results = Engine.fromCounts(options.input_filename_counts)
    elif options.input_filename_results:
        # use previous results (re-computes fdr)
        E.info("reading gat results from %s" %
               options.input_filename_results)
        annotator_results = IO.readAnnotatorResults(
            options.input_filename_results)
    else:
        # do full gat analysis
        annotator_results = fromSegments(options, args)

    ##################################################
    if options.pvalue_method != "empirical":
        E.info("updating pvalues to %s" % options.pvalue_method)
        Engine.updatePValues(annotator_results, options.pvalue_method)

    ##################################################
    # output
    IO.outputResults(annotator_results,
                     options,
                     Engine.AnnotatorResultExtended.headers,
                     description_header,
                     description_width,
                     descriptions)

    IO.plotResults(annotator_results, options)

    # write footer and output benchmark information.
    E.Stop()