def readDescriptions(options):
    '''read descriptions from tab separated file.'''

    description_header, descriptions, description_width = [], {}, 0
    if options.input_filename_descriptions:
        E.info("reading descriptions from %s" %
               options.input_filename_descriptions)

        with IOTools.openFile(options.input_filename_descriptions) as inf:
            first = True
            for line in inf:
                if line.startswith("#"):
                    continue
                data = line[:-1].split("\t")

                if description_width:
                    assert len(data) - 1 == description_width, \
                        "inconsistent number of descriptions in %s" %\
                        options.input_filename_descriptions
                else:
                    description_width = len(data) - 1

                if first:
                    description_header = data[1:]
                    first = False
                else:
                    descriptions[data[0]] = data[1:]

        assert len(description_header) == description_width, \
            "number of descriptions (%i) inconsistent with header (%i) in %s" % \
            (description_width, len(description_header),
             options.input_filename_descriptions)

    return description_header, descriptions, description_width
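
# Usage sketch for readDescriptions (file contents are hypothetical): the
# descriptions file is tab-separated with a header row; column 1 holds the
# annotation term, the remaining columns hold free-text descriptions, e.g.
#
#   term<tab>description<tab>source
#   GO:0008150<tab>biological process<tab>GO
#   GO:0003674<tab>molecular function<tab>GO
#
# would return (["description", "source"],
#               {"GO:0008150": ["biological process", "GO"],
#                "GO:0003674": ["molecular function", "GO"]},
#               2).
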
def readSegmentList(label,
                    filenames,
                    enable_split_tracks=False,
                    ignore_tracks=False):
    """read one or more segment files.

    Arguments
    ---------
    label : string
        Label to use for the IntervalCollection.
    filenames : list
        List of filenames to load in :term:`bed` format.
    enable_split_tracks : bool
        If True, allow tracks to be split across multiple files.
    ignore_tracks : bool
        If True, ignore track information.

    Returns
    -------
    segments : IntervalCollection
        The segment collection.
    """
    results = Engine.IntervalCollection(name=label)
    E.info("%s: reading tracks from %i files" % (label, len(filenames)))
    results.load(filenames,
                 allow_multiple=enable_split_tracks,
                 ignore_tracks=ignore_tracks)
    E.info("%s: read %i tracks from %i files" %
           (label, len(results), len(filenames)))
    return results
def fromCounts(filename):
    '''build annotator results from a tab-separated table with counts.'''

    annotator_results = []

    with IOTools.openFile(filename, "r") as infile:

        E.info("loading data")

        header = infile.readline()
        if not header == "track\tannotation\tobserved\tcounts\n":
            raise ValueError("%s not a counts file: got %s" %
                             (infile, header))

        for line in infile:
            track, annotation, observed, counts = line[:-1].split("\t")
            samples = numpy.array(
                list(map(float, counts.split(","))),
                dtype=float)  # numpy.float was removed in NumPy 1.24
            observed = float(observed)
            annotator_results.append(Engine.AnnotatorResult(
                track=track,
                annotation=annotation,
                counter="na",
                observed=observed,
                samples=samples))

    return annotator_results
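
# Counts-file sketch, inferred from the header check in fromCounts: four
# tab-separated columns, with "counts" holding one comma-separated sampled
# overlap value per randomisation, e.g.
#
#   track<tab>annotation<tab>observed<tab>counts
#   liver_peaks<tab>GO:0008150<tab>12<tab>10,11,9,13
#
# (the track and annotation names here are made up for illustration).
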
def __init__(self,
             num_samples,
             samples,
             samples_outfile,
             sampler,
             workspace_generator,
             counters,
             outfiles,
             num_threads=1):
    self.num_samples = num_samples
    self.samples = samples
    self.samples_outfile = samples_outfile
    self.sampler = sampler
    self.workspace_generator = workspace_generator
    self.counters = counters

    self.outfile_sample_stats = outfiles.get("sample_stats", None)
    self.outfile_sample_metrics = outfiles.get("sample_metrics", None)

    if self.outfile_sample_stats:
        E.debug("sample stats go to %s" % self.outfile_sample_stats)
        self.outfile_sample_stats.write(
            "sample\tisochore\tnsegments\tnnucleotides\tmean\t"
            "std\tmin\tq1\tmedian\tq3\tmax\n")

    self.last_sample_id = None
    self.all_lengths = []
    self.num_threads = num_threads
def readSegmentList(label, filenames, options, enable_split_tracks=False):
    # read one or more segment files
    results = GatEngine.IntervalCollection(name=label)
    E.info("%s: reading tracks from %i files" % (label, len(filenames)))
    results.load(filenames, split_tracks=enable_split_tracks)
    E.info("%s: read %i tracks from %i files" %
           (label, len(results), len(filenames)))
    dumpStats(results, "stats_%s_raw" % label, options)
    results.normalize()
    dumpStats(results, "stats_%s_normed" % label, options)
    return results
def computeSamples(self, work, report_interval=100):
    '''compute samples according to work.

    returns a list of results.
    '''
    n = len(work)

    E.debug('sampling will work on %i items' % n)

    results = []

    if self.num_threads == 0:
        for i, w in enumerate(work):
            r = computeSample(
                (w, self.samples_outfile, self.outfile_sample_metrics, None))
            if i % report_interval == 0:
                E.info("%i/%i done (%5.2f)" % (i, n, 100.0 * i / n))
            results.append(r)
    else:
        E.info("generating processpool with %i threads for %i items" %
               (self.num_threads, len(work)))
        manager = multiprocessing.Manager()
        lock = manager.Lock()

        pool = multiprocessing.Pool(self.num_threads)

        # use file names - not file objects - when multiprocessing
        samples_outfile, metrics_outfile = None, None
        if self.samples_outfile:
            samples_outfile = self.samples_outfile.name
            self.samples_outfile.flush()
        if self.outfile_sample_metrics:
            metrics_outfile = self.outfile_sample_metrics.name
            self.outfile_sample_metrics.flush()

        ww = [(w, samples_outfile, metrics_outfile, lock) for w in work]

        for i, r in enumerate(pool.imap_unordered(computeSample, ww)):
            if i % report_interval == 0:
                E.info("%i/%i done (%5.2f)" % (i, n, 100.0 * i / n))
            results.append(r)

        pool.close()
        pool.join()

    return results
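
# A minimal, self-contained sketch (not part of gat) of the worker-pool
# pattern used in computeSamples above. It mirrors the design choice made
# there: open file handles do not survive pickling, so the workers receive
# file *names* plus a manager lock rather than the file objects themselves.
import multiprocessing


def _demo_square(args):
    value, lock = args
    # a real worker would acquire the lock before writing shared output
    return value * value


def _demo_pool(num_threads=2):
    manager = multiprocessing.Manager()
    lock = manager.Lock()
    pool = multiprocessing.Pool(num_threads)
    work = [(x, lock) for x in range(10)]
    results = list(pool.imap_unordered(_demo_square, work))
    pool.close()
    pool.join()
    return sorted(results)
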
def run(segments,
        annotations,
        workspace,
        sampler,
        counters,
        workspace_generator,
        **kwargs):
    '''run an enrichment analysis.

    segments: an IntervalCollection
    workspace: an IntervalCollection
    annotations: an IntervalCollection

    kwargs recognized are:

    cache
       filename of cache

    num_samples
       number of samples to compute

    output_counts_pattern
       output counts to filename

    output_samples_pattern
       if given, output samples to these files, one per segment

    sample_files
       if given, read samples from these files.

    fdr
       method to compute qvalues

    outfiles
       dictionary of optional additional output files.

    pseudo_count
       pseudo_count to add to observed and expected values

    reference
       data with reference observed and expected values.
    '''
    # get arguments
    num_samples = kwargs.get("num_samples", 10000)
    cache = kwargs.get("cache", None)
    output_counts_pattern = kwargs.get("output_counts_pattern", None)
    sample_files = kwargs.get("sample_files", [])
    pseudo_count = kwargs.get("pseudo_count", 1.0)
    reference = kwargs.get("reference", None)
    output_samples_pattern = kwargs.get("output_samples_pattern", None)
    outfiles = kwargs.get("outfiles", {})
    num_threads = kwargs.get("num_threads", 0)

    ##################################################
    # compute summary metrics for segments
    if "segment_metrics" in outfiles:
        E.info("computing summary metrics for segments")
        outfile = outfiles["segment_metrics"]
        outfile.write("track\tsection\tmetric\t%s\n" %
                      "\t".join(Stats.Summary().getHeaders()))
        for track in segments.tracks:
            IO.outputMetrics(outfile,
                             segments[track],
                             workspace,
                             track,
                             'segments')
        E.info("wrote summary metrics for segments to %s" % str(outfile))

    ##################################################
    # collect observed counts from segments
    E.info("collecting observed counts")
    observed_counts = []
    for counter in counters:
        observed_counts.append(Engine.computeCounts(
            counter=counter,
            aggregator=sum,
            segments=segments,
            annotations=annotations,
            workspace=workspace,
            workspace_generator=workspace_generator))

    ##################################################
    # sample and collect counts
    ##################################################
    E.info("starting sampling")

    if cache:
        E.info("samples are cached in %s" % cache)
        samples = Engine.SamplesCached(filename=cache)
    elif sample_files:
        if not output_samples_pattern:
            raise ValueError(
                "require output_samples_pattern if loading samples from files")
        # build regex
        regex = re.compile(re.sub("%s", r"(\S+)", output_samples_pattern))
        E.info("loading samples from %i files" % len(sample_files))
        samples = Engine.SamplesFile(
            filenames=sample_files,
            regex=regex)
    else:
        samples = Engine.Samples()

    sampled_counts = {}

    counts = E.Counter()

    ntracks = len(segments.tracks)

    for ntrack, track in enumerate(segments.tracks):

        segs = segments[track]

        E.info("sampling: %s: %i/%i" % (track, ntrack + 1, ntracks))

        if output_samples_pattern and not sample_files:
            filename = re.sub("%s", track, output_samples_pattern)
            E.debug("saving samples to %s" % filename)
            dirname = os.path.dirname(filename)
            if dirname and not os.path.exists(dirname):
                os.makedirs(dirname)
            if filename.endswith(".gz"):
                samples_outfile = gzip.open(filename, "w")
            else:
                samples_outfile = open(filename, "w")
        else:
            samples_outfile = None

        if workspace_generator.is_conditional:
            outer_sampler = ConditionalSampler(num_samples,
                                               samples,
                                               samples_outfile,
                                               sampler,
                                               workspace_generator,
                                               counters,
                                               outfiles,
                                               num_threads=num_threads)
        else:
            outer_sampler = UnconditionalSampler(num_samples,
                                                 samples,
                                                 samples_outfile,
                                                 sampler,
                                                 workspace_generator,
                                                 counters,
                                                 outfiles,
                                                 num_threads=num_threads)

        counts_per_track = outer_sampler.sample(
            track, counts, counters, segs, annotations, workspace, outfiles)

        # skip empty tracks
        if counts_per_track is None:
            continue

        if samples_outfile:
            samples_outfile.close()

        sampled_counts[track] = counts_per_track

        # old code, refactor into loop to save samples
        if 0:
            E.info("sampling stats: %s" % str(counts))
            if track not in samples:
                E.warn("no samples for track %s" % track)
                continue

            # clean up samples
            del samples[track]

    E.info("sampling finished")

    # build annotator results
    E.info("computing PValue statistics")

    annotator_results = list()
    counter_id = 0
    for counter, observed_count in zip(counters, observed_counts):
        for track, r in observed_count.items():
            for annotation, observed in r.items():
                temp_segs, temp_annos, temp_workspace = workspace_generator(
                    segments[track],
                    annotations[annotation],
                    workspace)

                # ignore empty results
                if temp_workspace.sum() == 0:
                    continue

                # if reference is given, the p-value will indicate difference
                # The test that track and annotation are present is done
                # elsewhere
                if reference:
                    ref = reference[track][annotation]
                else:
                    ref = None

                annotator_results.append(Engine.AnnotatorResultExtended(
                    track=track,
                    annotation=annotation,
                    counter=counter.name,
                    observed=observed,
                    samples=sampled_counts[track][counter_id][annotation],
                    track_segments=temp_segs,
                    annotation_segments=temp_annos,
                    workspace=temp_workspace,
                    reference=ref,
                    pseudo_count=pseudo_count))
        counter_id += 1

    # dump (large) table with counts
    if output_counts_pattern:
        for counter in counters:
            name = counter.name
            filename = re.sub("%s", name, output_counts_pattern)
            E.info("writing counts to %s" % filename)
            output = [x for x in annotator_results if x.counter == name]
            outfile = IOTools.openFile(filename, "w")
            outfile.write("track\tannotation\tobserved\tcounts\n")
            for o in output:
                outfile.write("%s\t%s\t%i\t%s\n" %
                              (o.track, o.annotation, o.observed,
                               ",".join(["%i" % x for x in o.samples])))

    return annotator_results
def sample(self, track, counts, counters, segs,
           annotations, workspace, outfiles):
    '''sample and return counts.

    Returns a list of counted results for each counter.
    '''
    E.info("performing unconditional sampling")
    counts_per_track = [collections.defaultdict(list) for x in counters]

    # rebuild non-isochore annotations and workspace
    contig_annotations = annotations.clone()
    contig_annotations.fromIsochores()
    contig_annotations.setName("contig_" + annotations.getName())

    contig_workspace = workspace.clone()
    contig_workspace.fromIsochores()

    E.info("workspace without conditioning: %i segments, %i nucleotides" %
           (workspace.counts(),
            workspace.sum()))

    temp_segs, _, temp_workspace = self.workspace_generator(
        segs, None, workspace)

    E.info("workspace after conditioning: %i segments, %i nucleotides" %
           (workspace.counts(),
            workspace.sum()))

    if workspace.sum() == 0:
        E.warn("empty workspace - no computation performed")
        return None

    work = [WorkData(track,
                     x,
                     self.sampler,
                     temp_segs,
                     annotations,
                     contig_annotations,
                     temp_workspace,
                     contig_workspace,
                     counters,
                     ) for x in range(self.num_samples)]

    if self.num_threads > 0:
        E.info("setting up shared data for multi-processing")
        annotations.share()
        contig_annotations.share()
        contig_workspace.share("contig_workspace")
        temp_segs.share("generated_segments")
        temp_workspace.share("generated_workspace")

    E.info("sampling started")
    results = self.computeSamples(work)
    E.info("sampling completed")

    if self.num_threads > 0:
        E.info("retrieving private data")
        annotations.unshare()
        contig_annotations.unshare()
        contig_workspace.unshare()
        temp_segs.unshare()
        temp_workspace.unshare()

    # collate results
    for result in results:
        for counter_id, counter in enumerate(counters):
            for annotation in annotations.tracks:
                counts_per_track[counter_id][annotation].append(
                    result[counter_id][annotation])

    self.outputSampleStats(None, "", [])

    return counts_per_track
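
# Design note on sample() above (an interpretation, not upstream
# documentation): share()/unshare() appear to publish the interval
# collections into named shared memory ("contig_workspace",
# "generated_segments", "generated_workspace") before the worker pool
# starts, so that each forked process reads one shared copy instead of
# receiving pickled collections with every work item; unshare() then
# reclaims the shared blocks once sampling is done.
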
def main(argv):

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = optparse.OptionParser(
        version="%prog version: $Id: script_template.py 2871 2010-03-03 10:20:44Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-a", "--annotation-file", "--annotations",
                      dest="annotation_files", type="string", action="append",
                      help="filename with annotations [default=%default].")

    parser.add_option(
        "-s", "--segment-file", "--segments", dest="segment_files",
        type="string", action="append",
        help="filename with segments. Also accepts a glob in parentheses "
        "[default=%default].")

    parser.add_option(
        "-w", "--workspace-file", "--workspace", dest="workspace_files",
        type="string", action="append",
        help="filename with workspace segments. Also accepts a glob in "
        "parentheses [default=%default].")

    parser.add_option(
        "-i", "--isochore-file", "--isochores", dest="isochore_files",
        type="string", action="append",
        help="filename with isochore segments. Also accepts a glob in "
        "parentheses [default=%default].")

    parser.add_option(
        "-o", "--order", dest="output_order", type="choice",
        choices=("track", "annotation", "fold", "pvalue", "qvalue"),
        help="order results in output by fold, track, etc. "
        "[default=%default].")

    parser.add_option(
        "-q", "--qvalue-method", dest="qvalue_method", type="choice",
        choices=("storey", "BH", "bonferroni", "holm", "hommel", "hochberg",
                 "BY", "none"),
        help="method to perform multiple testing correction by controlling "
        "the fdr [default=%default].")

    parser.add_option("--qvalue-lambda", dest="qvalue_lambda", type="float",
                      help="fdr computation: lambda [default=%default].")

    parser.add_option(
        "--qvalue-pi0-method", dest="qvalue_pi0_method", type="choice",
        choices=("smoother", "bootstrap"),
        help="fdr computation: method for estimating pi0 [default=%default].")

    parser.add_option(
        "--descriptions", dest="input_filename_descriptions", type="string",
        help="filename mapping annotation terms to descriptions. "
        "If given, the output table will contain additional columns "
        "[default=%default]")

    parser.add_option(
        "--ignore-segment-tracks", dest="ignore_segment_tracks",
        action="store_true",
        help="ignore segment tracks - all segments belong to one track "
        "[default=%default]")

    parser.add_option(
        "--enable-split-tracks", dest="enable_split_tracks",
        action="store_true",
        help="permit the same track to be in multiple files "
        "[default=%default]")

    parser.add_option(
        "--output-bed", dest="output_bed", type="choice", action="append",
        choices=("all", "annotations", "segments", "workspaces", "isochores",
                 "overlap"),
        help="output bed files [default=%default].")

    parser.add_option(
        "--output-stats", dest="output_stats", type="choice", action="append",
        choices=("all", "annotations", "segments", "workspaces", "isochores",
                 "overlap"),
        help="output overlap summary stats [default=%default].")

    parser.add_option(
        "--restrict-workspace", dest="restrict_workspace",
        action="store_true",
        help="restrict workspace to those segments that contain both track"
        " and annotations [default=%default]")

    parser.add_option(
        "--counter", dest="counters", type="choice", action="append",
        choices=("binom", "hyperg"),
        help="counter to use [default=%default].")

    parser.add_option(
        "--output-tables-pattern", dest="output_tables_pattern",
        type="string",
        help="output pattern for result tables. Used if there are multiple "
        "counters used [default=%default].")

    parser.set_defaults(
        annotation_files=[],
        segment_files=[],
        workspace_files=[],
        sample_files=[],
        counters=[],
        output_stats=[],
        output_bed=[],
        output_tables_pattern="%s.tsv.gz",
        output_order="fold",
        input_filename_counts=None,
        input_filename_results=None,
        pvalue_method="empirical",
        output_plots_pattern=None,
        output_samples_pattern=None,
        qvalue_method="storey",
        qvalue_lambda=None,
        qvalue_pi0_method="smoother",
        ignore_segment_tracks=False,
        input_filename_descriptions=None,
        conditional="unconditional",
        conditional_extension=None,
        conditional_expansion=None,
        restrict_workspace=False,
        enable_split_tracks=False,
        shift_expansion=2.0,
        shift_extension=0,
        overlap_mode="midpoint",
        truncate_workspace_to_annotations=False,
        truncate_segments_to_workspace=False)

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv, add_output_options=True)

    tstart = time.time()

    if len(options.counters) == 0:
        options.counters.append("binom")

    ############################################
    segments, annotations, workspaces, isochores = IO.buildSegments(options)
    E.info("intervals loaded in %i seconds" % (time.time() - tstart))

    # filter segments by workspace
    workspace = IO.applyIsochores(
        segments, annotations, workspaces, options, isochores)

    ############################################
    description_header, descriptions, description_width = IO.readDescriptions(
        options)

    ############################################
    # compute per contig

    # compute bases covered by workspace
    workspace2basecoverage, isochores = {}, []
    for contig, ww in workspace.items():
        workspace2basecoverage[contig] = ww.sum()
        isochores.append(contig)

    # compute percentage of bases covered by annotations in workspace
    # per isochore
    annotation2basecoverage = collections.defaultdict(dict)
    for annotation, aa in annotations.items():
        for isochore, a in aa.items():
            # need to truncate to workspace?
            annotation2basecoverage[annotation][isochore] = a.sum()

    results_per_contig = collections.defaultdict(list)

    E.info("computing counts per isochore")

    # results per isochore
    def emptyResult(segment, annotation, isochore,
                    counter,
                    nsegments_in_workspace,
                    basecoverage_annotation,
                    basecoverage_workspace):
        return GREAT_RESULT._make((
            segment, annotation, isochore,
            counter,
            0,    # observed
            0,    # expected
            nsegments_in_workspace,
            0,    # nannotations_in_workspace
            0,    # nsegments_overlapping_annotation
            0,    # nannotations_overlapping_segments
            0,    # basecoverage_intersection
            0,    # basecoverage_segments
            basecoverage_annotation,
            basecoverage_workspace,
            0.0,
            1.0,
            1.0,
            1.0))

    for isochore in isochores:
        basecoverage_workspace = workspace2basecoverage[isochore]

        # iterate over all segment tracks
        for segment, segmentdict in segments.items():
            try:
                ss = segmentdict[isochore]
                # select segments overlapping workspace
                segments_in_workspace = GatSegmentList.SegmentList(clone=ss)
                segments_in_workspace.intersect(workspace[isochore])
                # number of segments in workspace
                nsegments_in_workspace = len(segments_in_workspace)
            except KeyError:
                ss = None

            basecoverage_segments = segments_in_workspace.sum()

            for annotation, annotationdict in annotations.items():

                # if annotation != "GO:0030957": continue

                try:
                    aa = annotationdict[isochore]
                except KeyError:
                    aa = None

                # p_A: proportion of bases covered by annotation
                try:
                    basecoverage_annotation = annotation2basecoverage[
                        annotation][isochore]
                except KeyError:
                    basecoverage_annotation = 0

                if ss is None or aa is None:
                    for counter in options.counters:
                        results_per_contig[
                            (counter, segment, annotation)].append(
                                emptyResult(segment,
                                            annotation,
                                            isochore,
                                            counter,
                                            nsegments_in_workspace,
                                            basecoverage_annotation,
                                            basecoverage_workspace))
                    continue

                # select segments overlapping annotation
                segments_overlapping_annotation = GatSegmentList.SegmentList(
                    clone=ss)
                segments_overlapping_annotation.intersect(
                    annotations[annotation][isochore])
                # number of segments overlapping annotation
                nsegments_overlapping_annotation = ss.intersectionWithSegments(
                    annotations[annotation][isochore],
                    mode=options.overlap_mode)

                # number of nucleotides at the intersection of segments,
                # annotation and workspace
                basecoverage_intersection = \
                    segments_overlapping_annotation.sum()

                annotations_overlapping_segments = GatSegmentList.SegmentList(
                    clone=aa)
                annotations_overlapping_segments.intersect(ss)
                nannotations_overlapping_segments = len(
                    annotations_overlapping_segments)

                nannotations_in_workspace = len(aa)
                if nannotations_in_workspace == 0:
                    for counter in options.counters:
                        results_per_contig[
                            (counter, segment, annotation)].append(
                                emptyResult(segment,
                                            annotation,
                                            isochore,
                                            counter,
                                            nsegments_in_workspace,
                                            basecoverage_annotation,
                                            basecoverage_workspace))
                    continue

                fraction_coverage_annotation = basecoverage_annotation / \
                    float(basecoverage_workspace)
                fraction_hit_annotation = float(
                    nannotations_overlapping_segments) / \
                    nannotations_in_workspace

                for counter in options.counters:
                    if counter.startswith("binom"):
                        # GREAT binomial probability over "regions"
                        # n = number of genomic regions =
                        #     nannotations_in_workspace
                        # ppi = fraction of genome annotated by annotation =
                        #     fraction_coverage_annotation
                        # kpi = genomic regions with annotation hit by
                        #     segments = nannotations_in_segments
                        # sf = survival function = 1 - cdf
                        # probability of observing >kpi in a sample of n
                        # where the probability of success is ppi.
                        pvalue = scipy.stats.binom.sf(
                            nsegments_overlapping_annotation - 1,
                            nsegments_in_workspace,
                            fraction_coverage_annotation)
                        expected = fraction_coverage_annotation * \
                            nsegments_in_workspace
                        observed = nsegments_overlapping_annotation
                    elif counter.startswith("hyperg"):
                        # hypergeometric probability over nucleotides
                        # sampling without replacement
                        # x = observed number of nucleotides in the overlap
                        #     of segments, annotations and workspace
                        # M = number of nucleotides in workspace
                        # n = number of nucleotides in annotations (and
                        #     workspace)
                        # N = number of nucleotides in segments (and
                        #     workspace)
                        # P-value of obtaining >x nucleotides overlapping.
                        rv = scipy.stats.hypergeom(basecoverage_workspace,
                                                   basecoverage_annotation,
                                                   basecoverage_segments)
                        pvalue = rv.sf(basecoverage_intersection)
                        expected = rv.mean()
                        observed = basecoverage_intersection

                    if expected != 0:
                        fold = float(observed) / expected
                    else:
                        fold = 1.0

                    r = GREAT_RESULT._make((
                        segment, annotation, isochore,
                        counter,
                        observed,
                        expected,
                        nsegments_in_workspace,
                        nannotations_in_workspace,
                        nsegments_overlapping_annotation,
                        nannotations_overlapping_segments,
                        basecoverage_intersection,
                        basecoverage_segments,
                        basecoverage_annotation,
                        basecoverage_workspace,
                        fraction_coverage_annotation,
                        fold,
                        pvalue,
                        1.0))
                    # print("\t".join(map(str, r)))
                    results_per_contig[
                        (counter, segment, annotation)].append(r)

    E.info("merging counts per isochore")

    # compute sums
    results = []
    for niteration, pair in enumerate(results_per_contig.items()):

        counter, segment, annotation = pair[0]
        data = pair[1]

        nsegments_in_workspace = sum([x.nsegments_in_workspace for x in data])
        nsegments_overlapping_annotation = sum([x.observed for x in data])
        nannotations_in_workspace = sum(
            [x.nannotations_in_workspace for x in data])
        nannotations_overlapping_segments = sum(
            [x.nannotations_overlapping_segments for x in data])

        basecoverage_intersection = sum(
            [x.basecoverage_intersection for x in data])
        basecoverage_segments = sum([x.basecoverage_segments for x in data])
        basecoverage_annotation = sum(
            [x.basecoverage_annotation for x in data])
        basecoverage_workspace = sum([x.basecoverage_workspace for x in data])

        fraction_coverage_annotation = basecoverage_annotation / \
            float(basecoverage_workspace)

        if counter.startswith("binom"):
            pvalue = scipy.stats.binom.sf(
                nsegments_overlapping_annotation - 1,
                nsegments_in_workspace,
                fraction_coverage_annotation)
            expected = fraction_coverage_annotation * nsegments_in_workspace
            observed = nsegments_overlapping_annotation
        elif counter.startswith("hyperg"):
            rv = scipy.stats.hypergeom(basecoverage_workspace,
                                       basecoverage_annotation,
                                       basecoverage_segments)
            pvalue = rv.sf(basecoverage_intersection)
            expected = rv.mean()
            observed = basecoverage_intersection

        if expected != 0:
            fold = float(observed) / expected
        else:
            fold = 1.0

        r = GREAT_RESULT._make((
            segment, annotation, "all",
            counter,
            observed,
            expected,
            nsegments_in_workspace,
            nannotations_in_workspace,
            nsegments_overlapping_annotation,
            nannotations_overlapping_segments,
            basecoverage_intersection,
            basecoverage_segments,
            basecoverage_annotation,
            basecoverage_workspace,
            fraction_coverage_annotation,
            fold,
            pvalue,
            1.0))

        results.append(r)

    IO.outputResults(results,
                     options,
                     GREAT_RESULT._fields,
                     description_header,
                     description_width,
                     descriptions)

    E.Stop()
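
# Worked mini-example (all numbers hypothetical) of the two counters used in
# main() above. "binom" follows the calls above: with 500 segments in the
# workspace of which 12 overlap an annotation covering 1.5% of workspace
# bases, the p-value is the binomial upper tail P(X >= 12), i.e. sf(12 - 1).
# "hyperg" samples nucleotides without replacement: scipy.stats.hypergeom(M,
# n, N) with M = workspace bases, n = annotation bases, N = segment bases;
# as in the code above, sf(x) gives P(X > x).
def _demo_great_pvalues():
    import scipy.stats

    # binomial counter: expected = 0.015 * 500 = 7.5 segments
    p_binom = scipy.stats.binom.sf(12 - 1, 500, 0.015)

    # hypergeometric counter: workspace 1000000 bp, annotation 20000 bp,
    # segments 50000 bp, observed intersection 1500 bp
    rv = scipy.stats.hypergeom(1000000, 20000, 50000)
    p_hyperg = rv.sf(1500)
    expected = rv.mean()  # = 50000 * 20000 / 1000000 = 1000 bp
    return p_binom, p_hyperg, expected
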
def buildSegments(options):
    '''load segments, annotations and workspace from parameters
    defined in *options*.

    The workspace will be split by isochores.

    returns segments, annotations and workspace.
    '''
    options.segment_files = expandGlobs(options.segment_files)
    options.annotation_files = expandGlobs(options.annotation_files)
    options.workspace_files = expandGlobs(options.workspace_files)
    options.sample_files = expandGlobs(options.sample_files)

    ##################################################
    # arguments sanity check
    if not options.segment_files:
        raise ValueError("please specify at least one segment file")
    if not options.annotation_files:
        raise ValueError("please specify at least one annotation file")
    if not options.workspace_files:
        raise ValueError("please specify at least one workspace file")

    # read one or more segment files
    segments = readSegmentList("segments",
                               options.segment_files,
                               ignore_tracks=options.ignore_segment_tracks)
    segments.normalize()

    if segments.sum() == 0:
        E.critical("no segments in input file - run aborted")
        raise ValueError("segments file is empty - run aborted")

    if len(segments) > 1000:
        raise ValueError("too many (%i) segment files - use track definitions "
                         "or --ignore-segment-tracks" % len(segments))

    annotations = readSegmentList(
        "annotations",
        options.annotation_files,
        enable_split_tracks=options.enable_split_tracks,
        ignore_tracks=options.annotations_label is not None)

    if options.annotations_label is not None:
        annotations.setName(options.annotations_label)

    if options.annotations_to_points:
        annotations.toPositions(options.annotations_to_points)

    if options.overlapping_annotations:
        # only sort, do not merge
        annotations.sort()
    else:
        annotations.normalize()

    workspaces = readSegmentList(
        "workspaces",
        options.workspace_files,
        enable_split_tracks=options.enable_split_tracks)
    workspaces.normalize()

    # intersect workspaces to build a single workspace
    E.info("collapsing workspaces")
    dumpStats(workspaces, "stats_workspaces_input", options)
    workspaces.collapse()
    dumpStats(workspaces, "stats_workspaces_collapsed", options)

    # use merged workspace only, discard others
    workspaces.restrict("collapsed")

    # build isochores or intersect annotations/segments with workspace
    if options.isochore_files:

        # read one or more isochore files
        isochores = Engine.IntervalCollection(name="isochores")
        E.info("%s: reading isochores from %i files" %
               ("isochores", len(options.isochore_files)))
        isochores.load(options.isochore_files)
        dumpStats(isochores, "stats_isochores_raw", options)

        # merge isochores and check if consistent (fully normalized)
        isochores.sort()

        # check that there are no overlapping segments within isochores
        isochores.check()

        # TODO: flag is_normalized not properly set
        isochores.normalize()

        # check that there are no overlapping segments between isochores

        # truncate isochores to workspace
        # crucial if isochores are larger than workspace.
        isochores.intersect(workspaces["collapsed"])

    else:
        isochores = None

    return segments, annotations, workspaces, isochores
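
# Note on the "(glob)" convention referenced by the option help texts
# ("Also accepts a glob in parentheses"): expandGlobs presumably resolves an
# argument such as "(*.bed.gz)" into the matching list of filenames before
# the sanity checks above run, so a single command-line argument can stand
# for many input files.
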
def outputResults(results,
                  options,
                  header,
                  description_header,
                  description_width,
                  descriptions,
                  format_observed="%i"):
    '''compute FDR and output results.'''

    pvalues = [x.pvalue for x in results]

    ##################################################
    # compute global fdr
    ##################################################
    E.info("computing FDR statistics")
    qvalues = GatEngine.getQValues(pvalues,
                                   method=options.qvalue_method,
                                   vlambda=options.qvalue_lambda,
                                   pi0_method=options.qvalue_pi0_method)

    try:
        results = [x._replace(qvalue=qvalue)
                   for x, qvalue in zip(results, qvalues)]
        is_tuple = True
    except AttributeError:
        # not a namedtuple
        for x, qvalue in zip(results, qvalues):
            x.qvalue = qvalue
            x.format_observed = format_observed
        is_tuple = False

    counters = set([x.counter for x in results])

    for counter in counters:

        if len(counters) == 1:
            outfile = options.stdout
            output = results
        else:
            outfilename = re.sub("%s", counter, options.output_tables_pattern)
            E.info("output for counter %s goes to outfile %s" %
                   (counter, outfilename))
            outfile = IOTools.openFile(outfilename, "w")
            output = [x for x in results if x.counter == counter]

        outfile.write(
            "\t".join(list(header) + list(description_header)) + "\n")

        if options.output_order == "track":
            output.sort(key=lambda x: (x.track, x.annotation))
        elif options.output_order == "observed":
            output.sort(key=lambda x: x.observed)
        elif options.output_order == "annotation":
            output.sort(key=lambda x: (x.annotation, x.track))
        elif options.output_order == "fold":
            output.sort(key=lambda x: x.fold)
        elif options.output_order == "pvalue":
            output.sort(key=lambda x: x.pvalue)
        elif options.output_order == "qvalue":
            output.sort(key=lambda x: x.qvalue)
        else:
            raise ValueError("unknown sort order %s" % options.output_order)

        for result in output:
            if is_tuple:
                outfile.write("\t".join(map(str, result)))
            else:
                outfile.write(str(result))

            if descriptions:
                try:
                    outfile.write(
                        "\t" + "\t".join(descriptions[result.annotation]))
                except KeyError:
                    outfile.write("\t" + "\t".join([""] * description_width))
            outfile.write("\n")

        if outfile != options.stdout:
            outfile.close()
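
# Output routing in outputResults above: with a single counter, results go
# to stdout; with several counters, the "%s" in --output-tables-pattern is
# replaced by the counter name, so the default pattern "%s.tsv.gz" yields
# one file per counter, e.g. "binom.tsv.gz" and "hyperg.tsv.gz".
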
def dumpBed(coll, section, options):
    if section in options.output_bed or \
            "all" in options.output_bed or \
            len([x for x in options.output_bed
                 if re.search(x, section)]) > 0:
        coll.save(E.openOutputFile(section + ".bed"))
def dumpStats(coll, section, options):
    if section in options.output_stats or \
            "all" in options.output_stats or \
            len([x for x in options.output_stats
                 if re.search(x, section)]) > 0:
        coll.outputStats(E.openOutputFile(section))
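
# Section matching in dumpStats/dumpBed above happens three ways: an exact
# section name in the option list, the literal "all", or any option entry
# that matches the section name as a regular expression via re.search - so,
# for example, --output-stats=annotations also selects derived sections
# such as "stats_annotations_truncated".
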
def applyIsochores(segments, annotations, workspaces,
                   options,
                   isochores=None,
                   truncate_segments_to_workspace=False,
                   truncate_workspace_to_annotations=False,
                   restrict_workspace=False):
    '''apply isochores to segments and annotations.

    Segments and annotations are filtered in place to keep only those
    overlapping the workspace.

    If *isochores* are given, isochores are applied.

    If *truncate_segments_to_workspace* is set, segments are truncated
    to the workspace.

    If *restrict_workspace* is set, the workspace is confined to those
    parts that overlap both a segment and an annotation.

    If *truncate_workspace_to_annotations* is set, the workspace is
    truncated to keep only those parts that overlap annotations.

    returns a workspace divided into isochores.
    '''
    if isochores:
        # intersect isochores and workspaces, segments and annotations.
        # workspace and annotations are truncated;
        # with segments it is optional.
        E.info("adding isochores to workspace")
        workspaces.toIsochores(isochores, truncate=True)
        annotations.toIsochores(isochores, truncate=True)
        segments.toIsochores(
            isochores, truncate=options.truncate_segments_to_workspace)

        if workspaces.sum() == 0:
            raise ValueError("isochores and workspaces do not overlap")
        if annotations.sum() == 0:
            raise ValueError("isochores and annotations do not overlap")
        if segments.sum() == 0:
            raise ValueError("isochores and segments do not overlap")

        dumpStats(workspaces, "stats_workspaces_isochores", options)
        dumpStats(annotations, "stats_annotations_isochores", options)
        dumpStats(segments, "stats_segments_isochores", options)

        dumpBed(workspaces, "workspaces_isochores", options)
        dumpBed(annotations, "annotations_isochores", options)
        dumpBed(segments, "segments_isochores", options)

    else:
        # intersect workspace and segments/annotations:
        # annotations and segments are truncated by the workspace
        if options.truncate_segments_to_workspace:
            segments.intersect(workspaces["collapsed"])
        else:
            segments.filter(workspaces["collapsed"])

        annotations.intersect(workspaces["collapsed"])

        dumpStats(annotations, "stats_annotations_truncated", options)
        dumpStats(segments, "stats_segments_truncated", options)

    workspace = workspaces["collapsed"]

    if restrict_workspace:

        E.info("restricting workspace")
        # this is very cumbersome - refactor merge and collapse
        # to return an IntervalDictionary instead of adding it
        # to the list of tracks
        for x in (segments, annotations):
            # note: the loop variable x is unused below; both passes
            # operate on segments
            if "merged" in segments:
                workspace.filter(segments["merged"])
            else:
                segments.merge()
                workspace.filter(segments["merged"])
                del segments["merged"]

        dumpStats(workspaces, "stats_workspaces_restricted", options)

    if truncate_workspace_to_annotations:

        E.info("truncating workspace to annotations")
        annotations.merge()
        workspace.intersect(annotations["merged"])
        del annotations["merged"]

        dumpStats(workspaces, "stats_workspaces_truncated", options)

    # segments.dump(open("segments_dump.bed", "w"))
    # workspaces.dump(open("workspaces_dump.bed", "w"))

    # output overlap stats
    # output segment densities per workspace
    if "overlap" in options.output_stats or \
            "all" in options.output_stats:
        for track in segments.tracks:
            workspaces.outputOverlapStats(
                E.openOutputFile("overlap_%s" % track),
                segments[track])

    return workspace
def main(argv):

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = optparse.OptionParser(
        version="%prog version: $Id: script_template.py 2871 2010-03-03 10:20:44Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-a", "--gene-file", "--annotations",
                      dest="annotation_files", type="string", action="append",
                      help="filename with annotations - here, location of "
                      "genes [default=%default].")

    parser.add_option("-s", "--segment-file", "--segments",
                      dest="segment_files", type="string", action="append",
                      help="filename with segments. Also accepts a glob in "
                      "parentheses [default=%default].")

    parser.add_option("-w", "--workspace-file", "--workspace",
                      dest="workspace_files", type="string", action="append",
                      help="filename with workspace segments. Also accepts "
                      "a glob in parentheses [default=%default].")

    parser.add_option("-g", "--number-of-genes", dest="number_of_genes",
                      type="int",
                      help="total number of genes [default=%default]")

    parser.add_option("-m", "--annotation-file", dest="annotation_file",
                      type="string",
                      help="filename mapping genes to annotations "
                      "[default=%default]")

    parser.add_option("-o", "--order", dest="output_order", type="choice",
                      choices=("track", "annotation", "fold", "pvalue",
                               "qvalue"),
                      help="order results in output by fold, track, etc. "
                      "[default=%default].")

    parser.add_option("-q", "--qvalue-method", dest="qvalue_method",
                      type="choice",
                      choices=("storey", "BH", "bonferroni", "holm",
                               "hommel", "hochberg", "BY", "none"),
                      help="method to perform multiple testing correction "
                      "by controlling the fdr [default=%default].")

    parser.add_option("--qvalue-lambda", dest="qvalue_lambda", type="float",
                      help="fdr computation: lambda [default=%default].")

    parser.add_option("--qvalue-pi0-method", dest="qvalue_pi0_method",
                      type="choice", choices=("smoother", "bootstrap"),
                      help="fdr computation: method for estimating pi0 "
                      "[default=%default].")

    parser.add_option("--descriptions", dest="input_filename_descriptions",
                      type="string",
                      help="filename mapping annotation terms to "
                      "descriptions. If given, the output table will "
                      "contain additional columns [default=%default]")

    parser.add_option("--ignore-segment-tracks",
                      dest="ignore_segment_tracks", action="store_true",
                      help="ignore segment tracks - all segments belong to "
                      "one track [default=%default]")

    parser.add_option("--enable-split-tracks", dest="enable_split_tracks",
                      action="store_true",
                      help="permit the same track to be in multiple files "
                      "[default=%default]")

    parser.add_option("--output-bed", dest="output_bed", type="choice",
                      action="append",
                      choices=("all", "annotations", "segments",
                               "workspaces", "isochores", "overlap"),
                      help="output bed files [default=%default].")

    parser.add_option("--output-stats", dest="output_stats", type="choice",
                      action="append",
                      choices=("all", "annotations", "segments",
                               "workspaces", "isochores", "overlap"),
                      help="output overlap summary stats [default=%default].")

    parser.set_defaults(
        annotation_files=[],
        segment_files=[],
        workspace_files=[],
        sample_files=[],
        annotation_file=None,
        num_samples=1000,
        nbuckets=100000,
        bucket_size=1,
        counter="nucleotide-overlap",
        output_stats=[],
        output_bed=[],
        output_filename_counts=None,
        output_order="fold",
        cache=None,
        input_filename_counts=None,
        input_filename_results=None,
        pvalue_method="empirical",
        output_plots_pattern=None,
        output_samples_pattern=None,
        qvalue_method="storey",
        qvalue_lambda=None,
        qvalue_pi0_method="smoother",
        sampler="annotator",
        ignore_segment_tracks=False,
        input_filename_descriptions=None,
        conditional="unconditional",
        conditional_extension=None,
        conditional_expansion=None,
        restrict_workspace=False,
        enable_split_tracks=False,
        shift_expansion=2.0,
        shift_extension=0,
        overlap_mode="midpoint",
        number_of_genes=None,
    )

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv, add_output_options=True)

    tstart = time.time()

    # load segments
    options.segment_files = IO.expandGlobs(options.segment_files)
    options.annotation_files = IO.expandGlobs(options.annotation_files)
    options.workspace_files = IO.expandGlobs(options.workspace_files)

    # read one or more segment files
    segments = IO.readSegmentList("segments", options.segment_files, options)
    if options.ignore_segment_tracks:
        segments.merge(delete=True)
        E.info("merged all segments into one track with %i segments" %
               len(segments))

    if len(segments) > 1000:
        raise ValueError(
            "too many (%i) segment files - use track definitions or "
            "--ignore-segment-tracks" % len(segments))

    # load workspace
    workspaces = IO.readSegmentList("workspaces", options.workspace_files,
                                    options, options.enable_split_tracks)

    # intersect workspaces to build a single workspace
    E.info("collapsing workspaces")
    workspaces.collapse()

    # use merged workspace only, discard others
    workspaces.restrict("collapsed")

    workspace = workspaces["collapsed"]

    E.info("intervals loaded in %i seconds" % (time.time() - tstart))

    ############################################
    # load table mapping a gene id to annotations
    gene2annotations = IOTools.readMultiMap(
        IOTools.openFile(options.annotation_file), has_header=True)
    annotations = set([y for x in gene2annotations.values() for y in x])
    E.info("loaded %i annotations for %i genes" %
           (len(annotations), len(gene2annotations)))

    ############################################
    # load bed file with gene coordinates
    assert len(options.annotation_files) == 1
    indexed_genes = collections.defaultdict(Intersecter)
    total_genes = 0
    # number of genes per contig
    contig2ngenes = collections.defaultdict(int)
    # number of genes with a particular annotation per contig
    annotation2ngenes = collections.defaultdict(int)

    for line in IOTools.openFile(options.annotation_files[0]):
        if line.startswith("#"):
            continue
        contig, start, end, gene_id = line[:-1].split("\t")[:4]
        indexed_genes[contig].add_interval(
            Interval(int(start), int(end), gene_id))
        contig2ngenes[contig] += 1
        total_genes += 1
        try:
            for annotation in gene2annotations[gene_id]:
                annotation2ngenes[annotation] += 1
        except KeyError:
            pass

    E.info("indexed locations for %i contigs" % len(indexed_genes))

    ############################################
    description_header, descriptions, description_width = IO.readDescriptions(
        options)

    ############################################
    # compute results
    E.info("computing counts")

    results = []

    # iterate over segments
    for segment, segmentdict in segments.items():

        # genes hit by segments per annotation
        genes_hit_by_segments_with_annotations = collections.defaultdict(int)

        # genes hit by segments
        genes_hit_by_segments = 0

        for contig, ss in segmentdict.items():
            for start, end in ss:
                overlapping_genes = list(
                    indexed_genes[contig].find(start, end))
                genes_hit_by_segments += len(overlapping_genes)
                for x in overlapping_genes:
                    gene_id = x.value
                    try:
                        for annotation in gene2annotations[gene_id]:
                            genes_hit_by_segments_with_annotations[
                                annotation] += 1
                    except KeyError:
                        pass

        # N = number of genes in genome
        N = total_genes
        # n = number of genes selected by segments
        n = genes_hit_by_segments

        for annotation in annotations:
            # K = number of genes carrying annotation
            K = annotation2ngenes[annotation]
            # k = number of genes selected by segments and with annotation
            k = genes_hit_by_segments_with_annotations[annotation]

            if n == 0 or N == 0 or K == 0:
                expected = 0
                fold = 1.0
                pvalue = 1.0
            else:
                expected = float(n * K) / N
                fold = k / expected
                pvalue = scipy.stats.hypergeom.sf(k - 1, N, K, n)

            r = GENESET_RESULT._make((
                segment, annotation,
                N,
                K,
                n,
                k,
                expected,
                fold,
                pvalue,
                1.0))

            results.append(r)

    IO.outputResults(results,
                     options,
                     GENESET_RESULT._fields,
                     description_header,
                     description_width,
                     descriptions)

    E.Stop()
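
# Toy numbers (hypothetical) for the gene-set test above: with N = 20000
# genes in total, K = 150 genes carrying the annotation and n = 400 genes
# hit by segments, the expected overlap is n * K / N = 3 genes; observing
# k = 12 gives the upper tail P(X >= 12) via sf(k - 1).
def _demo_geneset_pvalue():
    import scipy.stats
    N, K, n, k = 20000, 150, 400, 12
    expected = float(n * K) / N
    fold = k / expected
    pvalue = scipy.stats.hypergeom.sf(k - 1, N, K, n)
    return expected, fold, pvalue
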
def outputResults(results,
                  options,
                  header,
                  description_header,
                  description_width,
                  descriptions,
                  format_observed="%i"):
    '''compute FDR and output results.'''

    pvalues = [x.pvalue for x in results]

    ##################################################
    # compute global fdr
    ##################################################
    E.info("computing FDR statistics")
    qvalues = Engine.getQValues(pvalues,
                                method=options.qvalue_method,
                                vlambda=options.qvalue_lambda,
                                pi0_method=options.qvalue_pi0_method)

    try:
        results = [x._replace(qvalue=qvalue)
                   for x, qvalue in zip(results, qvalues)]
        is_tuple = True
    except AttributeError:
        # not a namedtuple
        for x, qvalue in zip(results, qvalues):
            x.qvalue = qvalue
            x.format_observed = format_observed
        is_tuple = False

    counters = set([x.counter for x in results])

    for counter in counters:

        if len(counters) == 1:
            outfile = options.stdout
            output = results
        else:
            outfilename = re.sub("%s", counter, options.output_tables_pattern)
            E.info("output for counter %s goes to outfile %s" %
                   (counter, outfilename))
            outfile = IOTools.openFile(outfilename, "w")
            output = [x for x in results if x.counter == counter]

        outfile.write(
            "\t".join(list(header) + list(description_header)) + "\n")

        if options.output_order == "track":
            output.sort(key=lambda x: (x.track, x.annotation))
        elif options.output_order == "observed":
            output.sort(key=lambda x: x.observed)
        elif options.output_order == "annotation":
            output.sort(key=lambda x: (x.annotation, x.track))
        elif options.output_order == "fold":
            output.sort(key=lambda x: x.fold)
        elif options.output_order == "pvalue":
            output.sort(key=lambda x: x.pvalue)
        elif options.output_order == "qvalue":
            output.sort(key=lambda x: x.qvalue)
        else:
            raise ValueError("unknown sort order %s" % options.output_order)

        for result in output:
            if is_tuple:
                outfile.write("\t".join(map(str, result)))
            else:
                outfile.write(str(result))

            if descriptions:
                try:
                    outfile.write(
                        "\t" + "\t".join(descriptions[result.annotation]))
                except KeyError:
                    outfile.write("\t" + "\t".join([""] * description_width))
            outfile.write("\n")

        if outfile != options.stdout:
            outfile.close()
def plotResults(results, options):
    '''plot annotator results.'''

    ##################################################
    # plot histograms
    if options.output_plots_pattern and HASPLOT:

        def buildPlotFilename(options, key):
            filename = re.sub("%s", key, options.output_plots_pattern)
            filename = re.sub("[^a-zA-Z0-9-_./]", "_", filename)
            dirname = os.path.dirname(filename)
            if dirname and not os.path.exists(dirname):
                os.makedirs(dirname)
            return filename

        E.info("plotting sample stats")

        for r in results:
            plt.figure()
            k = []
            if r.track != "merged":
                k.append(r.track)
            k.append(r.annotation)
            if r.counter != "na":
                k.append(r.counter)
            key = "-".join(k)

            s = r.samples
            hist, bins = numpy.histogram(s, bins=100)

            # plot bars; use density (the former "normed" keyword was
            # removed in matplotlib 3.1)
            plt.hist(s, bins=100, density=True, label=key)

            plt.axvline(r.observed, color='r', linewidth=2)

            # plot estimated normal distribution
            sigma = r.stddev
            mu = r.expected
            plt.plot(bins,
                     1.0 / (sigma * numpy.sqrt(2 * numpy.pi)) *
                     numpy.exp(-(bins - mu) ** 2 / (2 * sigma ** 2)),
                     label="std distribution",
                     linewidth=2,
                     color='g')

            plt.legend()
            filename = buildPlotFilename(options, key)
            plt.savefig(filename)

        E.info("plotting P-value distribution")

        key = "pvalue"
        plt.figure()

        x, bins, y = plt.hist([r.pvalue for r in results],
                              bins=numpy.arange(0, 1.05, 0.025),
                              label="pvalue")

        plt.hist([r.qvalue for r in results],
                 bins=numpy.arange(0, 1.05, 0.025),
                 label="qvalue",
                 alpha=0.5)

        plt.legend()

        # hist, bins = numpy.histogram(
        #     [r.pvalue for r in Engine.iterator_results(annotator_results)],
        #     bins=20)
        # plt.plot(bins[:-1], hist, label=key)

        filename = buildPlotFilename(options, key)
        plt.savefig(filename)
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = optparse.OptionParser(
        version="%prog version: $Id: script_template.py 2871 2010-03-03 10:20:44Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option(
        "-o", "--order", dest="output_order", type="choice",
        choices=("track", "annotation", "fold", "pvalue", "qvalue",
                 "observed"),
        help="order results in output by fold, track, etc. "
        "[default=%default].")

    parser.add_option(
        "-p", "--pvalue-method", dest="pvalue_method", type="choice",
        choices=("empirical", "norm", ),
        help="type of pvalue reported [default=%default].")

    parser.add_option(
        "-q", "--qvalue-method", dest="qvalue_method", type="choice",
        choices=("storey", "BH", "bonferroni", "holm", "hommel", "hochberg",
                 "BY", "none"),
        help="method to perform multiple testing correction by controlling "
        "the fdr [default=%default].")

    parser.add_option("--qvalue-lambda", dest="qvalue_lambda", type="float",
                      help="fdr computation: lambda [default=%default].")

    parser.add_option(
        "--qvalue-pi0-method", dest="qvalue_pi0_method", type="choice",
        choices=("smoother", "bootstrap"),
        help="fdr computation: method for estimating pi0 [default=%default].")

    parser.add_option(
        "--descriptions", dest="input_filename_descriptions", type="string",
        help="filename mapping annotation terms to descriptions. "
        "If given, the output table will contain additional columns "
        "[default=%default]")

    parser.add_option(
        "--pseudo-count", dest="pseudo_count", type="float",
        help="pseudo count. The pseudo count is added to both the observed "
        "and expected overlap. Using a pseudo-count avoids gat reporting "
        "fold changes of 0 [default=%default].")

    parser.add_option("--output-plots-pattern", dest="output_plots_pattern",
                      type="string",
                      help="output pattern for plots [default=%default]")

    parser.set_defaults(
        pvalue_method="empirical",
        qvalue_method="BH",
        qvalue_lambda=None,
        qvalue_pi0_method="smoother",
        # pseudo count for fold change computation to avoid 0 fc
        pseudo_count=1.0,
        output_order="observed",
    )

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv, add_output_options=True)

    input_filenames_counts = args

    ##################################################
    E.info("received %i filenames with counts" % len(input_filenames_counts))

    ##################################################
    description_header, descriptions, description_width = IO.readDescriptions(
        options)

    all_annotator_results = []

    for input_filename_counts in input_filenames_counts:

        E.info("processing %s" % input_filename_counts)

        annotator_results = gat.fromCounts(input_filename_counts)

        ##################################################
        if options.pvalue_method != "empirical":
            E.info("updating pvalues to %s" % options.pvalue_method)
            GatEngine.updatePValues(annotator_results, options.pvalue_method)

        ##################################################
        # compute global fdr
        ##################################################
        E.info("computing FDR statistics")
        GatEngine.updateQValues(annotator_results,
                                method=options.qvalue_method,
                                vlambda=options.qvalue_lambda,
                                pi0_method=options.qvalue_pi0_method)

        all_annotator_results.append(annotator_results)

    pseudo_count = options.pseudo_count
    results = []

    if len(all_annotator_results) == 1:
        E.info("performing pairwise comparison within a single file")

        # collect all annotations
        annotations, segments = list(), set()
        for x in all_annotator_results[0]:
            segments.add(x.track)
            annotations.append(x)

        if len(segments) != 1:
            raise NotImplementedError("multiple segments of interest")

        for data1, data2 in itertools.combinations(annotations, 2):

            # note that fold changes can be very large if there are 0
            # samples; this is fine for getting the distributional params
            # (mean, stddev)
            fold_changes1 = data1.observed / (data1.samples + pseudo_count)
            fold_changes2 = data2.observed / (data2.samples + pseudo_count)

            # add a separate fc pseudo-count to avoid 0 values
            fold_changes1 += 0.0001
            fold_changes2 += 0.0001

            # The test is whether the relative fold change rfc differs from 1
            # note: rfc = fc1 / fc2 = (obs1 / exp1) / (obs2 / exp2)
            #           = obs1 / obs2 * exp2 / exp1
            # Thus, it is equivalent to test rfc = obs1/obs2 versus exp2/exp1
            #
            # Convert to log space for easier plotting
            # Move the observed fold ratio in order to get an idea of the
            # magnitude of the underlying fold change
            delta_fold = data2.fold - data1.fold
            sampled_delta_fold = numpy.log(
                fold_changes1 / fold_changes2) + delta_fold
            observed_delta_fold = 0.0 + delta_fold

            result = GatEngine.AnnotatorResult(data1.annotation,
                                               data2.annotation,
                                               "na",
                                               observed_delta_fold,
                                               sampled_delta_fold,
                                               reference=None,
                                               pseudo_count=0)

            results.append(result)

    else:
        E.info("performing pairwise comparison between multiple files")

        ##################################################
        # perform pairwise comparison
        for index1, index2 in itertools.combinations(
                range(len(input_filenames_counts)), 2):
            E.info("comparing %i and %i" % (index1, index2))
            a, b = all_annotator_results[index1], all_annotator_results[index2]

            # index results in a and b
            aa = collections.defaultdict(dict)
            for x in a:
                aa[x.track][x.annotation] = x

            bb = collections.defaultdict(dict)
            for x in b:
                bb[x.track][x.annotation] = x

            tracks_a = set(aa.keys())
            tracks_b = set(bb.keys())
            shared_tracks = tracks_a.intersection(tracks_b)
            if len(shared_tracks) == 0:
                E.warn("no shared tracks between {} and {}".format(
                    index1, index2))

            for track in sorted(shared_tracks):
                E.debug("computing results for track {}".format(track))
                # get shared annotations
                annotations1 = aa[track].keys()
                annotations2 = bb[track].keys()
                shared_annotations = list(
                    set(annotations1).intersection(set(annotations2)))
                E.info("%i shared annotations" % len(shared_annotations))

                for annotation in shared_annotations:

                    # if not annotation.startswith("Ram:"):
                    #     continue

                    data1 = aa[track][annotation]
                    data2 = bb[track][annotation]

                    # note that fold changes can be very large if there are
                    # 0 samples; this is fine for getting the distributional
                    # params (mean, stddev)
                    fold_changes1 = data1.observed / (data1.samples +
                                                      pseudo_count)
                    fold_changes2 = data2.observed / (data2.samples +
                                                      pseudo_count)

                    # add a separate fc pseudo-count to avoid 0 values
                    fold_changes1 += 0.0001
                    fold_changes2 += 0.0001

                    # The test is whether the relative fold change rfc
                    # differs from 1
                    # note: rfc = fc1 / fc2 = (obs1 / exp1) / (obs2 / exp2)
                    #           = obs1 / obs2 * exp2 / exp1
                    # Thus, it is equivalent to test rfc = obs1/obs2 versus
                    # exp2/exp1
                    #
                    # Convert to log space for easier plotting
                    # Move the observed fold ratio in order to get an idea
                    # of the magnitude of the underlying fold change
                    delta_fold = data2.fold - data1.fold
                    sampled_delta_fold = numpy.log(
                        fold_changes1 / fold_changes2) + delta_fold
                    observed_delta_fold = 0.0 + delta_fold

                    result = GatEngine.AnnotatorResult(track,
                                                       annotation,
                                                       "na",
                                                       observed_delta_fold,
                                                       sampled_delta_fold,
                                                       reference=None,
                                                       pseudo_count=0)

                    results.append(result)

    if len(results) == 0:
        E.critical("no results found")
        E.Stop()
        return

    IO.outputResults(results,
                     options,
                     GatEngine.AnnotatorResult.headers,
                     description_header,
                     description_width,
                     descriptions,
                     format_observed="%6.4f")

    IO.plotResults(results, options)

    # write footer and output benchmark information.
    E.Stop()
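
# Toy illustration (all values assumed) of the delta-fold statistic computed
# above: the sampled distribution of log(fc1/fc2), shifted by the observed
# fold difference, is presumably later summarised into an empirical p-value
# against the observed value.
def _demo_delta_fold():
    import numpy
    pseudo_count = 1.0
    observed1, observed2 = 20.0, 10.0
    samples1 = numpy.array([8.0, 10.0, 12.0])   # sampled overlaps, file 1
    samples2 = numpy.array([9.0, 10.0, 11.0])   # sampled overlaps, file 2
    fold1, fold2 = 1.8, 1.1                     # assumed observed folds
    fc1 = observed1 / (samples1 + pseudo_count) + 0.0001
    fc2 = observed2 / (samples2 + pseudo_count) + 0.0001
    delta_fold = fold2 - fold1
    sampled_delta_fold = numpy.log(fc1 / fc2) + delta_fold
    observed_delta_fold = 0.0 + delta_fold
    return observed_delta_fold, sampled_delta_fold
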
def buildSegments(options):
    '''load segments, annotations and workspace from parameters
    defined in *options*.

    The workspace will be split by isochores.

    returns segments, annotations and workspace.
    '''
    options.segment_files = expandGlobs(options.segment_files)
    options.annotation_files = expandGlobs(options.annotation_files)
    options.workspace_files = expandGlobs(options.workspace_files)
    options.sample_files = expandGlobs(options.sample_files)

    ##################################################
    # arguments sanity check
    if not options.segment_files:
        raise ValueError("please specify at least one segment file")
    if not options.annotation_files:
        raise ValueError("please specify at least one annotation file")
    if not options.workspace_files:
        raise ValueError("please specify at least one workspace file")

    # read one or more segment files
    segments = readSegmentList("segments", options.segment_files, options)
    if options.ignore_segment_tracks:
        segments.merge(delete=True)
        E.info("merged all segments into one track with %i segments" %
               len(segments))

    if segments.sum() == 0:
        E.critical("no segments in input file - run aborted")
        raise ValueError("segments file is empty - run aborted")

    if len(segments) > 1000:
        raise ValueError(
            "too many (%i) segment files - use track definitions or "
            "--ignore-segment-tracks" % len(segments))

    annotations = readSegmentList(
        "annotations", options.annotation_files, options,
        options.enable_split_tracks)

    workspaces = readSegmentList(
        "workspaces", options.workspace_files, options,
        options.enable_split_tracks)

    # intersect workspaces to build a single workspace
    E.info("collapsing workspaces")
    dumpStats(workspaces, "stats_workspaces_input", options)
    workspaces.collapse()
    dumpStats(workspaces, "stats_workspaces_collapsed", options)

    # use merged workspace only, discard others
    workspaces.restrict("collapsed")

    # build isochores or intersect annotations/segments with workspace
    if options.isochore_files:

        # read one or more isochore files
        isochores = GatEngine.IntervalCollection(name="isochores")
        E.info("%s: reading isochores from %i files" %
               ("isochores", len(options.isochore_files)))
        isochores.load(options.isochore_files)
        dumpStats(isochores, "stats_isochores_raw", options)

        # merge isochores and check if consistent (fully normalized)
        isochores.sort()

        # check that there are no overlapping segments within isochores
        isochores.check()

        # TODO: flag is_normalized not properly set
        isochores.normalize()

        # check that there are no overlapping segments between isochores

        # truncate isochores to workspace
        # crucial if isochores are larger than workspace.
        isochores.intersect(workspaces["collapsed"])

    else:
        isochores = None

    return segments, annotations, workspaces, isochores
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = optparse.OptionParser(
        version="%prog version: $Id: script_template.py 2871 2010-03-03 10:20:44Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-o", "--order", dest="output_order", type="choice",
                      choices=("track", "annotation", "fold", "pvalue",
                               "qvalue", "observed"),
                      help="order results in output by fold, track, etc. "
                      "[default=%default].")

    parser.add_option("-p", "--pvalue-method", dest="pvalue_method",
                      type="choice",
                      choices=("empirical", "norm", ),
                      help="type of pvalue reported [default=%default].")

    parser.add_option("-q", "--qvalue-method", dest="qvalue_method",
                      type="choice",
                      choices=("storey", "BH", "bonferroni", "holm", "hommel",
                               "hochberg", "BY", "none"),
                      help="method to perform multiple testing correction by "
                      "controlling the fdr [default=%default].")

    parser.add_option("--qvalue-lambda", dest="qvalue_lambda", type="float",
                      help="fdr computation: lambda [default=%default].")

    parser.add_option("--qvalue-pi0-method", dest="qvalue_pi0_method",
                      type="choice",
                      choices=("smoother", "bootstrap"),
                      help="fdr computation: method for estimating pi0 "
                      "[default=%default].")

    parser.add_option("--descriptions", dest="input_filename_descriptions",
                      type="string",
                      help="filename mapping annotation terms to descriptions. "
                      "If given, the output table will contain additional "
                      "columns [default=%default]")

    parser.add_option("--pseudo-count", dest="pseudo_count", type="float",
                      help="pseudo count. The pseudo count is added to both "
                      "the observed and expected overlap. Using a pseudo-count "
                      "avoids gat reporting fold changes of 0 "
                      "[default=%default].")

    parser.add_option("--output-plots-pattern", dest="output_plots_pattern",
                      type="string",
                      help="output pattern for plots [default=%default]")

    parser.set_defaults(
        pvalue_method="empirical",
        qvalue_method="BH",
        qvalue_lambda=None,
        qvalue_pi0_method="smoother",
        # pseudo count for fold change computation to avoid 0 fc
        pseudo_count=1.0,
        output_order="observed",
    )

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv, add_output_options=True)

    input_filenames_counts = args

    ##################################################
    E.info("received %i filenames with counts" % len(input_filenames_counts))

    ##################################################
    description_header, descriptions, description_width = IO.readDescriptions(
        options)

    all_annotator_results = []

    for input_filename_counts in input_filenames_counts:

        E.info("processing %s" % input_filename_counts)

        annotator_results = gat.fromCounts(input_filename_counts)

        ##################################################
        if options.pvalue_method != "empirical":
            E.info("updating pvalues to %s" % options.pvalue_method)
            GatEngine.updatePValues(annotator_results, options.pvalue_method)

        ##################################################
        # compute global fdr
        ##################################################
        E.info("computing FDR statistics")
        GatEngine.updateQValues(annotator_results,
                                method=options.qvalue_method,
                                vlambda=options.qvalue_lambda,
                                pi0_method=options.qvalue_pi0_method)

        all_annotator_results.append(annotator_results)

    pseudo_count = options.pseudo_count
    results = []

    if len(all_annotator_results) == 1:
        E.info("performing pairwise comparison within a file")

        # collect all annotations
        annotations, segments = list(), set()
        for x in all_annotator_results[0]:
            segments.add(x.track)
            annotations.append(x)

        if len(segments) != 1:
            raise NotImplementedError("multiple segments of interest")

        for data1, data2 in itertools.combinations(annotations, 2):

            # note that fold changes can be very large if there are 0 samples.
            # this is fine for getting the distributional params (mean,
            # stddev)
            fold_changes1 = data1.observed / (data1.samples + pseudo_count)
            fold_changes2 = data2.observed / (data2.samples + pseudo_count)

            # add a separate fc pseudo-count to avoid 0 values
            fold_changes1 += 0.0001
            fold_changes2 += 0.0001

            # The test is whether the relative fold change rfc differs from 1.
            # note: rfc = fc1 / fc2 = (obs1 / exp1) / (obs2 / exp2)
            #           = obs1 / obs2 * exp2 / exp1
            # Thus, it is equivalent to test rfc = obs1/obs2 versus exp2/exp1.
            #
            # Convert to log space for easier plotting.
            # Shift by the observed fold ratio in order to get an idea of the
            # magnitude of the underlying fold change.
            # (a worked toy example follows this function)
            delta_fold = data2.fold - data1.fold
            sampled_delta_fold = numpy.log(
                fold_changes1 / fold_changes2) + delta_fold
            observed_delta_fold = 0.0 + delta_fold

            result = GatEngine.AnnotatorResult(data1.annotation,
                                               data2.annotation,
                                               "na",
                                               observed_delta_fold,
                                               sampled_delta_fold,
                                               reference=None,
                                               pseudo_count=0)
            results.append(result)

    else:
        E.info("performing pairwise comparison between files")

        ##################################################
        # perform pairwise comparison
        for index1, index2 in itertools.combinations(
                range(len(input_filenames_counts)), 2):
            E.info("comparing %i and %i" % (index1, index2))
            a, b = all_annotator_results[index1], all_annotator_results[index2]

            # index results in a and b
            aa = collections.defaultdict(dict)
            for x in a:
                aa[x.track][x.annotation] = x

            bb = collections.defaultdict(dict)
            for x in b:
                bb[x.track][x.annotation] = x

            if len(aa.keys()) != 1 or len(bb.keys()) != 1:
                raise NotImplementedError("multiple segments of interest")

            track = "merged"

            # get shared annotations
            annotations1 = aa[track].keys()
            annotations2 = bb[track].keys()
            shared_annotations = list(
                set(annotations1).intersection(set(annotations2)))
            E.info("%i shared annotations" % len(shared_annotations))

            for annotation in shared_annotations:

                # if not annotation.startswith("Ram:"): continue

                data1 = aa[track][annotation]
                data2 = bb[track][annotation]

                # same fold-change comparison as in the within-file branch
                # above
                fold_changes1 = data1.observed / (data1.samples + pseudo_count)
                fold_changes2 = data2.observed / (data2.samples + pseudo_count)

                fold_changes1 += 0.0001
                fold_changes2 += 0.0001

                delta_fold = data2.fold - data1.fold
                sampled_delta_fold = numpy.log(
                    fold_changes1 / fold_changes2) + delta_fold
                observed_delta_fold = 0.0 + delta_fold

                result = GatEngine.AnnotatorResult(track,
                                                   annotation,
                                                   "na",
                                                   observed_delta_fold,
                                                   sampled_delta_fold,
                                                   reference=None,
                                                   pseudo_count=0)
                results.append(result)

    if len(results) == 0:
        E.critical("no results found")
        E.Stop()
        return

    IO.outputResults(results,
                     options,
                     GatEngine.AnnotatorResult.headers,
                     description_header,
                     description_width,
                     descriptions,
                     format_observed="%6.4f")

    IO.plotResults(results, options)

    # write footer and output benchmark information.
    E.Stop()
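# Worked toy example for the delta-fold comparison above (made-up numbers,
# not part of the pipeline): the sampled distribution of log relative fold
# changes is shifted by the observed difference in fold enrichment.
def _example_delta_fold():
    import numpy
    pseudo_count = 1.0
    observed1, observed2 = 120.0, 80.0
    samples1 = numpy.array([90.0, 100.0, 110.0])  # sampled overlaps, file 1
    samples2 = numpy.array([85.0, 95.0, 105.0])   # sampled overlaps, file 2
    fold1, fold2 = 1.2, 0.9                       # observed fold enrichments

    fc1 = observed1 / (samples1 + pseudo_count) + 0.0001
    fc2 = observed2 / (samples2 + pseudo_count) + 0.0001

    delta_fold = fold2 - fold1
    sampled_delta_fold = numpy.log(fc1 / fc2) + delta_fold
    observed_delta_fold = 0.0 + delta_fold
    return observed_delta_fold, sampled_delta_fold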
def applyIsochores(segments, annotations, workspaces,
                   options,
                   isochores=None,
                   truncate_segments_to_workspace=False,
                   truncate_workspace_to_annotations=False,
                   restrict_workspace=False):
    '''apply isochores to segments and annotations.

    Segments and annotations are filtered in place to keep only those
    overlapping the workspace.

    If *isochores* are given, isochores are applied.

    If *truncate_segments_to_workspace* is set, segments are truncated
    to the workspace.

    If *restrict_workspace* is set, the workspace is confined to those
    parts that overlap both a segment and an annotation.

    If *truncate_workspace_to_annotations* is set, the workspace is
    truncated to keep only those parts that overlap annotations.

    returns a workspace divided into isochores.
    '''

    if isochores:
        # intersect isochores and workspaces, segments and annotations.
        # workspace and annotations are truncated,
        # with segments it is optional.
        E.info("adding isochores to workspace")
        workspaces.toIsochores(isochores, truncate=True)
        annotations.toIsochores(isochores, truncate=True)
        segments.toIsochores(
            isochores, truncate=options.truncate_segments_to_workspace)

        if workspaces.sum() == 0:
            raise ValueError("isochores and workspaces do not overlap")
        if annotations.sum() == 0:
            raise ValueError("isochores and annotations do not overlap")
        if segments.sum() == 0:
            raise ValueError("isochores and segments do not overlap")

        dumpStats(workspaces, "stats_workspaces_isochores", options)
        dumpStats(annotations, "stats_annotations_isochores", options)
        dumpStats(segments, "stats_segments_isochores", options)

        dumpBed(workspaces, "workspaces_isochores", options)
        dumpBed(annotations, "annotations_isochores", options)
        dumpBed(segments, "segments_isochores", options)

    else:
        # intersect workspace and segments/annotations.
        # annotations and segments are truncated by workspace.
        # (see the interval sketch following this function for the
        # intersect/filter semantics)
        if options.truncate_segments_to_workspace:
            segments.intersect(workspaces["collapsed"])
        else:
            segments.filter(workspaces["collapsed"])

        annotations.intersect(workspaces["collapsed"])

        dumpStats(annotations, "stats_annotations_truncated", options)
        dumpStats(segments, "stats_segments_truncated", options)

    workspace = workspaces["collapsed"]

    if restrict_workspace:

        E.info("restricting workspace")
        # this is very cumbersome - refactor merge and collapse
        # to return an IntervalDictionary instead of adding it
        # to the list of tracks
        for x in (segments, annotations):
            if "merged" in x:
                workspace.filter(x["merged"])
            else:
                x.merge()
                workspace.filter(x["merged"])
                del x["merged"]

        dumpStats(workspaces, "stats_workspaces_restricted", options)

    if truncate_workspace_to_annotations:

        E.info("truncating workspace to annotations")
        annotations.merge()
        annotations["merged"].normalize()
        workspace.intersect(annotations["merged"])
        del annotations["merged"]

        dumpStats(workspaces, "stats_workspaces_truncated", options)

    # segments.dump(open("segments_dump.bed", "w"))
    # workspaces.dump(open("workspaces_dump.bed", "w"))

    # output overlap stats
    # output segment densities per workspace
    if "overlap" in options.output_stats or \
       "all" in options.output_stats:
        for track in segments.tracks:
            workspaces.outputOverlapStats(
                E.openOutputFile("overlap_%s" % track),
                segments[track])

    return workspace
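# Illustrative sketch of the two overlap semantics used above, on plain
# (start, end) tuples rather than the SegmentList API (the helper is
# hypothetical): intersect() truncates intervals to the overlapping parts,
# while filter() keeps any interval touching the other set intact.
def _example_intersect_vs_filter():
    segments = [(0, 10), (20, 30)]
    workspace = [(5, 25)]

    def overlap(a, b):
        return max(a[0], b[0]) < min(a[1], b[1])

    # intersect: truncate to overlapping parts -> [(5, 10), (20, 25)]
    intersected = [(max(s[0], w[0]), min(s[1], w[1]))
                   for s in segments for w in workspace if overlap(s, w)]

    # filter: keep whole segments that overlap -> [(0, 10), (20, 30)]
    filtered = [s for s in segments
                if any(overlap(s, w) for w in workspace)]

    return intersected, filtered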
def computeSample(args):
    '''compute a single sample.
    '''

    workdata, samples_outfile, metrics_outfile, lock = args

    (track, sample_id, sampler,
     segs, annotations, contig_annotations,
     workspace, contig_workspace,
     counters) = workdata

    # E.debug("track=%s, sample=%s - started" % (track, str(sample_id)))

    counts = E.Counter()

    sample_id = str(sample_id)

    outf_samples = samples_outfile

    if samples_outfile:
        if lock:
            # multi-process case: samples_outfile is a filename, append
            # under the lock
            lock.acquire()
            outf_samples = IOTools.openFile(samples_outfile, "a")

        outf_samples.write("track name=%s\n" % sample_id)

        if lock:
            outf_samples.close()
            lock.release()

    sample = Engine.IntervalDictionary()

    for isochore in list(segs.keys()):

        counts.pairs += 1

        # skip empty isochores
        if workspace[isochore].isEmpty or segs[isochore].isEmpty:
            counts.skipped += 1
            continue

        counts.sampled += 1
        r = sampler.sample(segs[isochore], workspace[isochore])

        # TODO: activate
        # self.outputSampleStats(sample_id, isochore, r)

        sample.add(isochore, r)

        # save sample
        if samples_outfile:
            if lock:
                lock.acquire()
                outf_samples = IOTools.openFile(samples_outfile, "a")

            for start, end in r:
                outf_samples.write("%s\t%i\t%i\n" % (isochore, start, end))

            if lock:
                outf_samples.close()
                lock.release()

    # re-combine isochores
    # adjacent intervals are merged.
    sample.fromIsochores()

    if metrics_outfile:
        if lock:
            lock.acquire()
            outf = IOTools.openFile(metrics_outfile, "a")
        else:
            outf = metrics_outfile

        IO.outputMetrics(outf, sample, workspace, track, sample_id)

        if lock:
            outf.close()
            lock.release()

    counts_per_track = [collections.defaultdict(float) for x in counters]
    # compute counts for each counter
    for counter_id, counter in enumerate(counters):
        # TODO: choose aggregator
        for annotation in annotations.tracks:
            counts_per_track[counter_id][annotation] = sum([
                counter(sample[contig],
                        contig_annotations[annotation][contig],
                        contig_workspace[contig])
                for contig in list(sample.keys())])

    # E.debug("track=%s, sample=%s - completed" % (track, str(sample_id)))

    return counts_per_track
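# Sketch of the locking pattern used above (assumption: with multiple
# worker processes, samples_outfile/metrics_outfile are passed as file
# *names* and each worker appends under a multiprocessing.Lock; in the
# single-process case they are already-open handles and lock is None).
def _example_locked_append(filename, lines, lock=None):
    # lock, if given, would be a multiprocessing.Lock()
    if lock:
        lock.acquire()
    try:
        with open(filename, "a") as outf:
            for line in lines:
                outf.write(line + "\n")
    finally:
        if lock:
            lock.release()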
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv:
        argv = sys.argv

    parser = gat.buildParser(usage=globals()["__doc__"])

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv, add_output_options=True)

    ##################################################
    description_header, descriptions, description_width = IO.readDescriptions(
        options)

    ##################################################
    size_pos, size_segment = GatSegmentList.getSegmentSize()
    E.debug("sizes: pos=%i segment=%i, max_coord=%i" %
            (size_pos, size_segment, 2 ** (8 * size_pos)))

    ##################################################
    # set default counter
    if not options.counters:
        options.counters.append("nucleotide-overlap")

    ##################################################
    if options.output_tables_pattern is not None:
        if "%s" not in options.output_tables_pattern:
            raise ValueError(
                "output_tables_pattern should contain at least one '%s'")

    if options.output_samples_pattern is not None:
        if "%s" not in options.output_samples_pattern:
            raise ValueError(
                "output_samples_pattern should contain at least one '%s'")

    if options.output_counts_pattern is not None:
        if "%s" not in options.output_counts_pattern:
            raise ValueError(
                "output_counts_pattern should contain at least one '%s'")

    ##################################################
    # read fold changes that results should be compared with
    if options.null != "default":
        if not os.path.exists(options.null):
            raise OSError("file %s not found" % options.null)
        E.info("reading reference results from %s" % options.null)
        options.reference = IO.readAnnotatorResults(options.null)
    else:
        options.reference = None

    if options.input_filename_counts:
        # use pre-computed counts
        annotator_results = GatEngine.fromCounts(
            options.input_filename_counts)

    elif options.input_filename_results:
        # use previous results (re-computes fdr)
        E.info("reading gat results from %s" % options.input_filename_results)
        annotator_results = IO.readAnnotatorResults(
            options.input_filename_results)

    else:
        # do full gat analysis
        annotator_results = fromSegments(options, args)

    ##################################################
    if options.pvalue_method != "empirical":
        E.info("updating pvalues to %s" % options.pvalue_method)
        GatEngine.updatePValues(annotator_results, options.pvalue_method)

    ##################################################
    # output
    IO.outputResults(annotator_results,
                     options,
                     GatEngine.AnnotatorResultExtended.headers,
                     description_header,
                     description_width,
                     descriptions)

    IO.plotResults(annotator_results, options)

    # write footer and output benchmark information.
    E.Stop()
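# For intuition only (not the engine's implementation): an "empirical"
# p-value as selected by --pvalue-method is conventionally the fraction of
# samples at least as extreme as the observation, with a +1 correction so
# it is never exactly 0. Sketch for the enrichment tail:
def _example_empirical_pvalue(observed, samples):
    import numpy
    samples = numpy.asarray(samples)
    at_least_as_extreme = numpy.sum(samples >= observed)
    return (at_least_as_extreme + 1.0) / (len(samples) + 1.0)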
def sample(self, track, counts, counters, segs,
           annotations, workspace, outfiles):
    '''conditional sampling - sample using only those parts of the
    workspace that contain both a segment and an annotation.

    returns a dictionary with counts per track.
    '''

    E.info("performing conditional sampling")
    counts_per_track = [collections.defaultdict(list) for x in counters]

    # rebuild non-isochore annotations and workspace
    contig_annotations = annotations.clone()
    contig_annotations.fromIsochores()
    contig_annotations.setName("contig_" + annotations.getName())

    contig_workspace = workspace.clone()
    contig_workspace.fromIsochores()

    E.info("setting up shared data for multi-processing")
    annotations.share()
    contig_annotations.share()
    contig_workspace.share("contig_workspace")

    E.info("workspace without conditioning: %i segments, %i nucleotides" %
           (workspace.counts(), workspace.sum()))

    if workspace.sum() == 0:
        E.warn("empty workspace - no computation performed")
        return None

    # compute samples conditionally - need to proceed by annotation
    for annoid, annotation in enumerate(annotations.tracks):

        annos = annotations[annotation]

        temp_segs, temp_annotations, temp_workspace = \
            self.workspace_generator(segs, annos, workspace)

        # set up sharing
        temp_segs.share("generated_segments")
        temp_workspace.share("generated_workspace")

        E.info("workspace for annotation %s: %i segments, %i nucleotides" %
               (annotation, temp_workspace.counts(), temp_workspace.sum()))

        work = [WorkData('_'.join((track, str(annoid))),
                         x,
                         self.sampler,
                         temp_segs,
                         annotations,
                         contig_annotations,
                         temp_workspace,
                         contig_workspace,
                         counters,
                         ) for x in range(self.num_samples)]

        E.info("sampling for annotation '%s' started" % annotation)
        results = self.computeSamples(work)
        E.info("sampling for annotation '%s' completed" % annotation)

        for result in results:
            for counter_id, counter in enumerate(counters):
                counts_per_track[counter_id][annotation].append(
                    result[counter_id][annotation])

    return counts_per_track
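# Sketch of the "segment-centered" conditioning idea implemented by the
# workspace generators used above (illustrative, on plain intervals;
# parameter names are hypothetical): the permissible workspace is the union
# of windows around each observed segment, so samples are placed only in
# the neighbourhood of the observed segments.
def _example_segment_centered_workspace(segments, extension):
    # segments: list of (start, end); extension: bases added on both sides
    windows = sorted((max(0, start - extension), end + extension)
                     for start, end in segments)
    # merge overlapping windows
    merged = []
    for start, end in windows:
        if merged and start <= merged[-1][1]:
            merged[-1] = (merged[-1][0], max(merged[-1][1], end))
        else:
            merged.append((start, end))
    return merged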
def fromSegments(options, args):
    '''run analysis from segment files.

    This is the most common use case.
    '''
    tstart = time.time()

    # build segments
    segments, annotations, workspaces, isochores = IO.buildSegments(options)

    E.info("intervals loaded in %i seconds" % (time.time() - tstart))

    # open various additional output files
    outfiles = {}
    for section in ("sample",
                    "segment_metrics",
                    "sample_metrics",
                    ):
        if section in options.output_stats or \
           "all" in options.output_stats or \
           len([x for x in options.output_stats
                if re.search(x, section)]) > 0:
            outfiles[section] = E.openOutputFile(section)

    if 'sample_metrics' in outfiles:
        outfiles['sample_metrics'].write(
            "track\tsection\tmetric\t%s\n" %
            "\t".join(Stats.Summary().getHeaders()))

    # filter segments by workspace
    workspace = IO.applyIsochores(
        segments, annotations, workspaces,
        options, isochores,
        truncate_segments_to_workspace=options.truncate_segments_to_workspace,
        truncate_workspace_to_annotations=options.truncate_workspace_to_annotations,
        restrict_workspace=options.restrict_workspace)

    # check memory requirements
    # previous algorithm: memory requirements if all samples are stored
    # counts = segments.countsPerTrack()
    # max_counts = max(counts.values())
    # memory = 8 * 2 * options.num_samples * max_counts * len(workspace)

    # initialize sampler
    if options.sampler == "annotator":
        sampler = Engine.SamplerAnnotator(bucket_size=options.bucket_size,
                                          nbuckets=options.nbuckets)
    elif options.sampler == "shift":
        sampler = Engine.SamplerShift(radius=options.shift_expansion,
                                      extension=options.shift_extension)
    elif options.sampler == "segments":
        sampler = Engine.SamplerSegments()
    elif options.sampler == "local-permutation":
        sampler = Engine.SamplerLocalPermutation()
    elif options.sampler == "global-permutation":
        sampler = Engine.SamplerGlobalPermutation()
    elif options.sampler == "brute-force":
        sampler = Engine.SamplerBruteForce()
    elif options.sampler == "uniform":
        sampler = Engine.SamplerUniform()
    else:
        raise ValueError("unknown sampler '%s'" % options.sampler)

    # initialize counters
    counters = []
    for counter in options.counters:
        if counter == "nucleotide-overlap":
            counters.append(Engine.CounterNucleotideOverlap())
        elif counter == "nucleotide-density":
            counters.append(Engine.CounterNucleotideDensity())
        elif counter == "segment-overlap":
            counters.append(Engine.CounterSegmentOverlap())
        elif counter == "annotation-overlap":
            counters.append(Engine.CounterAnnotationOverlap())
        elif counter == "segment-midoverlap":
            counters.append(Engine.CounterSegmentMidpointOverlap())
        elif counter == "annotation-midoverlap":
            counters.append(Engine.CounterAnnotationMidpointOverlap())
        else:
            raise ValueError("unknown counter '%s'" % counter)

    # initialize workspace generator
    if options.conditional == "unconditional":
        workspace_generator = Engine.UnconditionalWorkspace()
    elif options.conditional == "cooccurance":
        workspace_generator = Engine.ConditionalWorkspaceCooccurance()
    elif options.conditional == "annotation-centered":
        if options.conditional_expansion is None:
            raise ValueError(
                "please specify either --conditional-expansion or "
                "--conditional-extension")
        workspace_generator = Engine.ConditionalWorkspaceAnnotationCentered(
            options.conditional_extension,
            options.conditional_expansion)
    elif options.conditional == "segment-centered":
        if options.conditional_expansion is None:
            raise ValueError(
                "please specify either --conditional-expansion or "
                "--conditional-extension")
        workspace_generator = Engine.ConditionalWorkspaceSegmentCentered(
            options.conditional_extension,
            options.conditional_expansion)
    else:
        raise ValueError("unknown conditional workspace '%s'" %
                         options.conditional)

    # check if reference is complete
    if options.reference:
        for track in segments.tracks:
            if track not in options.reference:
                raise ValueError("missing track '%s' in reference" % track)
            r = options.reference[track]
            for annotation in annotations.tracks:
                if annotation not in r:
                    raise ValueError(
                        "missing annotation '%s' in annotations for "
                        "track='%s'" % (annotation, track))

    # compute
    annotator_results = gat.run(
        segments,
        annotations,
        workspace,
        sampler,
        counters,
        workspace_generator=workspace_generator,
        num_samples=options.num_samples,
        cache=options.cache,
        outfiles=outfiles,
        output_counts_pattern=options.output_counts_pattern,
        output_samples_pattern=options.output_samples_pattern,
        sample_files=options.sample_files,
        conditional=options.conditional,
        conditional_extension=options.conditional_extension,
        reference=options.reference,
        pseudo_count=options.pseudo_count,
        num_threads=options.num_threads)

    return annotator_results
def main(argv=None):

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = optparse.OptionParser(
        version="%prog version: $Id: script_template.py 2871 2010-03-03 10:20:44Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-a", "--annotation-file", "--annotations",
                      dest="annotation_files", type="string", action="append",
                      help="filename with annotations [default=%default].")

    parser.add_option("-s", "--segment-file", "--segments",
                      dest="segment_files", type="string", action="append",
                      help="filename with segments. Also accepts a glob in "
                      "parentheses [default=%default].")

    parser.add_option("-w", "--workspace-file", "--workspace",
                      dest="workspace_files", type="string", action="append",
                      help="filename with workspace segments. Also accepts a "
                      "glob in parentheses [default=%default].")

    parser.add_option("-i", "--isochore-file", "--isochores",
                      dest="isochore_files", type="string", action="append",
                      help="filename with isochore segments. Also accepts a "
                      "glob in parentheses [default=%default].")

    parser.add_option("-o", "--order", dest="output_order", type="choice",
                      choices=("track", "annotation", "fold",
                               "pvalue", "qvalue"),
                      help="order results in output by fold, track, etc. "
                      "[default=%default].")

    parser.add_option("-q", "--qvalue-method", dest="qvalue_method",
                      type="choice",
                      choices=("storey", "BH", "bonferroni", "holm", "hommel",
                               "hochberg", "BY", "none"),
                      help="method to perform multiple testing correction by "
                      "controlling the fdr [default=%default].")

    parser.add_option("--qvalue-lambda", dest="qvalue_lambda", type="float",
                      help="fdr computation: lambda [default=%default].")

    parser.add_option("--qvalue-pi0-method", dest="qvalue_pi0_method",
                      type="choice",
                      choices=("smoother", "bootstrap"),
                      help="fdr computation: method for estimating pi0 "
                      "[default=%default].")

    parser.add_option("--descriptions", dest="input_filename_descriptions",
                      type="string",
                      help="filename mapping annotation terms to descriptions. "
                      "If given, the output table will contain additional "
                      "columns [default=%default]")

    parser.add_option("--ignore-segment-tracks", dest="ignore_segment_tracks",
                      action="store_true",
                      help="ignore segment tracks - all segments belong to "
                      "one track [default=%default]")

    parser.add_option("--enable-split-tracks", dest="enable_split_tracks",
                      action="store_true",
                      help="permit the same track to be in multiple files "
                      "[default=%default]")

    parser.add_option("--output-bed", dest="output_bed", type="choice",
                      action="append",
                      choices=("all", "annotations", "segments",
                               "workspaces", "isochores", "overlap"),
                      help="output bed files [default=%default].")

    parser.add_option("--output-stats", dest="output_stats", type="choice",
                      action="append",
                      choices=("all", "annotations", "segments",
                               "workspaces", "isochores", "overlap"),
                      help="output overlap summary stats [default=%default].")

    parser.add_option("--restrict-workspace", dest="restrict_workspace",
                      action="store_true",
                      help="restrict workspace to those segments that contain "
                      "both track and annotations [default=%default]")

    parser.add_option("--counter", dest="counters", type="choice",
                      action="append",
                      choices=("binom", "hyperg"),
                      help="counter to use [default=%default].")

    parser.add_option("--output-tables-pattern", dest="output_tables_pattern",
                      type="string",
                      help="output pattern for result tables. Used if there "
                      "are multiple counters [default=%default].")

    parser.set_defaults(
        annotation_files=[],
        segment_files=[],
        workspace_files=[],
        sample_files=[],
        counters=[],
        output_stats=[],
        output_bed=[],
        output_tables_pattern="%s.tsv.gz",
        output_order="fold",
        input_filename_counts=None,
        input_filename_results=None,
        pvalue_method="empirical",
        output_plots_pattern=None,
        output_samples_pattern=None,
        qvalue_method="storey",
        qvalue_lambda=None,
        qvalue_pi0_method="smoother",
        ignore_segment_tracks=False,
        input_filename_descriptions=None,
        conditional="unconditional",
        conditional_extension=None,
        conditional_expansion=None,
        restrict_workspace=False,
        enable_split_tracks=False,
        shift_expansion=2.0,
        shift_extension=0,
        overlap_mode="midpoint",
        truncate_workspace_to_annotations=False,
        truncate_segments_to_workspace=False
    )

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv, add_output_options=True)

    tstart = time.time()

    if len(options.counters) == 0:
        options.counters.append("binom")

    ############################################
    segments, annotations, workspaces, isochores = IO.buildSegments(options)
    E.info("intervals loaded in %i seconds" % (time.time() - tstart))

    # filter segments by workspace
    workspace = IO.applyIsochores(
        segments, annotations, workspaces, options, isochores)

    ############################################
    description_header, descriptions, description_width = IO.readDescriptions(
        options)

    ############################################
    # compute per contig

    # compute bases covered by workspace
    workspace2basecoverage, isochores = {}, []
    for contig, ww in workspace.items():
        workspace2basecoverage[contig] = ww.sum()
        isochores.append(contig)

    # compute percentage of bases covered by annotations in workspace
    # per isochore
    annotation2basecoverage = collections.defaultdict(dict)
    for annotation, aa in annotations.items():
        for isochore, a in aa.items():
            # need to truncate to workspace?
            annotation2basecoverage[annotation][isochore] = a.sum()

    results_per_contig = collections.defaultdict(list)

    E.info("computing counts per isochore")

    # results per isochore
    def emptyResult(segment, annotation, isochore,
                    counter,
                    nsegments_in_workspace,
                    basecoverage_annotation,
                    basecoverage_workspace):
        return GREAT_RESULT._make((
            segment, annotation, isochore,
            counter,
            0,  # observed
            0,  # expected
            nsegments_in_workspace,
            0,  # nannotations_in_workspace
            0,  # nsegments_overlapping_annotation
            0,  # nannotations_overlapping_segments
            0,  # basecoverage_intersection
            0,  # basecoverage_segments
            basecoverage_annotation,
            basecoverage_workspace,
            0.0,
            1.0,
            1.0,
            1.0))

    for isochore in isochores:

        basecoverage_workspace = workspace2basecoverage[isochore]

        # iterate over all segment tracks
        for segment, segmentdict in segments.items():

            try:
                ss = segmentdict[isochore]
                # select segments overlapping workspace
                segments_in_workspace = GatSegmentList.SegmentList(clone=ss)
                segments_in_workspace.intersect(workspace[isochore])
                # number of segments in workspace
                nsegments_in_workspace = len(segments_in_workspace)
                basecoverage_segments = segments_in_workspace.sum()
            except KeyError:
                ss = None
                nsegments_in_workspace = 0
                basecoverage_segments = 0

            for annotation, annotationdict in annotations.items():

                # if annotation != "GO:0030957": continue

                try:
                    aa = annotationdict[isochore]
                except KeyError:
                    aa = None

                # p_A: proportion of bases covered by annotation
                try:
                    basecoverage_annotation = annotation2basecoverage[
                        annotation][isochore]
                except KeyError:
                    basecoverage_annotation = 0

                if ss is None or aa is None:
                    for counter in options.counters:
                        results_per_contig[
                            (counter, segment, annotation)].append(
                                emptyResult(segment, annotation,
                                            isochore, counter,
                                            nsegments_in_workspace,
                                            basecoverage_annotation,
                                            basecoverage_workspace))
                    continue

                # select segments overlapping annotation
                segments_overlapping_annotation = GatSegmentList.SegmentList(
                    clone=ss)
                segments_overlapping_annotation.intersect(
                    annotations[annotation][isochore])
                # number of segments overlapping annotation
                nsegments_overlapping_annotation = ss.intersectionWithSegments(
                    annotations[annotation][isochore],
                    mode=options.overlap_mode)

                # number of nucleotides at the intersection of segments,
                # annotation and workspace
                basecoverage_intersection = \
                    segments_overlapping_annotation.sum()

                annotations_overlapping_segments = GatSegmentList.SegmentList(
                    clone=aa)
                annotations_overlapping_segments.intersect(ss)
                nannotations_overlapping_segments = len(
                    annotations_overlapping_segments)

                nannotations_in_workspace = len(aa)
                if nannotations_in_workspace == 0:
                    for counter in options.counters:
                        results_per_contig[
                            (counter, segment, annotation)].append(
                                emptyResult(segment, annotation,
                                            isochore, counter,
                                            nsegments_in_workspace,
                                            basecoverage_annotation,
                                            basecoverage_workspace))
                    continue

                fraction_coverage_annotation = basecoverage_annotation / \
                    float(basecoverage_workspace)
                fraction_hit_annotation = float(
                    nannotations_overlapping_segments) / \
                    nannotations_in_workspace

                for counter in options.counters:
                    if counter.startswith("binom"):
                        # GREAT binomial probability over "regions"
                        # n = number of genomic regions
                        #   = nsegments_in_workspace
                        # p = fraction of genome annotated by annotation
                        #   = fraction_coverage_annotation
                        # k = genomic regions hitting the annotation
                        #   = nsegments_overlapping_annotation
                        # sf = survival function = 1 - cdf
                        # probability of observing >= k successes in a
                        # sample of n where the probability of success is p.
                        # (a numeric example follows this function)
                        pvalue = scipy.stats.binom.sf(
                            nsegments_overlapping_annotation - 1,
                            nsegments_in_workspace,
                            fraction_coverage_annotation)
                        expected = fraction_coverage_annotation * \
                            nsegments_in_workspace
                        observed = nsegments_overlapping_annotation
                    elif counter.startswith("hyperg"):
                        # hypergeometric probability over nucleotides,
                        # sampling without replacement
                        # x = observed number of nucleotides in the overlap
                        #     of segments, annotations and workspace
                        # M = number of nucleotides in workspace
                        # n = number of nucleotides in annotations
                        #     (and workspace)
                        # N = number of nucleotides in segments
                        #     (and workspace)
                        # P-value of obtaining > x overlapping nucleotides.
                        rv = scipy.stats.hypergeom(basecoverage_workspace,
                                                   basecoverage_annotation,
                                                   basecoverage_segments)
                        pvalue = rv.sf(basecoverage_intersection)
                        expected = rv.mean()
                        observed = basecoverage_intersection

                    if expected != 0:
                        fold = float(observed) / expected
                    else:
                        fold = 1.0

                    r = GREAT_RESULT._make((
                        segment, annotation, isochore,
                        counter,
                        observed,
                        expected,
                        nsegments_in_workspace,
                        nannotations_in_workspace,
                        nsegments_overlapping_annotation,
                        nannotations_overlapping_segments,
                        basecoverage_intersection,
                        basecoverage_segments,
                        basecoverage_annotation,
                        basecoverage_workspace,
                        fraction_coverage_annotation,
                        fold,
                        pvalue,
                        1.0))
                    # print("\t".join(map(str, r)))
                    results_per_contig[
                        (counter, segment, annotation)].append(r)

    E.info("merging counts per isochore")

    # compute sums
    results = []

    for niteration, pair in enumerate(results_per_contig.items()):

        counter, segment, annotation = pair[0]
        data = pair[1]

        nsegments_in_workspace = sum(
            [x.nsegments_in_workspace for x in data])
        nsegments_overlapping_annotation = sum([x.observed for x in data])
        nannotations_in_workspace = sum(
            [x.nannotations_in_workspace for x in data])
        nannotations_overlapping_segments = sum(
            [x.nannotations_overlapping_segments for x in data])

        basecoverage_intersection = sum(
            [x.basecoverage_intersection for x in data])
        basecoverage_segments = sum(
            [x.basecoverage_segments for x in data])
        basecoverage_annotation = sum(
            [x.basecoverage_annotation for x in data])
        basecoverage_workspace = sum(
            [x.basecoverage_workspace for x in data])

        fraction_coverage_annotation = basecoverage_annotation / \
            float(basecoverage_workspace)

        if counter.startswith("binom"):
            pvalue = scipy.stats.binom.sf(
                nsegments_overlapping_annotation - 1,
                nsegments_in_workspace,
                fraction_coverage_annotation)
            expected = fraction_coverage_annotation * nsegments_in_workspace
            observed = nsegments_overlapping_annotation
        elif counter.startswith("hyperg"):
            rv = scipy.stats.hypergeom(basecoverage_workspace,
                                       basecoverage_annotation,
                                       basecoverage_segments)
            pvalue = rv.sf(basecoverage_intersection)
            expected = rv.mean()
            observed = basecoverage_intersection

        if expected != 0:
            fold = float(observed) / expected
        else:
            fold = 1.0

        r = GREAT_RESULT._make((
            segment, annotation, "all",
            counter,
            observed,
            expected,
            nsegments_in_workspace,
            nannotations_in_workspace,
            nsegments_overlapping_annotation,
            nannotations_overlapping_segments,
            basecoverage_intersection,
            basecoverage_segments,
            basecoverage_annotation,
            basecoverage_workspace,
            fraction_coverage_annotation,
            fold,
            pvalue,
            1.0))

        results.append(r)

    IO.outputResults(results,
                     options,
                     GREAT_RESULT._fields,
                     description_header,
                     description_width,
                     descriptions)

    E.Stop()
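# Numeric example for the two counters above, with made-up numbers; both
# calls use only documented scipy.stats APIs.
def _example_great_pvalues():
    import scipy.stats

    # binomial counter: 500 segments in the workspace, 2% of the workspace
    # annotated, 25 segments hitting the annotation.
    n, p, k = 500, 0.02, 25
    pvalue_binom = scipy.stats.binom.sf(k - 1, n, p)  # P(X >= k)
    expected_binom = p * n                            # = 10.0

    # hypergeometric counter: workspace of 1,000,000 bases, 20,000 of them
    # annotated, 50,000 covered by segments, 1,500 bases in the three-way
    # intersection.
    M, n_ann, N_seg, x = 1000000, 20000, 50000, 1500
    rv = scipy.stats.hypergeom(M, n_ann, N_seg)
    pvalue_hyperg = rv.sf(x)                          # P(X > x)
    expected_hyperg = rv.mean()                       # = n_ann * N_seg / M

    return pvalue_binom, expected_binom, pvalue_hyperg, expected_hyperg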
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = optparse.OptionParser(
        version="%prog version: $Id: script_template.py 2871 2010-03-03 10:20:44Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-l", "--sample-file", dest="sample_files",
                      type="string", action="append",
                      help="filename with sample files. Start processing from "
                      "samples [default=%default].")

    parser.add_option("-o", "--order", dest="output_order", type="choice",
                      choices=("track", "annotation", "fold",
                               "pvalue", "qvalue"),
                      help="order results in output by fold, track, etc. "
                      "[default=%default].")

    parser.add_option("-p", "--pvalue-method", dest="pvalue_method",
                      type="choice",
                      choices=("empirical", "norm", ),
                      help="type of pvalue reported [default=%default].")

    parser.add_option("--results-file", dest="input_filename_results",
                      type="string",
                      help="start processing from results - no segments "
                      "required [default=%default].")

    parser.add_option("--output-plots-pattern", dest="output_plots_pattern",
                      type="string",
                      help="output pattern for plots [default=%default]")

    parser.add_option("--output-samples-pattern",
                      dest="output_samples_pattern", type="string",
                      help="output pattern for samples. Samples are stored in "
                      "bed format, one for each segment [default=%default]")

    parser.add_option("--plots", dest="plots", type="choice",
                      action="append",
                      choices=("all", "bars-per-track", "bars", ),
                      help="plots to be created [default=%default].")

    parser.set_defaults(
        sample_files=[],
        num_samples=1000,
        output_stats=[],
        output_filename_counts=None,
        output_order="fold",
        input_filename_results=None,
        pvalue_method="empirical",
        output_plots_pattern=None,
        plots=[],
    )

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv, add_output_options=True)

    annotator_results = IO.readAnnotatorResults(
        options.input_filename_results)

    if "bars-per-track" in options.plots:
        plotBarplots(annotator_results, options)
    if "bars" in options.plots:
        plotBarplot(annotator_results, options)

    # write footer and output benchmark information.
    E.Stop()
def fromSegments(options, args):
    '''run analysis from segment files.

    This is the most common use case.
    '''
    tstart = time.time()

    ##################################################
    # build segments
    segments, annotations, workspaces, isochores = IO.buildSegments(options)

    E.info("intervals loaded in %i seconds" % (time.time() - tstart))

    ##################################################
    # open various additional output files
    ##################################################
    outfiles = {}
    for section in ("sample",
                    "segment_metrics",
                    "sample_metrics",
                    ):
        if section in options.output_stats or \
           "all" in options.output_stats or \
           len([x for x in options.output_stats
                if re.search(x, section)]) > 0:
            outfiles[section] = E.openOutputFile(section)

    if 'sample_metrics' in outfiles:
        outfiles['sample_metrics'].write(
            "track\tsection\tmetric\t%s\n" %
            "\t".join(Stats.Summary().getHeaders()))

    # filter segments by workspace
    workspace = IO.applyIsochores(
        segments, annotations, workspaces,
        options, isochores,
        truncate_segments_to_workspace=options.truncate_segments_to_workspace,
        truncate_workspace_to_annotations=options.truncate_workspace_to_annotations,
        restrict_workspace=options.restrict_workspace)

    ##################################################
    # check memory requirements
    # previous algorithm: memory requirements if all samples are stored
    counts = segments.countsPerTrack()
    max_counts = max(counts.values())
    memory = 8 * 2 * options.num_samples * max_counts * len(workspace)

    ##################################################
    # initialize sampler
    if options.sampler == "annotator":
        sampler = GatEngine.SamplerAnnotator(
            bucket_size=options.bucket_size,
            nbuckets=options.nbuckets)
    elif options.sampler == "shift":
        sampler = GatEngine.SamplerShift(
            radius=options.shift_expansion,
            extension=options.shift_extension)
    elif options.sampler == "segments":
        sampler = GatEngine.SamplerSegments()
    elif options.sampler == "local-permutation":
        sampler = GatEngine.SamplerLocalPermutation()
    elif options.sampler == "global-permutation":
        sampler = GatEngine.SamplerGlobalPermutation()
    elif options.sampler == "brute-force":
        sampler = GatEngine.SamplerBruteForce()
    elif options.sampler == "uniform":
        sampler = GatEngine.SamplerUniform()
    else:
        raise ValueError("unknown sampler '%s'" % options.sampler)

    ##################################################
    # initialize counters
    counters = []
    for counter in options.counters:
        if counter == "nucleotide-overlap":
            counters.append(GatEngine.CounterNucleotideOverlap())
        elif counter == "nucleotide-density":
            counters.append(GatEngine.CounterNucleotideDensity())
        elif counter == "segment-overlap":
            counters.append(GatEngine.CounterSegmentOverlap())
        elif counter == "annotations-overlap":
            counters.append(GatEngine.CounterAnnotationsOverlap())
        elif counter == "segment-midoverlap":
            counters.append(GatEngine.CounterSegmentMidpointOverlap())
        elif counter == "annotations-midoverlap":
            counters.append(GatEngine.CounterAnnotationsMidpointOverlap())
        else:
            raise ValueError("unknown counter '%s'" % counter)
    ##################################################
    # initialize workspace generator
    if options.conditional == "unconditional":
        workspace_generator = GatEngine.UnconditionalWorkspace()
    elif options.conditional == "cooccurance":
        workspace_generator = GatEngine.ConditionalWorkspaceCooccurance()
    elif options.conditional == "annotation-centered":
        if options.conditional_extension is None and \
           options.conditional_expansion is None:
            raise ValueError(
                "please specify either --conditional-expansion or "
                "--conditional-extension")
        workspace_generator = GatEngine.ConditionalWorkspaceAnnotationCentered(
            options.conditional_extension,
            options.conditional_expansion)
    elif options.conditional == "segment-centered":
        if options.conditional_extension is None and \
           options.conditional_expansion is None:
            raise ValueError(
                "please specify either --conditional-expansion or "
                "--conditional-extension")
        workspace_generator = GatEngine.ConditionalWorkspaceSegmentCentered(
            options.conditional_extension,
            options.conditional_expansion)
    else:
        raise ValueError("unknown conditional workspace '%s'" %
                         options.conditional)

    ##################################################
    # check if reference is complete
    ##################################################
    if options.reference:
        for track in segments.tracks:
            if track not in options.reference:
                raise ValueError("missing track '%s' in reference" % track)
            r = options.reference[track]
            for annotation in annotations.tracks:
                if annotation not in r:
                    raise ValueError(
                        "missing annotation '%s' in annotations for "
                        "track='%s'" % (annotation, track))

    ##################################################
    # compute
    ##################################################
    annotator_results = gat.run(
        segments,
        annotations,
        workspace,
        sampler,
        counters,
        workspace_generator=workspace_generator,
        num_samples=options.num_samples,
        cache=options.cache,
        outfiles=outfiles,
        output_counts_pattern=options.output_counts_pattern,
        output_samples_pattern=options.output_samples_pattern,
        sample_files=options.sample_files,
        conditional=options.conditional,
        conditional_extension=options.conditional_extension,
        reference=options.reference,
        pseudo_count=options.pseudo_count,
        num_threads=options.num_threads)

    return annotator_results
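# Reminder on the option guards above (illustrative): Python chains
# comparisons, so an expression like "x == y is None" parses as
# "(x == y) and (y is None)", not as "both x and y are None"; spelling out
# the "is None" test on each option keeps the intent unambiguous.
def _example_chained_comparison():
    x, y = 1, 1
    chained = (x == y is None)            # (1 == 1) and (1 is None) -> False
    explicit = (x is None and y is None)  # -> False
    return chained, explicit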
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv:
        argv = sys.argv

    parser = gat.buildParser(usage=globals()["__doc__"])

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv, add_output_options=True)

    ##################################################
    description_header, descriptions, description_width = IO.readDescriptions(
        options)

    ##################################################
    size_pos, size_segment = SegmentList.getSegmentSize()
    E.debug("sizes: pos=%i segment=%i, max_coord=%i" %
            (size_pos, size_segment, 2 ** (8 * size_pos)))

    ##################################################
    # set default counter
    if not options.counters:
        options.counters.append("nucleotide-overlap")

    ##################################################
    if options.output_tables_pattern is not None:
        if "%s" not in options.output_tables_pattern:
            raise ValueError(
                "output_tables_pattern should contain at least one '%s'")

    if options.output_samples_pattern is not None:
        if "%s" not in options.output_samples_pattern:
            raise ValueError(
                "output_samples_pattern should contain at least one '%s'")

    if options.output_counts_pattern is not None:
        if "%s" not in options.output_counts_pattern:
            raise ValueError(
                "output_counts_pattern should contain at least one '%s'")

    if options.random_seed is not None:
        # initialize python random number generator
        random.seed(options.random_seed)
        # initialize numpy random number generator
        numpy.random.seed(options.random_seed)

    ##################################################
    # read fold changes that results should be compared with
    if options.null != "default":
        if not os.path.exists(options.null):
            raise OSError("file %s not found" % options.null)
        E.info("reading reference results from %s" % options.null)
        options.reference = IO.readAnnotatorResults(options.null)
    else:
        options.reference = None

    if options.input_filename_counts:
        # use pre-computed counts
        annotator_results = Engine.fromCounts(options.input_filename_counts)

    elif options.input_filename_results:
        # use previous results (re-computes fdr)
        E.info("reading gat results from %s" % options.input_filename_results)
        annotator_results = IO.readAnnotatorResults(
            options.input_filename_results)

    else:
        # do full gat analysis
        annotator_results = fromSegments(options, args)

    ##################################################
    if options.pvalue_method != "empirical":
        E.info("updating pvalues to %s" % options.pvalue_method)
        Engine.updatePValues(annotator_results, options.pvalue_method)

    ##################################################
    # output
    IO.outputResults(annotator_results,
                     options,
                     Engine.AnnotatorResultExtended.headers,
                     description_header,
                     description_width,
                     descriptions)

    IO.plotResults(annotator_results, options)

    # write footer and output benchmark information.
    E.Stop()
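# For intuition only (Engine.updateQValues supports several methods; this is
# a minimal Benjamini-Hochberg sketch, not the code used above): BH scales
# the i-th smallest p-value by n/i and enforces monotonicity from the top.
def _example_bh_qvalues(pvalues):
    import numpy
    p = numpy.asarray(pvalues, dtype=float)
    n = len(p)
    order = numpy.argsort(p)
    ranked = p[order] * n / numpy.arange(1, n + 1)
    # enforce monotonicity from the largest p-value downwards
    qvalues = numpy.minimum.accumulate(ranked[::-1])[::-1]
    result = numpy.empty(n)
    result[order] = numpy.clip(qvalues, 0, 1)
    return result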