def run(segments, annotations, workspace, sampler, counters,
        workspace_generator, **kwargs):
    '''run an enrichment analysis.

    segments: an IntervalCollection
    workspace: an IntervalCollection
    annotations: an IntervalCollection
    sampler: sampler object, handed unchanged to the outer sampler
    counters: list of counters. Each counter is passed to
        Engine.computeCounts and its ``name`` attribute labels the
        results.
    workspace_generator: callable ``(segments, annotations, workspace)``
        returning a ``(segments, annotations, workspace)`` triple. Its
        ``is_conditional`` attribute selects ConditionalSampler or
        UnconditionalSampler.

    kwargs recognized are:

    cache
       filename of cache

    num_samples
       number of samples to compute (default 10000)

    output_counts_pattern
       output counts to filename; ``%s`` is replaced by the counter name

    output_samples_pattern
       if given, output samples to these files, one per segment track;
       ``%s`` is replaced by the track name

    sample_files
       if given, read samples from these files. Requires
       output_samples_pattern in order to build the filename regex.

    outfiles
       dictionary of optional additional output files. If it contains
       "segment_metrics", summary metrics are written to that file.

    pseudo_count
       pseudo_count to add to observed and expected values (default 1.0)

    reference
       data with reference observed and expected values, indexed as
       ``reference[track][annotation]``.

    num_threads
       passed on to the outer sampler (default 0)

    Returns a list of Engine.AnnotatorResultExtended objects, one per
    (counter, track, annotation) combination with a non-empty workspace.

    Raises ValueError if sample_files is given without
    output_samples_pattern.
    '''

    # get arguments
    num_samples = kwargs.get("num_samples", 10000)
    cache = kwargs.get("cache", None)
    output_counts_pattern = kwargs.get("output_counts_pattern", None)
    sample_files = kwargs.get("sample_files", [])
    pseudo_count = kwargs.get("pseudo_count", 1.0)
    reference = kwargs.get("reference", None)
    output_samples_pattern = kwargs.get("output_samples_pattern", None)
    outfiles = kwargs.get("outfiles", {})
    num_threads = kwargs.get("num_threads", 0)

    ##################################################
    # computing summary metrics for segments
    if "segment_metrics" in outfiles:
        E.info("computing summary metrics for segments")
        outfile = outfiles["segment_metrics"]
        outfile.write("track\tsection\tmetric\t%s\n" %
                      "\t".join(Stats.Summary().getHeaders()))
        for track in segments.tracks:
            IO.outputMetrics(outfile,
                             segments[track],
                             workspace,
                             track,
                             'segments',
                             )
        E.info("wrote summary metrics for segments to %s" % str(outfile))

    ##################################################
    # collect observed counts from segments
    E.info("collecting observed counts")
    observed_counts = [
        Engine.computeCounts(
            counter=counter,
            aggregator=sum,
            segments=segments,
            annotations=annotations,
            workspace=workspace,
            workspace_generator=workspace_generator)
        for counter in counters]

    ##################################################
    # sample and collect counts
    E.info("starting sampling")

    if cache:
        E.info("samples are cached in %s" % cache)
        samples = Engine.SamplesCached(filename=cache)
    elif sample_files:
        if not output_samples_pattern:
            raise ValueError(
                "require output_samples_pattern if loading samples from files")
        # build regex. str.replace is used instead of re.sub because
        # re.sub parses the replacement as a template and rejects the
        # unknown escape \S on Python >= 3.7.
        regex = re.compile(output_samples_pattern.replace("%s", r"(\S+)"))
        E.info("loading samples from %i files" % len(sample_files))
        samples = Engine.SamplesFile(
            filenames=sample_files,
            regex=regex)
    else:
        samples = Engine.Samples()

    if workspace_generator.is_conditional:
        sampler_class = ConditionalSampler
    else:
        sampler_class = UnconditionalSampler

    sampled_counts = {}
    counts = E.Counter()
    ntracks = len(segments.tracks)

    for ntrack, track in enumerate(segments.tracks):

        segs = segments[track]

        E.info("sampling: %s: %i/%i" % (track, ntrack + 1, ntracks))

        samples_outfile = None
        if output_samples_pattern and not sample_files:
            # plain substitution: the track name must not be interpreted
            # as a regular expression replacement template
            filename = output_samples_pattern.replace("%s", track)
            E.debug("saving samples to %s" % filename)
            dirname = os.path.dirname(filename)
            if dirname and not os.path.exists(dirname):
                os.makedirs(dirname)
            if filename.endswith(".gz"):
                samples_outfile = gzip.open(filename, "w")
            else:
                samples_outfile = open(filename, "w")

        try:
            outer_sampler = sampler_class(
                num_samples,
                samples,
                samples_outfile,
                sampler,
                workspace_generator,
                counters,
                outfiles,
                num_threads=num_threads)

            counts_per_track = outer_sampler.sample(
                track, counts, counters, segs,
                annotations, workspace, outfiles)
        finally:
            # close the samples file even for skipped tracks or errors
            if samples_outfile:
                samples_outfile.close()

        # skip empty tracks
        if counts_per_track is None:
            continue

        sampled_counts[track] = counts_per_track

    E.info("sampling finished")

    # build annotator results
    E.info("computing PValue statistics")

    annotator_results = []
    for counter_id, (counter, observed_count) in enumerate(
            zip(counters, observed_counts)):
        for track, r in observed_count.items():
            for annotation, observed in r.items():
                temp_segs, temp_annos, temp_workspace = workspace_generator(
                    segments[track],
                    annotations[annotation],
                    workspace)

                # ignore empty results
                if temp_workspace.sum() == 0:
                    continue

                # if reference is given, p-value will indicate difference
                # The test that track and annotation are present is done
                # elsewhere
                if reference:
                    ref = reference[track][annotation]
                else:
                    ref = None

                annotator_results.append(Engine.AnnotatorResultExtended(
                    track=track,
                    annotation=annotation,
                    counter=counter.name,
                    observed=observed,
                    samples=sampled_counts[track][counter_id][annotation],
                    track_segments=temp_segs,
                    annotation_segments=temp_annos,
                    workspace=temp_workspace,
                    reference=ref,
                    pseudo_count=pseudo_count))

    # dump (large) table with counts
    if output_counts_pattern:
        for counter in counters:
            name = counter.name
            filename = output_counts_pattern.replace("%s", name)

            E.info("writing counts to %s" % filename)
            output = [x for x in annotator_results if x.counter == name]
            outfile = IOTools.openFile(filename, "w")
            try:
                outfile.write("track\tannotation\tobserved\tcounts\n")
                for o in output:
                    outfile.write("%s\t%s\t%i\t%s\n" %
                                  (o.track, o.annotation,
                                   o.observed,
                                   ",".join(["%i" % x for x in o.samples])))
            finally:
                outfile.close()

    return annotator_results
def sample(self, track, counts, counters, segs, annotations, workspace, outfiles):
    '''conditional sampling - sample using only those
    segments that contain both a segment and an annotation.

    track: name of the segment track; used to label the work items
    counts: not used by this method
    counters: list of counters handed to each work item
    segs: segments of the track
    annotations: collection of annotation tracks; one sampling round
        is performed per entry in ``annotations.tracks``
    workspace: the workspace before conditioning
    outfiles: not used by this method

    Returns a list with one dictionary per counter, mapping each
    annotation to the list of its sampled values, or None if the
    workspace is empty.
    '''

    E.info("performing conditional sampling")
    counts_per_track = [collections.defaultdict(list) for _counter in counters]

    # rebuild non-isochore annotations and workspace
    contig_annotations = annotations.clone()
    contig_annotations.fromIsochores()
    contig_annotations.setName("contig_" + annotations.getName())

    contig_workspace = workspace.clone()
    contig_workspace.fromIsochores()

    # NOTE(review): nothing in this method calls unshare() on the
    # objects shared below, and sharing is not conditional on the
    # number of threads - confirm whether that is intended.
    E.info("setting up shared data for multi-processing")
    annotations.share()
    contig_annotations.share()
    contig_workspace.share("contig_workspace")

    E.info("workspace without conditioning: %i segments, %i nucleotides" %
           (workspace.counts(),
            workspace.sum()))

    if workspace.sum() == 0:
        E.warn("empty workspace - no computation performed")
        return None

    # compute samples conditionally - need to proceed by annotation
    for annoid, annotation in enumerate(annotations.tracks):

        annos = annotations[annotation]

        temp_segs, _, temp_workspace = \
            self.workspace_generator(segs, annos, workspace)

        # set up sharing
        temp_segs.share("generated_segments")
        temp_workspace.share("generated_workspace")

        E.info("workspace for annotation %s: %i segments, %i nucleotides" %
               (annotation,
                temp_workspace.counts(),
                temp_workspace.sum()))

        # annoid is an integer and must be converted before joining
        work_name = '_'.join((track, str(annoid)))

        work = [WorkData(work_name,
                         x,
                         self.sampler,
                         temp_segs,
                         annotations,
                         contig_annotations,
                         temp_workspace,
                         contig_workspace,
                         counters,
                         ) for x in range(self.num_samples)]

        E.info("sampling for annotation '%s' started" % annotation)
        results = self.computeSamples(work)
        E.info("sampling for annotation '%s' completed" % annotation)

        # collate: only the annotation sampled in this round is kept
        for result in results:
            for counter_id in range(len(counters)):
                counts_per_track[counter_id][annotation].append(
                    result[counter_id][annotation])

    return counts_per_track
def _index_by_track(annotator_results):
    '''return a nested dictionary ``index[track][annotation] = result``.'''
    index = collections.defaultdict(dict)
    for x in annotator_results:
        index[x.track][x.annotation] = x
    return index


def _delta_fold_result(track, annotation, data1, data2, pseudo_count):
    '''build the result comparing the fold changes of data1 and data2.

    The returned GatEngine.AnnotatorResult is labelled with *track* and
    *annotation*; its observed value is ``data2.fold - data1.fold`` and
    its samples are the log ratios of the sampled fold changes, shifted
    by the same amount.
    '''
    # note that fold changes can be very large if there are 0 samples
    # this is fine for getting the distributional params (mean, stddev)
    # assumes data.samples supports element-wise arithmetic (e.g. a
    # numpy array) - TODO confirm
    fold_changes1 = data1.observed / (data1.samples + pseudo_count)
    fold_changes2 = data2.observed / (data2.samples + pseudo_count)

    # add a separate fc pseudo-count to avoid 0 values
    fold_changes1 += 0.0001
    fold_changes2 += 0.0001

    # Test is if relative fold change rfc is different from 1
    # note: rfc = fc1 / fc2 = (obs1 / exp1) / (obs2 / exp2)
    #                       = (obs1 / obs2) * (exp2 / exp1)
    # Thus, it is equivalent to test rfc = obs1/obs2 versus exp2 / exp1
    #
    # Convert to log space for easier plotting
    # Move the observed fold ratio in order to get an idea of the magnitude
    # of the underlying fold change
    delta_fold = data2.fold - data1.fold
    sampled_delta_fold = numpy.log(
        fold_changes1 / fold_changes2) + delta_fold
    observed_delta_fold = 0.0 + delta_fold

    return GatEngine.AnnotatorResult(track, annotation,
                                     "na",
                                     observed_delta_fold,
                                     sampled_delta_fold,
                                     reference=None,
                                     pseudo_count=0)


def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Each positional argument is a filename with counts. With a single
    file, all pairs of results within that file are compared; the file
    may contain only one track. With several files, results sharing the
    same track and annotation are compared for every pair of files.

    Raises NotImplementedError if a single file contains more than one
    track.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = optparse.OptionParser(
        version="%prog version: $Id: script_template.py 2871 2010-03-03 10:20:44Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-o", "--order", dest="output_order", type="choice",
                      choices=(
                          "track", "annotation", "fold", "pvalue", "qvalue", "observed"),
                      help="order results in output by fold, track, etc. [default=%default].")

    parser.add_option("-p", "--pvalue-method", dest="pvalue_method", type="choice",
                      choices=("empirical", "norm", ),
                      help="type of pvalue reported [default=%default].")

    parser.add_option("-q", "--qvalue-method", dest="qvalue_method", type="choice",
                      choices=(
                          "storey", "BH", "bonferroni", "holm", "hommel", "hochberg", "BY", "none"),
                      help="method to perform multiple testing correction by controlling the fdr [default=%default].")

    parser.add_option("--qvalue-lambda", dest="qvalue_lambda", type="float",
                      help="fdr computation: lambda [default=%default].")

    parser.add_option("--qvalue-pi0-method", dest="qvalue_pi0_method", type="choice",
                      choices=("smoother", "bootstrap"),
                      help="fdr computation: method for estimating pi0 [default=%default].")

    parser.add_option("--descriptions", dest="input_filename_descriptions", type="string",
                      help="filename mapping annotation terms to descriptions. "
                      " if given, the output table will contain additional columns "
                      " [default=%default]")

    parser.add_option("--pseudo-count", dest="pseudo_count", type="float",
                      help="pseudo count. The pseudo count is added to both the observed and expected overlap. "
                      " Using a pseudo-count avoids gat reporting fold changes of 0 [default=%default].")

    parser.add_option("--output-plots-pattern", dest="output_plots_pattern", type="string",
                      help="output pattern for plots [default=%default]")

    parser.set_defaults(
        pvalue_method="empirical",
        qvalue_method="BH",
        qvalue_lambda=None,
        qvalue_pi0_method="smoother",
        # pseudo count for fold change computation to avoid 0 fc
        pseudo_count=1.0,
        output_order="observed",
    )

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv, add_output_options=True)

    input_filenames_counts = args

    ##################################################
    E.info("received %i filenames with counts" % len(input_filenames_counts))

    ##################################################
    description_header, descriptions, description_width = IO.readDescriptions(
        options)

    all_annotator_results = []

    for input_filename_counts in input_filenames_counts:

        E.info("processing %s" % input_filename_counts)

        annotator_results = gat.fromCounts(input_filename_counts)

        ##################################################
        if options.pvalue_method != "empirical":
            E.info("updating pvalues to %s" % options.pvalue_method)
            GatEngine.updatePValues(annotator_results, options.pvalue_method)

        ##################################################
        # compute global fdr
        ##################################################
        E.info("computing FDR statistics")
        GatEngine.updateQValues(annotator_results,
                                method=options.qvalue_method,
                                vlambda=options.qvalue_lambda,
                                pi0_method=options.qvalue_pi0_method)

        all_annotator_results.append(annotator_results)

    pseudo_count = options.pseudo_count
    results = []

    if len(all_annotator_results) == 1:

        E.info("performing pairwise comparison within a single file")

        # collect all annotations
        single_file_results = list(all_annotator_results[0])
        tracks = set(x.track for x in single_file_results)

        # an empty file falls through to the "no results found" exit
        # below instead of being reported as "multiple segments"
        if len(tracks) > 1:
            raise NotImplementedError("multiple segments of interest")

        for data1, data2 in itertools.combinations(single_file_results, 2):
            # within a single file the two annotation names label the
            # comparison
            results.append(_delta_fold_result(
                data1.annotation, data2.annotation,
                data1, data2, pseudo_count))

    else:

        E.info("performing pairwise comparison between multiple files")

        # index the results of each file once, not once per pair
        indexed = [_index_by_track(r) for r in all_annotator_results]

        ##################################################
        # perform pairwise comparison
        for index1, index2 in itertools.combinations(
                range(len(input_filenames_counts)), 2):
            E.info("comparing %i and %i" % (index1, index2))
            aa, bb = indexed[index1], indexed[index2]

            shared_tracks = set(aa.keys()).intersection(bb.keys())
            if len(shared_tracks) == 0:
                E.warn("no shared tracks between {} and {}".format(
                    index1, index2))

            for track in sorted(shared_tracks):
                E.debug("computing results for track {}".format(track))
                # get shared annotations, in a reproducible order
                shared_annotations = sorted(
                    set(aa[track].keys()).intersection(bb[track].keys()))
                E.info("%i shared annotations" % len(shared_annotations))

                for annotation in shared_annotations:
                    results.append(_delta_fold_result(
                        track, annotation,
                        aa[track][annotation], bb[track][annotation],
                        pseudo_count))

    if len(results) == 0:
        E.critical("no results found")
        E.Stop()
        return

    IO.outputResults(results,
                     options,
                     GatEngine.AnnotatorResult.headers,
                     description_header,
                     description_width,
                     descriptions,
                     format_observed="%6.4f")

    IO.plotResults(results, options)

    # write footer and output benchmark information.
    E.Stop()
def sample(self, track, counts, counters, segs, annotations, workspace, outfiles):
    '''sample and return counts.

    track: name of the segment track; used to label the work items
    counts: not used by this method
    counters: list of counters handed to each work item
    segs: segments of the track
    annotations: collection of annotation tracks
    workspace: the workspace before conditioning
    outfiles: not used by this method

    Return a list of counted results for each counter: one dictionary
    per counter, mapping each annotation to the list of its sampled
    values. Returns None if the workspace is empty.
    '''

    E.info("performing unconditional sampling")
    counts_per_track = [collections.defaultdict(list) for _counter in counters]

    # rebuild non-isochore annotations and workspace
    contig_annotations = annotations.clone()
    contig_annotations.fromIsochores()
    contig_annotations.setName("contig_" + annotations.getName())

    contig_workspace = workspace.clone()
    contig_workspace.fromIsochores()

    E.info("workspace without conditioning: %i segments, %i nucleotides" %
           (workspace.counts(),
            workspace.sum()))

    temp_segs, _, temp_workspace = self.workspace_generator(
        segs, None, workspace)

    # report the generated workspace, not the input workspace again
    E.info("workspace after conditioning: %i segments, %i nucleotides" %
           (temp_workspace.counts(),
            temp_workspace.sum()))

    # NOTE(review): the emptiness test looks at the unconditioned
    # workspace - confirm whether temp_workspace should be tested too.
    if workspace.sum() == 0:
        E.warn("empty workspace - no computation performed")
        return None

    work = [WorkData(track,
                     x,
                     self.sampler,
                     temp_segs,
                     annotations,
                     contig_annotations,
                     temp_workspace,
                     contig_workspace,
                     counters,
                     ) for x in range(self.num_samples)]

    use_shared_data = self.num_threads > 0

    if use_shared_data:
        E.info("setting up shared data for multi-processing")
        annotations.share()
        contig_annotations.share()
        contig_workspace.share("contig_workspace")
        temp_segs.share("generated_segments")
        temp_workspace.share("generated_workspace")

    try:
        E.info("sampling started")
        results = self.computeSamples(work)
        E.info("sampling completed")
    finally:
        # restore private data even if sampling failed
        if use_shared_data:
            E.info("retrieving private data")
            annotations.unshare()
            contig_annotations.unshare()
            contig_workspace.unshare()
            temp_segs.unshare()
            temp_workspace.unshare()

    # collate results
    for result in results:
        for counter_id in range(len(counters)):
            for annotation in annotations.tracks:
                counts_per_track[counter_id][annotation].append(
                    result[counter_id][annotation])

    self.outputSampleStats(None, "", [])

    return counts_per_track
def _index_by_track(annotator_results):
    '''return a nested dictionary ``index[track][annotation] = result``.'''
    index = collections.defaultdict(dict)
    for x in annotator_results:
        index[x.track][x.annotation] = x
    return index


def _delta_fold_result(track, annotation, data1, data2, pseudo_count):
    '''build the result comparing the fold changes of data1 and data2.

    The returned GatEngine.AnnotatorResult is labelled with *track* and
    *annotation*; its observed value is ``data2.fold - data1.fold`` and
    its samples are the log ratios of the sampled fold changes, shifted
    by the same amount.
    '''
    # note that fold changes can be very large if there are 0 samples
    # this is fine for getting the distributional params (mean, stddev)
    # assumes data.samples supports element-wise arithmetic (e.g. a
    # numpy array) - TODO confirm
    fold_changes1 = data1.observed / (data1.samples + pseudo_count)
    fold_changes2 = data2.observed / (data2.samples + pseudo_count)

    # add a separate fc pseudo-count to avoid 0 values
    fold_changes1 += 0.0001
    fold_changes2 += 0.0001

    # Test is if relative fold change rfc is different from 1
    # note: rfc = fc1 / fc2 = (obs1 / exp1) / (obs2 / exp2)
    #                       = (obs1 / obs2) * (exp2 / exp1)
    # Thus, it is equivalent to test rfc = obs1/obs2 versus exp2 / exp1
    #
    # Convert to log space for easier plotting
    # Move the observed fold ratio in order to get an idea of the magnitude
    # of the underlying fold change
    delta_fold = data2.fold - data1.fold
    sampled_delta_fold = numpy.log(
        fold_changes1 / fold_changes2) + delta_fold
    observed_delta_fold = 0.0 + delta_fold

    return GatEngine.AnnotatorResult(track, annotation,
                                     "na",
                                     observed_delta_fold,
                                     sampled_delta_fold,
                                     reference=None,
                                     pseudo_count=0)


def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Each positional argument is a filename with counts. With a single
    file, all pairs of results within that file are compared; the file
    may contain only one track. With several files, results sharing the
    same track and annotation are compared for every pair of files.

    Raises NotImplementedError if a single file contains more than one
    track.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = optparse.OptionParser(
        version="%prog version: $Id: script_template.py 2871 2010-03-03 10:20:44Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-o", "--order", dest="output_order", type="choice",
                      choices=(
                          "track", "annotation", "fold", "pvalue", "qvalue", "observed"),
                      help="order results in output by fold, track, etc. [default=%default].")

    parser.add_option("-p", "--pvalue-method", dest="pvalue_method", type="choice",
                      choices=("empirical", "norm", ),
                      help="type of pvalue reported [default=%default].")

    parser.add_option("-q", "--qvalue-method", dest="qvalue_method", type="choice",
                      choices=(
                          "storey", "BH", "bonferroni", "holm", "hommel", "hochberg", "BY", "none"),
                      help="method to perform multiple testing correction by controlling the fdr [default=%default].")

    parser.add_option("--qvalue-lambda", dest="qvalue_lambda", type="float",
                      help="fdr computation: lambda [default=%default].")

    parser.add_option("--qvalue-pi0-method", dest="qvalue_pi0_method", type="choice",
                      choices=("smoother", "bootstrap"),
                      help="fdr computation: method for estimating pi0 [default=%default].")

    parser.add_option("--descriptions", dest="input_filename_descriptions", type="string",
                      help="filename mapping annotation terms to descriptions. "
                      " if given, the output table will contain additional columns "
                      " [default=%default]")

    parser.add_option("--pseudo-count", dest="pseudo_count", type="float",
                      help="pseudo count. The pseudo count is added to both the observed and expected overlap. "
                      " Using a pseudo-count avoids gat reporting fold changes of 0 [default=%default].")

    parser.add_option("--output-plots-pattern", dest="output_plots_pattern", type="string",
                      help="output pattern for plots [default=%default]")

    parser.set_defaults(
        pvalue_method="empirical",
        qvalue_method="BH",
        qvalue_lambda=None,
        qvalue_pi0_method="smoother",
        # pseudo count for fold change computation to avoid 0 fc
        pseudo_count=1.0,
        output_order="observed",
    )

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv, add_output_options=True)

    input_filenames_counts = args

    ##################################################
    E.info("received %i filenames with counts" % len(input_filenames_counts))

    ##################################################
    description_header, descriptions, description_width = IO.readDescriptions(
        options)

    all_annotator_results = []

    for input_filename_counts in input_filenames_counts:

        E.info("processing %s" % input_filename_counts)

        annotator_results = gat.fromCounts(input_filename_counts)

        ##################################################
        if options.pvalue_method != "empirical":
            E.info("updating pvalues to %s" % options.pvalue_method)
            GatEngine.updatePValues(annotator_results, options.pvalue_method)

        ##################################################
        # compute global fdr
        ##################################################
        E.info("computing FDR statistics")
        GatEngine.updateQValues(annotator_results,
                                method=options.qvalue_method,
                                vlambda=options.qvalue_lambda,
                                pi0_method=options.qvalue_pi0_method)

        all_annotator_results.append(annotator_results)

    pseudo_count = options.pseudo_count
    results = []

    if len(all_annotator_results) == 1:

        E.info("performing pairwise comparison within a single file")

        # collect all annotations
        single_file_results = list(all_annotator_results[0])
        tracks = set(x.track for x in single_file_results)

        # an empty file falls through to the "no results found" exit
        # below instead of being reported as "multiple segments"
        if len(tracks) > 1:
            raise NotImplementedError("multiple segments of interest")

        for data1, data2 in itertools.combinations(single_file_results, 2):
            # within a single file the two annotation names label the
            # comparison
            results.append(_delta_fold_result(
                data1.annotation, data2.annotation,
                data1, data2, pseudo_count))

    else:

        E.info("performing pairwise comparison between multiple files")

        # index the results of each file once, not once per pair
        indexed = [_index_by_track(r) for r in all_annotator_results]

        ##################################################
        # perform pairwise comparison
        for index1, index2 in itertools.combinations(
                range(len(input_filenames_counts)), 2):
            E.info("comparing %i and %i" % (index1, index2))
            aa, bb = indexed[index1], indexed[index2]

            shared_tracks = set(aa.keys()).intersection(bb.keys())
            if len(shared_tracks) == 0:
                E.warn("no shared tracks between {} and {}".format(
                    index1, index2))

            for track in sorted(shared_tracks):
                E.debug("computing results for track {}".format(track))
                # get shared annotations, in a reproducible order
                shared_annotations = sorted(
                    set(aa[track].keys()).intersection(bb[track].keys()))
                E.info("%i shared annotations" % len(shared_annotations))

                for annotation in shared_annotations:
                    results.append(_delta_fold_result(
                        track, annotation,
                        aa[track][annotation], bb[track][annotation],
                        pseudo_count))

    if len(results) == 0:
        E.critical("no results found")
        E.Stop()
        return

    IO.outputResults(results,
                     options,
                     GatEngine.AnnotatorResult.headers,
                     description_header,
                     description_width,
                     descriptions,
                     format_observed="%6.4f")

    IO.plotResults(results, options)

    # write footer and output benchmark information.
    E.Stop()