def run(segments,
        annotations,
        workspace,
        sampler,
        counters,
        workspace_generator,
        **kwargs):
    '''run an enrichment analysis.

    segments: an IntervalCollection
    workspace: an IntervalCollection
    annotations: an IntervalCollection

    kwargs recognized are:

    cache
       filename of cache

    num_samples
       number of samples to compute

    output_counts_pattern
       output counts to filename

    output_samples_pattern
       if given, output samples to these files, one per segment

    sample_files
       if given, read samples from these files.

    fdr
       method to compute qvalues

    outfiles
       dictionary of optional additional output files.

    pseudo_count
       pseudo_count to add to observed and expected values

    reference
       data with reference observed and expected values.

    num_threads
       number of threads to use for sampling (0 for single-threaded
       operation)
    '''

    # get arguments
    num_samples = kwargs.get("num_samples", 10000)
    cache = kwargs.get("cache", None)
    output_counts_pattern = kwargs.get("output_counts_pattern", None)
    sample_files = kwargs.get("sample_files", [])
    pseudo_count = kwargs.get("pseudo_count", 1.0)
    reference = kwargs.get("reference", None)
    output_samples_pattern = kwargs.get("output_samples_pattern", None)
    outfiles = kwargs.get("outfiles", {})
    num_threads = kwargs.get("num_threads", 0)

    ##################################################
    ##################################################
    ##################################################
    # compute summary metrics for segments
    if "segment_metrics" in outfiles:
        E.info("computing summary metrics for segments")
        outfile = outfiles["segment_metrics"]
        outfile.write("track\tsection\tmetric\t%s\n" %
                      "\t".join(Stats.Summary().getHeaders()))
        for track in segments.tracks:
            IO.outputMetrics(outfile,
                             segments[track],
                             workspace,
                             track,
                             'segments',
                             )
        E.info("wrote summary metrics for segments to %s" % str(outfile))

    ##################################################
    ##################################################
    ##################################################
    # collect observed counts from segments
    E.info("collecting observed counts")
    observed_counts = []
    for counter in counters:
        observed_counts.append(Engine.computeCounts(
            counter=counter,
            aggregator=sum,
            segments=segments,
            annotations=annotations,
            workspace=workspace,
            workspace_generator=workspace_generator))

    ##################################################
    ##################################################
    ##################################################
    # sample and collect counts
    ##################################################
    E.info("starting sampling")

    if cache:
        E.info("samples are cached in %s" % cache)
        samples = Engine.SamplesCached(filename=cache)
    elif sample_files:
        if not output_samples_pattern:
            raise ValueError(
                "require output_samples_pattern if loading samples from files")
        # build regex to map sample filenames back to track names
        regex = re.compile(re.sub("%s", r"(\S+)", output_samples_pattern))
        E.info("loading samples from %i files" % len(sample_files))
        samples = Engine.SamplesFile(
            filenames=sample_files,
            regex=regex)
    else:
        samples = Engine.Samples()

    sampled_counts = {}

    counts = E.Counter()

    ntracks = len(segments.tracks)

    for ntrack, track in enumerate(segments.tracks):

        segs = segments[track]

        E.info("sampling: %s: %i/%i" % (track, ntrack + 1, ntracks))

        if output_samples_pattern and not sample_files:
            filename = re.sub("%s", track, output_samples_pattern)
            E.debug("saving samples to %s" % filename)
            dirname = os.path.dirname(filename)
            if dirname and not os.path.exists(dirname):
                os.makedirs(dirname)
            if filename.endswith(".gz"):
                samples_outfile = gzip.open(filename, "wt")
            else:
                samples_outfile = open(filename, "w")
        else:
            samples_outfile = None

        if workspace_generator.is_conditional:
            outer_sampler = ConditionalSampler(num_samples,
                                               samples,
                                               samples_outfile,
                                               sampler,
                                               workspace_generator,
                                               counters,
                                               outfiles,
                                               num_threads=num_threads)
        else:
            outer_sampler = UnconditionalSampler(num_samples,
                                                 samples,
                                                 samples_outfile,
                                                 sampler,
                                                 workspace_generator,
                                                 counters,
                                                 outfiles,
                                                 num_threads=num_threads)

        counts_per_track = outer_sampler.sample(
            track, counts, counters, segs, annotations, workspace, outfiles)

        # skip empty tracks
        if counts_per_track is None:
            continue

        if samples_outfile:
            samples_outfile.close()

        sampled_counts[track] = counts_per_track

        # old code, refactor into loop to save samples
        if 0:
            E.info("sampling stats: %s" % str(counts))
            if track not in samples:
                E.warn("no samples for track %s" % track)
                continue

            # clean up samples
            del samples[track]

    E.info("sampling finished")

    # build annotator results
    E.info("computing PValue statistics")

    annotator_results = list()
    counter_id = 0
    for counter, observed_count in zip(counters, observed_counts):
        for track, r in observed_count.items():
            for annotation, observed in r.items():
                temp_segs, temp_annos, temp_workspace = workspace_generator(
                    segments[track],
                    annotations[annotation],
                    workspace)

                # ignore empty results
                if temp_workspace.sum() == 0:
                    continue

                # if reference is given, p-value will indicate difference
                # The test that track and annotation are present is done
                # elsewhere
                if reference:
                    ref = reference[track][annotation]
                else:
                    ref = None

                annotator_results.append(Engine.AnnotatorResultExtended(
                    track=track,
                    annotation=annotation,
                    counter=counter.name,
                    observed=observed,
                    samples=sampled_counts[track][counter_id][annotation],
                    track_segments=temp_segs,
                    annotation_segments=temp_annos,
                    workspace=temp_workspace,
                    reference=ref,
                    pseudo_count=pseudo_count))
        counter_id += 1

    # dump (large) table with counts
    if output_counts_pattern:
        for counter in counters:
            name = counter.name
            filename = re.sub("%s", name, output_counts_pattern)

            E.info("writing counts to %s" % filename)
            output = [x for x in annotator_results if x.counter == name]
            outfile = IOTools.openFile(filename, "w")
            outfile.write("track\tannotation\tobserved\tcounts\n")

            for o in output:
                outfile.write("%s\t%s\t%i\t%s\n" %
                              (o.track, o.annotation, o.observed,
                               ",".join(["%i" % x for x in o.samples])))
            outfile.close()

    return annotator_results
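

##################################################
##################################################
##################################################
# Illustrative sketch (not part of the analysis code): the minimal
# interface that run() expects from its workspace_generator argument.
# run() reads the is_conditional attribute to choose between
# ConditionalSampler and UnconditionalSampler, and calls the generator
# as workspace_generator(segments, annotations, workspace), expecting a
# (segments, annotations, workspace) triple back.  The class below is
# hypothetical; this code base ships its own conditional and
# unconditional workspace generators.
class ExampleWorkspaceGenerator(object):
    '''hypothetical pass-through workspace generator.

    Models the unconditional case: the workspace is used as given, so
    segments and annotations are returned unchanged.
    '''

    is_conditional = False

    def __call__(self, segments, annotations, workspace):
        # a conditional variant would restrict the workspace to the
        # neighbourhood of the segments or annotations here
        return segments, annotations, workspace

# With the interval collections loaded elsewhere (for example via this
# package's IO helpers), an analysis might then be launched as:
#
#   results = run(segments, annotations, workspace,
#                 sampler=sampler,
#                 counters=counters,
#                 workspace_generator=ExampleWorkspaceGenerator(),
#                 num_samples=1000,
#                 output_samples_pattern="samples/%s.bed.gz")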
def computeSample(args):
    '''compute a single sample.
    '''

    workdata, samples_outfile, metrics_outfile, lock = args

    (track,
     sample_id,
     sampler,
     segs,
     annotations,
     contig_annotations,
     workspace,
     contig_workspace,
     counters) = workdata

    # E.debug("track=%s, sample=%s - started" % (track, str(sample_id)))

    counts = E.Counter()

    sample_id = str(sample_id)

    outf_samples = samples_outfile

    if samples_outfile:
        if lock:
            # with a lock, samples_outfile is a filename that is
            # re-opened in append mode for each write
            lock.acquire()
            outf_samples = IOTools.openFile(samples_outfile, "a")

        outf_samples.write("track name=%s\n" % sample_id)

        if lock:
            outf_samples.close()
            lock.release()

    sample = Engine.IntervalDictionary()

    for isochore in list(segs.keys()):

        counts.pairs += 1

        # skip empty isochores
        if workspace[isochore].isEmpty or segs[isochore].isEmpty:
            counts.skipped += 1
            continue

        counts.sampled += 1
        r = sampler.sample(segs[isochore], workspace[isochore])

        # TODO : activate
        # self.outputSampleStats( sample_id, isochore, r )

        sample.add(isochore, r)

        # save sample
        if samples_outfile:
            if lock:
                lock.acquire()
                outf_samples = IOTools.openFile(samples_outfile, "a")

            for start, end in r:
                outf_samples.write("%s\t%i\t%i\n" % (isochore, start, end))

            if lock:
                outf_samples.close()
                lock.release()

    # re-combine isochores
    # adjacent intervals are merged
    sample.fromIsochores()

    if metrics_outfile:
        if lock:
            lock.acquire()
            outf = IOTools.openFile(metrics_outfile, "a")
        else:
            outf = metrics_outfile

        IO.outputMetrics(outf, sample, workspace, track, sample_id)

        if lock:
            outf.close()
            lock.release()

    counts_per_track = [collections.defaultdict(float) for x in counters]
    # compute counts for each counter
    for counter_id, counter in enumerate(counters):
        # TODO: choose aggregator
        for annotation in annotations.tracks:
            counts_per_track[counter_id][annotation] = sum([
                counter(sample[contig],
                        contig_annotations[annotation][contig],
                        contig_workspace[contig])
                for contig in list(sample.keys())])

    # E.debug("track=%s, sample=%s - completed" % (track, str(sample_id)))

    return counts_per_track
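

##################################################
##################################################
##################################################
# Illustrative sketch (hypothetical driver, not this module's actual
# caller): how computeSample work items can be assembled and dispatched.
# In this code base that role is played by the UnconditionalSampler and
# ConditionalSampler classes; the pool wiring below is an assumption.
# Note the locking contract shown above: when a lock is supplied,
# samples_outfile and metrics_outfile must be file names that each
# worker re-opens in append mode; without a lock they may be open file
# objects.
def exampleDispatchSamples(track, sampler, segs, annotations,
                           contig_annotations, workspace,
                           contig_workspace, counters,
                           samples_outfile, metrics_outfile,
                           num_samples, num_threads=0):
    '''hypothetical helper: run computeSample num_samples times,
    optionally on a multiprocessing pool.
    '''
    import multiprocessing

    # a Manager lock can be shipped to pool workers through map()
    # arguments, unlike a plain multiprocessing.Lock
    lock = multiprocessing.Manager().Lock() if num_threads > 0 else None

    work = [((track, sample_id, sampler, segs, annotations,
              contig_annotations, workspace, contig_workspace, counters),
             samples_outfile, metrics_outfile, lock)
            for sample_id in range(num_samples)]

    if num_threads > 0:
        pool = multiprocessing.Pool(num_threads)
        try:
            # every argument, including the sampler and segment lists,
            # must be picklable to cross the process boundary
            results = pool.map(computeSample, work)
        finally:
            pool.close()
            pool.join()
    else:
        results = [computeSample(args) for args in work]

    # one list of per-counter, per-annotation counts per sample
    return results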