Example #1
0
def run(segments,
        annotations,
        workspace,
        sampler,
        counters,
        workspace_generator,
        **kwargs):
    '''run an enrichment analysis.

    segments: an IntervalCollection
    workspace: an IntervalCollection
    annotations: an IntervalCollection

    kwargs recognized are:

    cache
       filename of cache

    num_samples
       number of samples to compute

    output_counts_pattern
       output counts to filename

    output_samples_pattern
       if given, output samles to these files, one per segment

    sample_files
       if given, read samples from these files.

    fdr
       method to compute qvalues

    outfiles
       dictionary of optional additional output files.

    pseudo_count
       pseudo_count to add to observed and expected values

    reference
       data with reference observed and expected values.
    '''

    # get arguments
    num_samples = kwargs.get("num_samples", 10000)
    cache = kwargs.get("cache", None)
    output_counts_pattern = kwargs.get("output_counts_pattern", None)
    sample_files = kwargs.get("sample_files", [])
    pseudo_count = kwargs.get("pseudo_count", 1.0)
    reference = kwargs.get("reference", None)
    output_samples_pattern = kwargs.get("output_samples_pattern", None)
    outfiles = kwargs.get("outfiles", {})
    num_threads = kwargs.get("num_threads", 0)

    ##################################################
    ##################################################
    ##################################################
    # computing summary metrics for segments
    if "segment_metrics" in outfiles:
        E.info("computing summary metrics for segments")
        outfile = outfiles["segment_metrics"]
        outfile.write("track\tsection\tmetric\t%s\n" %
                      "\t".join(Stats.Summary().getHeaders()))
        for track in segments.tracks:
            IO.outputMetrics(outfile,
                             segments[track],
                             workspace,
                             track,
                             'segments',
                             )
        E.info("wrote summary metrics for segments to %s" % str(outfile))

    ##################################################
    ##################################################
    ##################################################
    # collect observed counts from segments
    E.info("collecting observed counts")
    observed_counts = []
    for counter in counters:
        observed_counts.append(Engine.computeCounts(
            counter=counter,
            aggregator=sum,
            segments=segments,
            annotations=annotations,
            workspace=workspace,
            workspace_generator=workspace_generator))

    ##################################################
    ##################################################
    ##################################################
    # sample and collect counts
    ##################################################
    E.info("starting sampling")

    if cache:
        E.info("samples are cached in %s" % cache)
        samples = Engine.SamplesCached(filename=cache)
    elif sample_files:
        if not output_samples_pattern:
            raise ValueError(
                "require output_samples_pattern if loading samples from files")
        # build regex
        regex = re.compile(re.sub("%s", "(\S+)", output_samples_pattern))
        E.info("loading samples from %i files" % len(sample_files))
        samples = Engine.SamplesFile(
            filenames=sample_files,
            regex=regex)
    else:
        samples = Engine.Samples()

    sampled_counts = {}

    counts = E.Counter()

    ntracks = len(segments.tracks)

    for ntrack, track in enumerate(segments.tracks):

        segs = segments[track]

        E.info("sampling: %s: %i/%i" % (track, ntrack + 1, ntracks))

        if output_samples_pattern and not sample_files:
            filename = re.sub("%s", track, output_samples_pattern)
            E.debug("saving samples to %s" % filename)
            dirname = os.path.dirname(filename)
            if dirname and not os.path.exists(dirname):
                os.makedirs(dirname)
            if filename.endswith(".gz"):
                samples_outfile = gzip.open(filename, "w")
            else:
                samples_outfile = open(filename, "w")
        else:
            samples_outfile = None

        if workspace_generator.is_conditional:
            outer_sampler = ConditionalSampler(num_samples,
                                               samples,
                                               samples_outfile,
                                               sampler,
                                               workspace_generator,
                                               counters,
                                               outfiles,
                                               num_threads=num_threads)
        else:
            outer_sampler = UnconditionalSampler(num_samples,
                                                 samples,
                                                 samples_outfile,
                                                 sampler,
                                                 workspace_generator,
                                                 counters,
                                                 outfiles,
                                                 num_threads=num_threads)

        counts_per_track = outer_sampler.sample(
            track, counts, counters, segs, annotations, workspace, outfiles)

        # skip empty tracks
        if counts_per_track is None:
            continue

        if samples_outfile:
            samples_outfile.close()

        sampled_counts[track] = counts_per_track

        # old code, refactor into loop to save samples
        if 0:
            E.info("sampling stats: %s" % str(counts))
            if track not in samples:
                E.warn("no samples for track %s" % track)
                continue

            # clean up samples
            del samples[track]

    E.info("sampling finished")

    # build annotator results
    E.info("computing PValue statistics")

    annotator_results = list()
    counter_id = 0
    for counter, observed_count in zip(counters, observed_counts):
        for track, r in observed_count.items():
            for annotation, observed in r.items():
                temp_segs, temp_annos, temp_workspace = workspace_generator(
                    segments[track],
                    annotations[annotation],
                    workspace)

                # ignore empty results
                if temp_workspace.sum() == 0:
                    continue

                # if reference is given, p-value will indicate difference
                # The test that track and annotation are present is done
                # elsewhere
                if reference:
                    ref = reference[track][annotation]
                else:
                    ref = None

                annotator_results.append(Engine.AnnotatorResultExtended(
                    track=track,
                    annotation=annotation,
                    counter=counter.name,
                    observed=observed,
                    samples=sampled_counts[track][counter_id][annotation],
                    track_segments=temp_segs,
                    annotation_segments=temp_annos,
                    workspace=temp_workspace,
                    reference=ref,
                    pseudo_count=pseudo_count))
        counter_id += 1

    # dump (large) table with counts
    if output_counts_pattern:
        for counter in counters:
            name = counter.name
            filename = re.sub("%s", name, output_counts_pattern)

            E.info("writing counts to %s" % filename)
            output = [x for x in annotator_results if x.counter == name]
            outfile = IOTools.openFile(filename, "w")
            outfile.write("track\tannotation\tobserved\tcounts\n")

            for o in output:
                outfile.write("%s\t%s\t%i\t%s\n" %
                              (o.track, o.annotation,
                               o.observed,
                               ",".join(["%i" % x for x in o.samples])))

    return annotator_results
Example #2
0
def computeSample(args):
    '''compute a single sample.
    '''

    workdata, samples_outfile, metrics_outfile, lock = args

    (track,
     sample_id,
     sampler,
     segs,
     annotations,
     contig_annotations,
     workspace,
     contig_workspace,
     counters) = workdata

    # E.debug("track=%s, sample=%s - started" % (track, str(sample_id)))

    counts = E.Counter()

    sample_id = str(sample_id)

    outf_samples = samples_outfile

    if samples_outfile:
        if lock:
            lock.acquire()
            outf_samples = IOTools.openFile(samples_outfile, "a")

        samples_outfile.write("track name=%s\n" % sample_id)

        if lock:
            outf_samples.close()
            lock.release()

    sample = Engine.IntervalDictionary()

    for isochore in list(segs.keys()):

        counts.pairs += 1

        # skip empty isochores
        if workspace[isochore].isEmpty or segs[isochore].isEmpty:
            counts.skipped += 1
            continue

        counts.sampled += 1
        r = sampler.sample(segs[isochore], workspace[isochore])

        # TODO : activate
        # self.outputSampleStats( sample_id, isochore, r )

        sample.add(isochore, r)

        # save sample
        if samples_outfile:
            if lock:
                lock.acquire()
                outf_samples = IOTools.openFile(samples_outfile, "a")

            for start, end in r:
                outf_samples.write("%s\t%i\t%i\n" % (isochore, start, end))

            if lock:
                outf_samples.close()
                lock.release()

    # re-combine isochores
    # adjacent intervals are merged.
    sample.fromIsochores()

    if metrics_outfile:
        if lock:
            lock.acquire()
            outf = IOTools.openFile(metrics_outfile, "a")
        else:
            outf = metrics_outfile

        IO.outputMetrics(outf, sample, workspace, track, sample_id)

        if lock:
            outf.close()
            lock.release()

    counts_per_track = [collections.defaultdict(float) for x in counters]
    # compute counts for each counter
    for counter_id, counter in enumerate(counters):
        # TODO: choose aggregator
        for annotation in annotations.tracks:
            counts_per_track[counter_id][annotation] = sum([
                counter(sample[contig],
                        contig_annotations[annotation][contig],
                        contig_workspace[contig])
                for contig in list(sample.keys())])

    # E.debug("track=%s, sample=%s - completed" % (track,str(sample_id )))

    return counts_per_track