Example #1
0
 def __str__(self):
     """return string representation."""
     return "%i\t%i\t%s\t%i\t%i\t%s\t%s\t%6.4e\t%6.4e\t%6.4e" % \
         (self.mSampleCountsCategory,
          self.mSampleCountsTotal,
          IOTools.prettyPercent(
              self.mSampleCountsCategory, self.mSampleCountsTotal),
          self.mBackgroundCountsCategory,
          self.mBackgroundCountsTotal,
          IOTools.prettyPercent(
              self.mBackgroundCountsCategory, self.mBackgroundCountsTotal),
          IOTools.val2str(self.mRatio),
          self.mPValue,
          self.mProbabilityOverRepresentation,
          self.mProbabilityUnderRepresentation)
Example #2
0
def main(argv=sys.argv):

    parser = E.OptionParser(
        version="%prog version: $Id: annotator_distance.py 2861 2010-02-23 17:36:32Z andreas $", usage=globals()["__doc__"])

    parser.add_option("-a", "--annotations-tsv-file", dest="filename_annotations", type="string",
                      help="filename mapping gene ids to annotations (a tab-separated table with two-columns) [default=%default].")

    parser.add_option("-r", "--resolution", dest="resolution", type="int",
                      help="resolution of count vector [default=%default].")

    parser.add_option("-b", "--num-bins", dest="num_bins", type="int",
                      help="number of bins in count vector [default=%default].")

    parser.add_option("-i", "--num-samples", dest="num_samples", type="int",
                      help="sample size to compute [default=%default].")

    parser.add_option("-w", "--workspace-bed-file", dest="filename_workspace", type="string",
                      help="filename with workspace information [default=%default].")

    parser.add_option("--workspace-builder", dest="workspace_builder", type="choice",
                      choices=(
                          "gff", "gtf-intergenic", "gtf-intronic", "gtf-genic"),
                      help="given a gff/gtf file build a workspace [default=%default].")

    parser.add_option("--workspace-labels", dest="workspace_labels", type="choice",
                      choices=("none", "direction", "annotation"),
                      help="labels to use for the workspace workspace [default=%default].")

    parser.add_option("--sampler", dest="sampler", type="choice",
                      choices=("permutation", "gaps"),
                      help="sampler to use. The sampler determines the null model of how segments are distributed in the workspace  [default=%default]")

    parser.add_option("--counter", dest="counters", type="choice", action="append",
                      choices=(
                          "transcription", "closest-distance", "all-distances"),
                      help="counter to use. The counter computes the quantity of interest [default=%default]")

    parser.add_option("--analysis", dest="analysis", type="choice", action="append",
                      choices=("proximity", "area-under-curve"),
                      help="analysis to perform [default=%default]")

    parser.add_option("--transform-counts", dest="transform_counts", type="choice",
                      choices=("raw", "cumulative"),
                      help="cumulate counts [default=%default].")

    parser.add_option("-s", "--segments", dest="filename_segments", type="string",
                      help="filename with segment information [default=%default].")

    parser.add_option("--xrange", dest="xrange", type="string",
                      help="xrange to plot [default=%default]")

    parser.add_option("-o", "--logscale", dest="logscale", type="string",
                      help="use logscale on x, y or xy [default=%default]")

    parser.add_option("-p", "--plot", dest="plot", action="store_true",
                      help="output plots [default=%default]")

    parser.add_option("--hardcopy", dest="hardcopy", type="string",
                      help="output hardcopies to file [default=%default]")

    parser.add_option("--no-fdr", dest="do_fdr", action="store_false",
                      help="do not compute FDR rates [default=%default]")

    parser.add_option("--segments-format", dest="segments_format", type="choice",
                      choices=("gtf", "bed"),
                      help="format of segments file [default=%default].")

    parser.add_option("--truncate", dest="truncate", action="store_true",
                      help="truncate segments extending beyond a workspace [default=%default]")

    parser.add_option("--remove-overhangs", dest="remove_overhangs", action="store_true",
                      help="remove segments extending beyond a workspace[default=%default]")

    parser.add_option("--keep-ambiguous", dest="keep_ambiguous", action="store_true",
                      help="keep segments extending to more than one workspace [default=%default]")

    parser.set_defaults(
        filename_annotations=None,
        filename_workspace="workspace.gff",
        filename_segments="FastDown.gtf",
        filename_annotations_gtf="../data/tg1_territories.gff",
        workspace_builder="gff",
        workspace_labels="none",
        sampler="permutation",
        truncate=False,
        num_bins=10000,
        num_samples=10,
        resolution=100,
        plot_samples=False,
        plot_envelope=True,
        counters=[],
        transform_counts="raw",
        xrange=None,
        plot=False,
        logscale=None,
        output_all=False,
        do_test=False,
        analysis=[],
        do_fdr=True,
        hardcopy="%s.png",
        segments_format="gtf",
        remove_overhangs=False,
    )

    (options, args) = E.Start(parser, argv=argv, add_output_options=True)

    ###########################################
    # setup options
    if options.sampler == "permutation":
        sampler = SamplerPermutation
    elif options.sampler == "gaps":
        sampler = SamplerGaps

    if options.xrange:
        options.xrange = map(float, options.xrange.split(","))

    if len(options.counters) == 0:
        raise ValueError("please specify at least one counter.")

    if len(options.analysis) == 0:
        raise ValueError("please specify at least one analysis.")

    if options.workspace_labels == "annotation" and not options.filename_annotations:
        raise ValueError(
            "please specify --annotations-tsv-file is --workspace-labels=annotations.")

    ###########################################
    # read data
    if options.workspace_labels == "annotation":
        def constant_factory(value):
            return itertools.repeat(value).next

        def dicttype():
            return collections.defaultdict(constant_factory(("unknown",)))

        map_id2annotations = IOTools.readMultiMap(open(options.filename_annotations, "r"),
                                                  dtype=dicttype)
    else:
        map_id2annotations = {}

    workspace = readWorkspace(open(options.filename_workspace, "r"),
                              options.workspace_builder,
                              options.workspace_labels,
                              map_id2annotations)

    E.info("read workspace for %i contigs" % (len(workspace)))

    indexed_workspace = indexIntervals(workspace, with_values=True)
    segments = readSegments(open(options.filename_segments, "r"), indexed_workspace,
                            format=options.segments_format,
                            keep_ambiguous=options.keep_ambiguous,
                            truncate=options.truncate,
                            remove_overhangs=options.remove_overhangs)

    nsegments = 0
    for contig, vv in segments.iteritems():
        nsegments += len(vv)

    E.info("read %i segments for %i contigs" % (nsegments, len(workspace)))
    indexed_segments = indexIntervals(segments, with_values=False)

    if nsegments == 0:
        E.warn("no segments read - no computation done.")
        E.Stop()
        return

    # build labels
    labels = collections.defaultdict(int)
    for contig, vv in workspace.iteritems():
        for start, end, v in vv:
            for l in v[0]:
                labels[l] += 1
            for l in v[1]:
                labels[l] += 1

    E.info("found %i workspace labels" % len(labels))

    ###########################################
    # setup counting containers
    counters = []
    for cc in options.counters:

        if cc == "transcription":
            counter = CounterTranscription
        elif cc == "closest-distance":
            counter = CounterClosestDistance
        elif cc == "all-distances":
            counter = CounterAllDistances

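        # choose the smallest unsigned dtype that can hold per-bin segment counts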
        if nsegments < 256:
            dtype = numpy.uint8
        elif nsegments < 65536:
            dtype = numpy.uint16
        elif nsegments < 4294967296:
            dtype = numpy.uint32
        else:
            dtype = numpy.uint64  # numpy.int is builtin int; int() has no .itemsize

        E.debug("choosen dtype %s" % str(dtype))

        E.info("samples space is %i bases: %i bins at %i resolution" %
               (options.num_bins * options.resolution,
                options.num_bins,
                options.resolution,
                ))

        E.info("allocating counts: %i bytes (%i labels, %i samples, %i bins)" %
               (options.num_bins * len(labels) * dtype().itemsize * (options.num_samples + 1),
                len(labels),
                options.num_samples,
                options.num_bins,
                ))

        c = CountingResults(labels)
        c.mObservedCounts = counter(
            labels, options.num_bins, options.resolution, dtype=dtype)

        simulated_counts = []
        for x in range(options.num_samples):
            simulated_counts.append(
                counter(labels, options.num_bins, options.resolution, dtype=dtype))
        c.mSimulatedCounts = simulated_counts
        c.mName = c.mObservedCounts.mName

        counters.append(c)

        E.info("allocated memory successfully")

    segments_per_workspace = []
    segment_sizes = []
    segments_per_label = collections.defaultdict(int)
    workspaces_per_label = collections.defaultdict(int)

    ############################################
    # get observed and simulated counts
    nworkspaces, nempty_workspaces, nempty_contigs, nmiddle = 0, 0, 0, 0
    iteration2 = 0
    for contig, vv in workspace.iteritems():

        iteration2 += 1
        E.info("counting %i/%i: %s %i segments" %
               (iteration2,
                len(workspace),
                contig,
                len(vv)))

        if len(vv) == 0:
            continue

        iteration1 = 0
        for work_start, work_end, v in vv:

            left_labels, right_labels = v[0], v[1]

            iteration1 += 1

            # ignore empty segments
            if contig not in indexed_segments:
                nempty_contigs += 1
                continue

            r = indexed_segments[contig].find(work_start, work_end)
            segments_per_workspace.append(len(r))  # 0 for workspaces without segments

            if not r:
                nempty_workspaces += 1
                continue

            # collect segments and stats
            nworkspaces += 1
            observed = [(x.start, x.end) for x in r]
            observed.sort()
            segment_sizes.extend([x[1] - x[0] for x in observed])

            # collect basic counts
            for label in list(left_labels) + list(right_labels):
                workspaces_per_label[label] += 1
                segments_per_label[label] += len(observed)

            # add observed counts
            for counter in counters:
                counter.mObservedCounts.addCounts(
                    observed, work_start, work_end, left_labels, right_labels)

            # create sampler
            s = sampler(observed, work_start, work_end)

            # add simulated counts
            for iteration in range(options.num_samples):
                simulated = s.sample()
                for counter in counters:
                    counter.mSimulatedCounts[iteration].addCounts(
                        simulated, work_start, work_end, left_labels, right_labels)

    E.info("counting finished")
    E.info("nworkspaces=%i, nmiddle=%i, nempty_workspaces=%i, nempty_contigs=%i" %
           (nworkspaces, nmiddle, nempty_workspaces, nempty_contigs))

    ######################################################
    # transform counts

    if options.transform_counts == "cumulative":
        transform = cumulative_transform
    elif options.transform_counts == "raw":
        transform = normalize_transform

    ####################################################
    # analysis

    if "proximity" in options.analysis:
        outfile_proximity = E.openOutputFile("proximity")
        outfile_proximity.write("\t".join(("label", "observed", "pvalue",
                                           "expected", "CIlower", "CIupper", "qvalue", "segments", "workspaces")) + "\n")
    else:
        outfile_proximity = None

    if "area-under-curve" in options.analysis:
        outfile_auc = E.openOutputFile("auc")
        outfile_auc.write("label\tobserved\texpected\tCIlower\tCIupper\n")
    else:
        outfile_auc = None

    # qvalue: expected proportion of false positives among calls at a given p-value
    if options.do_fdr:
        E.info("computing pvalues for fdr")
        # collect the P-Values of all simulated results to compute FDR
        sim_pvalues = []
        for counter in counters:
            for label in labels:
                E.info("working on counter:%s label:%s" % (counter, label))

                medians = counter.getMedians(label)

                for median in medians:
                    pvalue = float(
                        scipy.stats.percentileofscore(medians, median)) / 100.0
                    sim_pvalues.append(pvalue)

        sim_pvalues.sort()
    else:
        sim_pvalues = []

    # compute observed p-values
    for counter in counters:
        counter.update()

    obs_pvalues = []
    for counter in counters:
        for label in labels:
            obs_pvalues.append(counter.mStats[label].pvalue)
        obs_pvalues.sort()

    # compute q-values (FDR) from observed and simulated p-values
    if options.do_fdr:
        for counter in counters:
            counter.updateFDR(obs_pvalues, sim_pvalues)

    for counter in counters:

        outofbounds_sim, totals_sim = 0, 0
        outofbounds_obs, totals_obs = 0, 0
        for label in labels:
            for sample in range(options.num_samples):
                if counter.mSimulatedCounts[sample].mOutOfBounds[label]:
                    E.debug("out of bounds: sample %i, label %s, counts=%i" %
                            (sample, label, counter.mSimulatedCounts[sample].mOutOfBounds[label]))
                    outofbounds_sim += counter.mSimulatedCounts[
                        sample].mOutOfBounds[label]
                totals_sim += counter.mSimulatedCounts[sample].mTotals[label]

            outofbounds_obs += counter.mObservedCounts.mOutOfBounds[label]
            totals_obs += counter.mObservedCounts.mTotals[label]

        E.info("out of bounds observations: observed=%i/%i (%5.2f%%), simulations=%i/%i (%5.2f%%)" %
               (outofbounds_obs, totals_obs,
                100.0 * outofbounds_obs / totals_obs,
                outofbounds_sim, totals_sim,
                100.0 * outofbounds_sim / totals_sim,
                ))

        for label in labels:

            if outfile_auc:
                mmin, mmax, mmean = counter.getEnvelope(
                    label, transform=normalize_transform)
                obs = normalize_transform(
                    counter.mObservedCounts[label], counter.mObservedCounts.mOutOfBounds[label])

                def block_iterator(a1, a2, a3, num_bins):
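                    # scan the observed curve (a1) for maximal runs where it
                    # exceeds the envelope maximum (a2) and report each run's
                    # excess area over the envelope mean (a3)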
                    x = 0
                    while x < num_bins:
                        while x < num_bins and a1[x] <= a2[x]:
                            x += 1
                        start = x
                        while x < num_bins and a1[x] > a2[x]:
                            x += 1
                        end = x
                        total_a1 = a1[start:end].sum()
                        total_a3 = a3[start:end].sum()
                        if total_a1 > total_a3:
                            yield (total_a1 - total_a3, start, end, total_a1, total_a3)

                blocks = list(
                    block_iterator(obs, mmax, mmean, options.num_bins))

                if options.output_all:
                    for delta, start, end, total_obs, total_mean in blocks:
                        if end - start <= 1:
                            continue
                        outfile_auc.write("%s\t%i\t%i\t%i\t%f\t%f\t%f\t%f\t%f\n" %
                                          (label,
                                           start * options.resolution,
                                           end * options.resolution,
                                           (end - start) * options.resolution,
                                           total_obs,
                                           total_mean,
                                           delta,
                                           total_obs / total_mean,
                                           100.0 * (total_obs / total_mean - 1.0)))

                # output best block, if any
                if blocks:
                    blocks.sort()
                    delta, start, end, total_obs, total_mean = blocks[-1]

                    outfile_auc.write("%s\t%i\t%i\t%i\t%f\t%f\t%f\t%f\t%f\n" %
                                      (label,
                                       start * options.resolution,
                                       end * options.resolution,
                                       (end - start) * options.resolution,
                                       total_obs,
                                       total_mean,
                                       delta,
                                       total_obs / total_mean,
                                       100.0 * (total_obs / total_mean - 1.0)))

            if outfile_proximity:

                # find error bars at median
                st = counter.mStats[label]
                outfile_proximity.write("%s\t%i\t%f\t%i\t%i\t%i\t%s\t%i\t%i\n" %
                                        (label,
                                         st.observed *
                                         options.resolution,
                                         st.pvalue,
                                         st.expected *
                                         options.resolution,
                                         st.ci95lower *
                                         options.resolution,
                                         st.ci95upper *
                                         options.resolution,
                                         IOTools.val2str(st.qvalue),
                                         segments_per_label[label],
                                         workspaces_per_label[label],
                                         ))

    if options.plot:

        for counter in counters:
            plotCounts(counter, options, transform)

        # plot summary stats
        plt.figure()
        plt.title("distribution of workspace length")
        data = []
        for contig, segs in workspace.iteritems():
            if len(segs) == 0:
                continue
            data.extend([x[1] - x[0] for x in segs])

        vals, bins = numpy.histogram(
            data, bins=numpy.arange(0, max(data), 100), new=True)

        t = float(sum(vals))
        plt.plot(bins[:-1], numpy.cumsum(vals) / t)
        plt.gca().set_xscale('log')
        plt.legend()
        plt.xlabel("size of workspace")
        plt.ylabel("cumulative relative frequency")
        if options.hardcopy:
            plt.savefig(
                os.path.expanduser(options.hardcopy % "workspace_size"))

        plt.figure()
        plt.title("segments per block")
        vals, bins = numpy.histogram(segments_per_workspace, bins=numpy.arange(
            0, max(segments_per_workspace), 1), new=True)
        plt.plot(bins[:-1], vals)
        plt.xlabel("segments per block")
        plt.ylabel("absolute frequency")
        if options.hardcopy:
            plt.savefig(
                os.path.expanduser(options.hardcopy % "segments_per_block"))

        plt.figure()
        plt.title("workspaces per label")
        plt.barh(
            range(0, len(labels)), [workspaces_per_label[x] for x in labels], height=0.5)
        plt.yticks(range(0, len(labels)), labels)
        plt.ylabel("workspaces per label")
        plt.xlabel("absolute frequency")
        plt.gca().set_xscale('log')

        if options.hardcopy:
            plt.savefig(
                os.path.expanduser(options.hardcopy % "workspaces_per_label"))

        plt.figure()
        plt.title("segments per label")
        plt.barh(range(0, len(labels)), [segments_per_label[x]
                                         for x in labels], height=0.5)
        plt.yticks(range(0, len(labels)), labels)
        plt.ylabel("segments per label")
        plt.xlabel("absolute frequency")
        if options.hardcopy:
            plt.savefig(
                os.path.expanduser(options.hardcopy % "segments_per_label"))

        if not options.hardcopy:
            plt.show()

    E.Stop()
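The FDR block above turns each simulated median into an empirical p-value by ranking it within the full set of simulated medians. A self-contained sketch of that conversion (input values made up for illustration):

import scipy.stats

medians = [3.0, 5.0, 5.0, 7.0, 9.0]  # illustrative simulated medians
# percentileofscore ranks a score within a sample; dividing by 100
# maps the rank to an empirical p-value in [0, 1]
sim_pvalues = sorted(
    float(scipy.stats.percentileofscore(medians, m)) / 100.0 for m in medians)
print(sim_pvalues)  # [0.2, 0.5, 0.5, 0.8, 1.0]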
Example #3
0
def main(argv=sys.argv):

    parser = E.OptionParser(
        version=
        "%prog version: $Id: annotator_distance.py 2861 2010-02-23 17:36:32Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option(
        "-a",
        "--annotations-tsv-file",
        dest="filename_annotations",
        type="string",
        help=
        "filename mapping gene ids to annotations (a tab-separated table with two-columns) [default=%default]."
    )

    parser.add_option("-r",
                      "--resolution",
                      dest="resolution",
                      type="int",
                      help="resolution of count vector [default=%default].")

    parser.add_option(
        "-b",
        "--num-bins",
        dest="num_bins",
        type="int",
        help="number of bins in count vector [default=%default].")

    parser.add_option("-i",
                      "--num-samples",
                      dest="num_samples",
                      type="int",
                      help="sample size to compute [default=%default].")

    parser.add_option(
        "-w",
        "--workspace-bed-file",
        dest="filename_workspace",
        type="string",
        help="filename with workspace information [default=%default].")

    parser.add_option(
        "--workspace-builder",
        dest="workspace_builder",
        type="choice",
        choices=("gff", "gtf-intergenic", "gtf-intronic", "gtf-genic"),
        help="given a gff/gtf file build a workspace [default=%default].")

    parser.add_option(
        "--workspace-labels",
        dest="workspace_labels",
        type="choice",
        choices=("none", "direction", "annotation"),
        help="labels to use for the workspace workspace [default=%default].")

    parser.add_option(
        "--sampler",
        dest="sampler",
        type="choice",
        choices=("permutation", "gaps"),
        help=
        "sampler to use. The sampler determines the null model of how segments are distributed in the workspace  [default=%default]"
    )

    parser.add_option(
        "--counter",
        dest="counters",
        type="choice",
        action="append",
        choices=("transcription", "closest-distance", "all-distances"),
        help=
        "counter to use. The counter computes the quantity of interest [default=%default]"
    )

    parser.add_option("--analysis",
                      dest="analysis",
                      type="choice",
                      action="append",
                      choices=("proximity", "area-under-curve"),
                      help="analysis to perform [default=%default]")

    parser.add_option("--transform-counts",
                      dest="transform_counts",
                      type="choice",
                      choices=("raw", "cumulative"),
                      help="cumulate counts [default=%default].")

    parser.add_option(
        "-s",
        "--segments",
        dest="filename_segments",
        type="string",
        help="filename with segment information [default=%default].")

    parser.add_option("--xrange",
                      dest="xrange",
                      type="string",
                      help="xrange to plot [default=%default]")

    parser.add_option("-o",
                      "--logscale",
                      dest="logscale",
                      type="string",
                      help="use logscale on x, y or xy [default=%default]")

    parser.add_option("-p",
                      "--plot",
                      dest="plot",
                      action="store_true",
                      help="output plots [default=%default]")

    parser.add_option("--hardcopy",
                      dest="hardcopy",
                      type="string",
                      help="output hardcopies to file [default=%default]")

    parser.add_option("--no-fdr",
                      dest="do_fdr",
                      action="store_false",
                      help="do not compute FDR rates [default=%default]")

    parser.add_option("--segments-format",
                      dest="segments_format",
                      type="choice",
                      choices=("gtf", "bed"),
                      help="format of segments file [default=%default].")

    parser.add_option(
        "--truncate",
        dest="truncate",
        action="store_true",
        help="truncate segments extending beyond a workspace [default=%default]"
    )

    parser.add_option(
        "--remove-overhangs",
        dest="remove_overhangs",
        action="store_true",
        help="remove segments extending beyond a workspace[default=%default]")

    parser.add_option(
        "--keep-ambiguous",
        dest="keep_ambiguous",
        action="store_true",
        help=
        "keep segments extending to more than one workspace [default=%default]"
    )

    parser.set_defaults(
        filename_annotations=None,
        filename_workspace="workspace.gff",
        filename_segments="FastDown.gtf",
        filename_annotations_gtf="../data/tg1_territories.gff",
        workspace_builder="gff",
        workspace_labels="none",
        sampler="permutation",
        truncate=False,
        num_bins=10000,
        num_samples=10,
        resolution=100,
        plot_samples=False,
        plot_envelope=True,
        counters=[],
        transform_counts="raw",
        xrange=None,
        plot=False,
        logscale=None,
        output_all=False,
        do_test=False,
        analysis=[],
        do_fdr=True,
        hardcopy="%s.png",
        segments_format="gtf",
        remove_overhangs=False,
    )

    (options, args) = E.Start(parser, argv=argv, add_output_options=True)

    ###########################################
    # setup options
    if options.sampler == "permutation":
        sampler = SamplerPermutation
    elif options.sampler == "gaps":
        sampler = SamplerGaps

    if options.xrange:
        options.xrange = map(float, options.xrange.split(","))

    if len(options.counters) == 0:
        raise ValueError("please specify at least one counter.")

    if len(options.analysis) == 0:
        raise ValueError("please specify at least one analysis.")

    if options.workspace_labels == "annotation" and not options.filename_annotations:
        raise ValueError(
            "please specify --annotations-tsv-file is --workspace-labels=annotations."
        )

    ###########################################
    # read data
    if options.workspace_labels == "annotation":

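        # gene ids without an annotation fall back to the label ("unknown",)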
        def constant_factory(value):
            return itertools.repeat(value).next

        def dicttype():
            return collections.defaultdict(constant_factory(("unknown", )))

        map_id2annotations = IOTools.readMultiMap(open(
            options.filename_annotations, "r"),
                                                  dtype=dicttype)
    else:
        map_id2annotations = {}

    workspace = readWorkspace(open(options.filename_workspace,
                                   "r"), options.workspace_builder,
                              options.workspace_labels, map_id2annotations)

    E.info("read workspace for %i contigs" % (len(workspace)))

    indexed_workspace = indexIntervals(workspace, with_values=True)
    segments = readSegments(open(options.filename_segments, "r"),
                            indexed_workspace,
                            format=options.segments_format,
                            keep_ambiguous=options.keep_ambiguous,
                            truncate=options.truncate,
                            remove_overhangs=options.remove_overhangs)

    nsegments = 0
    for contig, vv in segments.iteritems():
        nsegments += len(vv)

    E.info("read %i segments for %i contigs" % (nsegments, len(workspace)))
    indexed_segments = indexIntervals(segments, with_values=False)

    if nsegments == 0:
        E.warn("no segments read - no computation done.")
        E.Stop()
        return

    # build labels
    labels = collections.defaultdict(int)
    for contig, vv in workspace.iteritems():
        for start, end, v in vv:
            for l in v[0]:
                labels[l] += 1
            for l in v[1]:
                labels[l] += 1

    E.info("found %i workspace labels" % len(labels))

    ###########################################
    # setup counting containers
    counters = []
    for cc in options.counters:

        if cc == "transcription":
            counter = CounterTranscription
        elif cc == "closest-distance":
            counter = CounterClosestDistance
        elif cc == "all-distances":
            counter = CounterAllDistances

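        # choose the smallest unsigned dtype that can hold per-bin segment counts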
        if nsegments < 256:
            dtype = numpy.uint8
        elif nsegments < 65536:
            dtype = numpy.uint16
        elif nsegments < 4294967296:
            dtype = numpy.uint32
        else:
            dtype = numpy.uint64  # numpy.int is builtin int; int() has no .itemsize

        E.debug("choosen dtype %s" % str(dtype))

        E.info("samples space is %i bases: %i bins at %i resolution" % (
            options.num_bins * options.resolution,
            options.num_bins,
            options.resolution,
        ))

        E.info("allocating counts: %i bytes (%i labels, %i samples, %i bins)" %
               (
                   options.num_bins * len(labels) * dtype().itemsize *
                   (options.num_samples + 1),
                   len(labels),
                   options.num_samples,
                   options.num_bins,
               ))

        c = CountingResults(labels)
        c.mObservedCounts = counter(labels,
                                    options.num_bins,
                                    options.resolution,
                                    dtype=dtype)

        simulated_counts = []
        for x in range(options.num_samples):
            simulated_counts.append(
                counter(labels,
                        options.num_bins,
                        options.resolution,
                        dtype=dtype))
        c.mSimulatedCounts = simulated_counts
        c.mName = c.mObservedCounts.mName

        counters.append(c)

        E.info("allocated memory successfully")

    segments_per_workspace = []
    segment_sizes = []
    segments_per_label = collections.defaultdict(int)
    workspaces_per_label = collections.defaultdict(int)

    ############################################
    # get observed and simulated counts
    nworkspaces, nempty_workspaces, nempty_contigs, nmiddle = 0, 0, 0, 0
    iteration2 = 0
    for contig, vv in workspace.iteritems():

        iteration2 += 1
        E.info("counting %i/%i: %s %i segments" %
               (iteration2, len(workspace), contig, len(vv)))

        if len(vv) == 0:
            continue

        iteration1 = 0
        for work_start, work_end, v in vv:

            left_labels, right_labels = v[0], v[1]

            iteration1 += 1

            # ignore empty segments
            if contig not in indexed_segments:
                nempty_contigs += 1
                continue

            r = indexed_segments[contig].find(work_start, work_end)
            segments_per_workspace.append(len(r))  # 0 for workspaces without segments

            if not r:
                nempty_workspaces += 1
                continue

            # collect segments and stats
            nworkspaces += 1
            observed = [(x.start, x.end) for x in r]
            observed.sort()
            segment_sizes.extend([x[1] - x[0] for x in observed])

            # collect basic counts
            for label in list(left_labels) + list(right_labels):
                workspaces_per_label[label] += 1
                segments_per_label[label] += len(observed)

            # add observed counts
            for counter in counters:
                counter.mObservedCounts.addCounts(observed, work_start,
                                                  work_end, left_labels,
                                                  right_labels)

            # create sampler
            s = sampler(observed, work_start, work_end)

            # add simulated counts
            for iteration in range(options.num_samples):
                simulated = s.sample()
                for counter in counters:
                    counter.mSimulatedCounts[iteration].addCounts(
                        simulated, work_start, work_end, left_labels,
                        right_labels)

    E.info("counting finished")
    E.info(
        "nworkspaces=%i, nmiddle=%i, nempty_workspaces=%i, nempty_contigs=%i" %
        (nworkspaces, nmiddle, nempty_workspaces, nempty_contigs))

    ######################################################
    # transform counts

    if options.transform_counts == "cumulative":
        transform = cumulative_transform
    elif options.transform_counts == "raw":
        transform = normalize_transform

    ####################################################
    # analysis

    if "proximity" in options.analysis:
        outfile_proximity = E.openOutputFile("proximity")
        outfile_proximity.write("\t".join(
            ("label", "observed", "pvalue", "expected", "CIlower", "CIupper",
             "qvalue", "segments", "workspaces")) + "\n")
    else:
        outfile_proximity = None

    if "area-under-curve" in options.analysis:
        outfile_auc = E.openOutputFile("auc")
        outfile_auc.write("label\tobserved\texpected\tCIlower\tCIupper\n")
    else:
        outfile_auc = None

    # qvalue: expected proportion of false positives among calls at a given p-value
    if options.do_fdr:
        E.info("computing pvalues for fdr")
        # collect the P-Values of all simulated results to compute FDR
        sim_pvalues = []
        for counter in counters:
            for label in labels:
                E.info("working on counter:%s label:%s" % (counter, label))

                medians = counter.getMedians(label)

                for median in medians:
                    pvalue = float(
                        scipy.stats.percentileofscore(medians, median)) / 100.0
                    sim_pvalues.append(pvalue)

        sim_pvalues.sort()
    else:
        sim_pvalues = []

    # compute observed p-values
    for counter in counters:
        counter.update()

    obs_pvalues = []
    for counter in counters:
        for label in labels:
            obs_pvalues.append(counter.mStats[label].pvalue)
        obs_pvalues.sort()

    # compute q-values (FDR) from observed and simulated p-values
    if options.do_fdr:
        for counter in counters:
            counter.updateFDR(obs_pvalues, sim_pvalues)

    for counter in counters:

        outofbounds_sim, totals_sim = 0, 0
        outofbounds_obs, totals_obs = 0, 0
        for label in labels:
            for sample in range(options.num_samples):
                if counter.mSimulatedCounts[sample].mOutOfBounds[label]:
                    E.debug(
                        "out of bounds: sample %i, label %s, counts=%i" %
                        (sample, label,
                         counter.mSimulatedCounts[sample].mOutOfBounds[label]))
                    outofbounds_sim += counter.mSimulatedCounts[
                        sample].mOutOfBounds[label]
                totals_sim += counter.mSimulatedCounts[sample].mTotals[label]

            outofbounds_obs += counter.mObservedCounts.mOutOfBounds[label]
            totals_obs += counter.mObservedCounts.mTotals[label]

        E.info(
            "out of bounds observations: observed=%i/%i (%5.2f%%), simulations=%i/%i (%5.2f%%)"
            % (
                outofbounds_obs,
                totals_obs,
                100.0 * outofbounds_obs / totals_obs,
                outofbounds_sim,
                totals_sim,
                100.0 * outofbounds_sim / totals_sim,
            ))

        for label in labels:

            if outfile_auc:
                mmin, mmax, mmean = counter.getEnvelope(
                    label, transform=normalize_transform)
                obs = normalize_transform(
                    counter.mObservedCounts[label],
                    counter.mObservedCounts.mOutOfBounds[label])

                def block_iterator(a1, a2, a3, num_bins):
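                    # scan the observed curve (a1) for maximal runs where it
                    # exceeds the envelope maximum (a2) and report each run's
                    # excess area over the envelope mean (a3)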
                    x = 0
                    while x < num_bins:
                        while x < num_bins and a1[x] <= a2[x]:
                            x += 1
                        start = x
                        while x < num_bins and a1[x] > a2[x]:
                            x += 1
                        end = x
                        total_a1 = a1[start:end].sum()
                        total_a3 = a3[start:end].sum()
                        if total_a1 > total_a3:
                            yield (total_a1 - total_a3, start, end, total_a1,
                                   total_a3)

                blocks = list(
                    block_iterator(obs, mmax, mmean, options.num_bins))

                if options.output_all:
                    for delta, start, end, total_obs, total_mean in blocks:
                        if end - start <= 1:
                            continue
                        outfile_auc.write(
                            "%s\t%i\t%i\t%i\t%f\t%f\t%f\t%f\t%f\n" %
                            (label, start * options.resolution,
                             end * options.resolution,
                             (end - start) * options.resolution, total_obs,
                             total_mean, delta, total_obs / total_mean, 100.0 *
                             (total_obs / total_mean - 1.0)))

                # output best block, if any
                if blocks:
                    blocks.sort()
                    delta, start, end, total_obs, total_mean = blocks[-1]

                    outfile_auc.write(
                        "%s\t%i\t%i\t%i\t%f\t%f\t%f\t%f\t%f\n" %
                        (label, start * options.resolution,
                         end * options.resolution,
                         (end - start) * options.resolution, total_obs,
                         total_mean, delta, total_obs / total_mean, 100.0 *
                         (total_obs / total_mean - 1.0)))

            if outfile_proximity:

                # find error bars at median
                st = counter.mStats[label]
                outfile_proximity.write(
                    "%s\t%i\t%f\t%i\t%i\t%i\t%s\t%i\t%i\n" % (
                        label,
                        st.observed * options.resolution,
                        st.pvalue,
                        st.expected * options.resolution,
                        st.ci95lower * options.resolution,
                        st.ci95upper * options.resolution,
                        IOTools.val2str(st.qvalue),
                        segments_per_label[label],
                        workspaces_per_label[label],
                    ))

    if options.plot:

        for counter in counters:
            plotCounts(counter, options, transform)

        # plot summary stats
        plt.figure()
        plt.title("distribution of workspace length")
        data = []
        for contig, segs in workspace.iteritems():
            if len(segs) == 0:
                continue
            data.extend([x[1] - x[0] for x in segs])

        vals, bins = numpy.histogram(data,
                                     bins=numpy.arange(0, max(data), 100),
                                     new=True)

        t = float(sum(vals))
        plt.plot(bins[:-1], numpy.cumsum(vals) / t)
        plt.gca().set_xscale('log')
        plt.legend()
        plt.xlabel("size of workspace")
        plt.ylabel("cumulative relative frequency")
        if options.hardcopy:
            plt.savefig(os.path.expanduser(options.hardcopy %
                                           "workspace_size"))

        plt.figure()
        plt.title("segments per block")
        vals, bins = numpy.histogram(segments_per_workspace,
                                     bins=numpy.arange(
                                         0, max(segments_per_workspace), 1),
                                     new=True)
        plt.plot(bins[:-1], vals)
        plt.xlabel("segments per block")
        plt.ylabel("absolute frequency")
        if options.hardcopy:
            plt.savefig(
                os.path.expanduser(options.hardcopy % "segments_per_block"))

        plt.figure()
        plt.title("workspaces per label")
        plt.barh(range(0, len(labels)),
                 [workspaces_per_label[x] for x in labels],
                 height=0.5)
        plt.yticks(range(0, len(labels)), labels)
        plt.ylabel("workspaces per label")
        plt.xlabel("absolute frequency")
        plt.gca().set_xscale('log')

        if options.hardcopy:
            plt.savefig(
                os.path.expanduser(options.hardcopy % "workspaces_per_label"))

        plt.figure()
        plt.title("segments per label")
        plt.barh(range(0, len(labels)),
                 [segments_per_label[x] for x in labels],
                 height=0.5)
        plt.yticks(range(0, len(labels)), labels)
        plt.ylabel("segments per label")
        plt.xlabel("absolute frequency")
        if options.hardcopy:
            plt.savefig(
                os.path.expanduser(options.hardcopy % "segments_per_label"))

        if not options.hardcopy:
            plt.show()

    E.Stop()
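Both listings select the counts dtype with an explicit uint8/uint16/uint32 ladder. numpy.min_scalar_type expresses the same choice in one call; a short sketch (assuming numpy >= 1.6):

import numpy

def counts_dtype(nsegments):
    # smallest integer dtype that can hold nsegments; mirrors the ladder
    # above, including an unsigned 64-bit fallback for very large inputs
    return numpy.min_scalar_type(nsegments)

print(counts_dtype(200))    # uint8
print(counts_dtype(70000))  # uint32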