Example #1
0
 def __str__(self):
     """return string representation."""
     return "%i\t%i\t%s\t%i\t%i\t%s\t%s\t%6.4e\t%6.4e\t%6.4e" % \
         (self.mSampleCountsCategory,
          self.mSampleCountsTotal,
          IOTools.prettyPercent(
              self.mSampleCountsCategory, self.mSampleCountsTotal),
          self.mBackgroundCountsCategory,
          self.mBackgroundCountsTotal,
          IOTools.prettyPercent(
              self.mBackgroundCountsCategory, self.mBackgroundCountsTotal),
          IOTools.val2str(self.mRatio),
          self.mPValue,
          self.mProbabilityOverRepresentation,
          self.mProbabilityUnderRepresentation)
Example #2
0
def main(argv=sys.argv):

    parser = E.OptionParser(
        version="%prog version: $Id: annotator_distance.py 2861 2010-02-23 17:36:32Z andreas $", usage=globals()["__doc__"])

    parser.add_option("-a", "--annotations-tsv-file", dest="filename_annotations", type="string",
                      help="filename mapping gene ids to annotations (a tab-separated table with two-columns) [default=%default].")

    parser.add_option("-r", "--resolution", dest="resolution", type="int",
                      help="resolution of count vector [default=%default].")

    parser.add_option("-b", "--num-bins", dest="num_bins", type="int",
                      help="number of bins in count vector [default=%default].")

    parser.add_option("-i", "--num-samples", dest="num_samples", type="int",
                      help="sample size to compute [default=%default].")

    parser.add_option("-w", "--workspace-bed-file", dest="filename_workspace", type="string",
                      help="filename with workspace information [default=%default].")

    parser.add_option("--workspace-builder", dest="workspace_builder", type="choice",
                      choices=(
                          "gff", "gtf-intergenic", "gtf-intronic", "gtf-genic"),
                      help="given a gff/gtf file build a workspace [default=%default].")

    parser.add_option("--workspace-labels", dest="workspace_labels", type="choice",
                      choices=("none", "direction", "annotation"),
                      help="labels to use for the workspace workspace [default=%default].")

    parser.add_option("--sampler", dest="sampler", type="choice",
                      choices=("permutation", "gaps"),
                      help="sampler to use. The sampler determines the null model of how segments are distributed in the workspace  [default=%default]")

    parser.add_option("--counter", dest="counters", type="choice", action="append",
                      choices=(
                          "transcription", "closest-distance", "all-distances"),
                      help="counter to use. The counter computes the quantity of interest [default=%default]")

    parser.add_option("--analysis", dest="analysis", type="choice", action="append",
                      choices=("proximity", "area-under-curve"),
                      help="analysis to perform [default=%default]")

    parser.add_option("--transform-counts", dest="transform_counts", type="choice",
                      choices=("raw", "cumulative"),
                      help="cumulate counts [default=%default].")

    parser.add_option("-s", "--segments", dest="filename_segments", type="string",
                      help="filename with segment information [default=%default].")

    parser.add_option("--xrange", dest="xrange", type="string",
                      help="xrange to plot [default=%default]")

    parser.add_option("-o", "--logscale", dest="logscale", type="string",
                      help="use logscale on x, y or xy [default=%default]")

    parser.add_option("-p", "--plot", dest="plot", action="store_true",
                      help="output plots [default=%default]")

    parser.add_option("--hardcopy", dest="hardcopy", type="string",
                      help="output hardcopies to file [default=%default]")

    parser.add_option("--no-fdr", dest="do_fdr", action="store_false",
                      help="do not compute FDR rates [default=%default]")

    parser.add_option("--segments-format", dest="segments_format", type="choice",
                      choices=("gtf", "bed"),
                      help="format of segments file [default=%default].")

    parser.add_option("--truncate", dest="truncate", action="store_true",
                      help="truncate segments extending beyond a workspace [default=%default]")

    parser.add_option("--remove-overhangs", dest="remove_overhangs", action="store_true",
                      help="remove segments extending beyond a workspace[default=%default]")

    parser.add_option("--keep-ambiguous", dest="keep_ambiguous", action="store_true",
                      help="keep segments extending to more than one workspace [default=%default]")

    parser.set_defaults(
        filename_annotations=None,
        filename_workspace="workspace.gff",
        filename_segments="FastDown.gtf",
        filename_annotations_gtf="../data/tg1_territories.gff",
        workspace_builder="gff",
        workspace_labels="none",
        sampler="permutation",
        truncate=False,
        num_bins=10000,
        num_samples=10,
        resolution=100,
        plot_samples=False,
        plot_envelope=True,
        counters=[],
        transform_counts="raw",
        xrange=None,
        plot=False,
        logscale=None,
        output_all=False,
        do_test=False,
        analysis=[],
        do_fdr=True,
        hardcopy="%s.png",
        segments_format="gtf",
        remove_overhangs=False,
    )

    (options, args) = E.Start(parser, argv=argv, add_output_options=True)

    ###########################################
    # setup options
    if options.sampler == "permutation":
        sampler = SamplerPermutation
    elif options.sampler == "gaps":
        sampler = SamplerGaps

    if options.xrange:
        options.xrange = map(float, options.xrange.split(","))

    if len(options.counters) == 0:
        raise ValueError("please specify at least one counter.")

    if len(options.analysis) == 0:
        raise ValueError("please specify at least one analysis.")

    if options.workspace_labels == "annotation" and not options.filename_annotations:
        raise ValueError(
            "please specify --annotations-tsv-file is --workspace-labels=annotations.")

    ###########################################
    # read data
    if options.workspace_labels == "annotation":
        def constant_factory(value):
            return itertools.repeat(value).next

        def dicttype():
            return collections.defaultdict(constant_factory(("unknown",)))

        map_id2annotations = IOTools.readMultiMap(open(options.filename_annotations, "r"),
                                                  dtype=dicttype)
    else:
        map_id2annotations = {}

    workspace = readWorkspace(open(options.filename_workspace, "r"),
                              options.workspace_builder,
                              options.workspace_labels,
                              map_id2annotations)

    E.info("read workspace for %i contigs" % (len(workspace)))

    indexed_workspace = indexIntervals(workspace, with_values=True)
    segments = readSegments(open(options.filename_segments, "r"), indexed_workspace,
                            format=options.segments_format,
                            keep_ambiguous=options.keep_ambiguous,
                            truncate=options.truncate,
                            remove_overhangs=options.remove_overhangs)

    nsegments = 0
    for contig, vv in segments.iteritems():
        nsegments += len(vv)

    E.info("read %i segments for %i contigs" % (nsegments, len(workspace)))
    indexed_segments = indexIntervals(segments, with_values=False)

    if nsegments == 0:
        E.warn("no segments read - no computation done.")
        E.Stop()
        return

    # build labels
    labels = collections.defaultdict(int)
    for contig, vv in workspace.iteritems():
        for start, end, v in vv:
            for l in v[0]:
                labels[l] += 1
            for l in v[1]:
                labels[l] += 1

    E.info("found %i workspace labels" % len(labels))

    ###########################################
    # setup counting containers
    counters = []
    for cc in options.counters:

        if cc == "transcription":
            counter = CounterTranscription
        elif cc == "closest-distance":
            counter = CounterClosestDistance
        elif cc == "all-distances":
            counter = CounterAllDistances

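        # choose the smallest unsigned dtype that can hold per-bin segment counts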
        if nsegments < 256:
            dtype = numpy.uint8
        elif nsegments < 65536:
            dtype = numpy.uint16
        elif nsegments < 4294967296:
            dtype = numpy.uint32
        else:
            dtype = numpy.uint64  # numpy.int is builtin int; int() has no .itemsize

        E.debug("choosen dtype %s" % str(dtype))

        E.info("samples space is %i bases: %i bins at %i resolution" %
               (options.num_bins * options.resolution,
                options.num_bins,
                options.resolution,
                ))

        E.info("allocating counts: %i bytes (%i labels, %i samples, %i bins)" %
               (options.num_bins * len(labels) * dtype().itemsize * (options.num_samples + 1),
                len(labels),
                options.num_samples,
                options.num_bins,
                ))

        c = CountingResults(labels)
        c.mObservedCounts = counter(
            labels, options.num_bins, options.resolution, dtype=dtype)

        simulated_counts = []
        for x in range(options.num_samples):
            simulated_counts.append(
                counter(labels, options.num_bins, options.resolution, dtype=dtype))
        c.mSimulatedCounts = simulated_counts
        c.mName = c.mObservedCounts.mName

        counters.append(c)

        E.info("allocated memory successfully")

    segments_per_workspace = []
    segment_sizes = []
    segments_per_label = collections.defaultdict(int)
    workspaces_per_label = collections.defaultdict(int)

    ############################################
    # get observed and simulated counts
    nworkspaces, nempty_workspaces, nempty_contigs, nmiddle = 0, 0, 0, 0
    iteration2 = 0
    for contig, vv in workspace.iteritems():

        iteration2 += 1
        E.info("counting %i/%i: %s %i segments" %
               (iteration2,
                len(workspace),
                contig,
                len(vv)))

        if len(vv) == 0:
            continue

        iteration1 = 0
        for work_start, work_end, v in vv:

            left_labels, right_labels = v[0], v[1]

            iteration1 += 1

            # ignore empty segments
            if contig not in indexed_segments:
                nempty_contigs += 1
                continue

            r = indexed_segments[contig].find(work_start, work_end)
            segments_per_workspace.append(len(r))  # 0 for workspaces without segments

            if not r:
                nempty_workspaces += 1
                continue

            # collect segments and stats
            nworkspaces += 1
            observed = [(x.start, x.end) for x in r]
            observed.sort()
            segment_sizes.extend([x[1] - x[0] for x in observed])

            # collect basic counts
            for label in list(left_labels) + list(right_labels):
                workspaces_per_label[label] += 1
                segments_per_label[label] += len(observed)

            # add observed counts
            for counter in counters:
                counter.mObservedCounts.addCounts(
                    observed, work_start, work_end, left_labels, right_labels)

            # create sampler
            s = sampler(observed, work_start, work_end)

            # add simulated counts
            for iteration in range(options.num_samples):
                simulated = s.sample()
                for counter in counters:
                    counter.mSimulatedCounts[iteration].addCounts(
                        simulated, work_start, work_end, left_labels, right_labels)

    E.info("counting finished")
    E.info("nworkspaces=%i, nmiddle=%i, nempty_workspaces=%i, nempty_contigs=%i" %
           (nworkspaces, nmiddle, nempty_workspaces, nempty_contigs))

    ######################################################
    # transform counts

    if options.transform_counts == "cumulative":
        transform = cumulative_transform
    elif options.transform_counts == "raw":
        transform = normalize_transform

    ####################################################
    # analysis

    if "proximity" in options.analysis:
        outfile_proximity = E.openOutputFile("proximity")
        outfile_proximity.write("\t".join(("label", "observed", "pvalue",
                                           "expected", "CIlower", "CIupper", "qvalue", "segments", "workspaces")) + "\n")
    else:
        outfile_proximity = None

    if "area-under-curve" in options.analysis:
        outfile_auc = E.openOutputFile("auc")
        outfile_auc.write("label\tobserved\texpected\tCIlower\tCIupper\n")
    else:
        outfile_auc = None

    # qvalue: expected proportion of false positives among calls at a given p-value
    if options.do_fdr:
        E.info("computing pvalues for fdr")
        # collect the P-Values of all simulated results to compute FDR
        sim_pvalues = []
        for counter in counters:
            for label in labels:
                E.info("working on counter:%s label:%s" % (counter, label))

                medians = counter.getMedians(label)

                for median in medians:
                    pvalue = float(
                        scipy.stats.percentileofscore(medians, median)) / 100.0
                    sim_pvalues.append(pvalue)

        sim_pvalues.sort()
    else:
        sim_pvalues = []

    # compute observed p-values
    for counter in counters:
        counter.update()

    obs_pvalues = []
    for counter in counters:
        for label in labels:
            obs_pvalues.append(counter.mStats[label].pvalue)
        obs_pvalues.sort()

    # compute q-values (FDR) from observed and simulated p-values
    if options.do_fdr:
        for counter in counters:
            counter.updateFDR(obs_pvalues, sim_pvalues)

    for counter in counters:

        outofbounds_sim, totals_sim = 0, 0
        outofbounds_obs, totals_obs = 0, 0
        for label in labels:
            for sample in range(options.num_samples):
                if counter.mSimulatedCounts[sample].mOutOfBounds[label]:
                    E.debug("out of bounds: sample %i, label %s, counts=%i" %
                            (sample, label, counter.mSimulatedCounts[sample].mOutOfBounds[label]))
                    outofbounds_sim += counter.mSimulatedCounts[
                        sample].mOutOfBounds[label]
                totals_sim += counter.mSimulatedCounts[sample].mTotals[label]

            outofbounds_obs += counter.mObservedCounts.mOutOfBounds[label]
            totals_obs += counter.mObservedCounts.mTotals[label]

        E.info("out of bounds observations: observed=%i/%i (%5.2f%%), simulations=%i/%i (%5.2f%%)" %
               (outofbounds_obs, totals_obs,
                100.0 * outofbounds_obs / totals_obs,
                outofbounds_sim, totals_sim,
                100.0 * outofbounds_sim / totals_sim,
                ))

        for label in labels:

            if outfile_auc:
                mmin, mmax, mmean = counter.getEnvelope(
                    label, transform=normalize_transform)
                obs = normalize_transform(
                    counter.mObservedCounts[label], counter.mObservedCounts.mOutOfBounds[label])

                def block_iterator(a1, a2, a3, num_bins):
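                    # scan the observed curve (a1) for maximal runs where it
                    # exceeds the envelope maximum (a2) and report each run's
                    # excess area over the envelope mean (a3)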
                    x = 0
                    while x < num_bins:
                        while x < num_bins and a1[x] <= a2[x]:
                            x += 1
                        start = x
                        while x < num_bins and a1[x] > a2[x]:
                            x += 1
                        end = x
                        total_a1 = a1[start:end].sum()
                        total_a3 = a3[start:end].sum()
                        if total_a1 > total_a3:
                            yield (total_a1 - total_a3, start, end, total_a1, total_a3)

                blocks = list(
                    block_iterator(obs, mmax, mmean, options.num_bins))

                if options.output_all:
                    for delta, start, end, total_obs, total_mean in blocks:
                        if end - start <= 1:
                            continue
                        outfile_auc.write("%s\t%i\t%i\t%i\t%f\t%f\t%f\t%f\t%f\n" %
                                          (label,
                                           start * options.resolution,
                                           end * options.resolution,
                                           (end - start) * options.resolution,
                                           total_obs,
                                           total_mean,
                                           delta,
                                           total_obs / total_mean,
                                           100.0 * (total_obs / total_mean - 1.0)))

                # output best block, if any
                if blocks:
                    blocks.sort()
                    delta, start, end, total_obs, total_mean = blocks[-1]

                    outfile_auc.write("%s\t%i\t%i\t%i\t%f\t%f\t%f\t%f\t%f\n" %
                                      (label,
                                       start * options.resolution,
                                       end * options.resolution,
                                       (end - start) * options.resolution,
                                       total_obs,
                                       total_mean,
                                       delta,
                                       total_obs / total_mean,
                                       100.0 * (total_obs / total_mean - 1.0)))

            if outfile_proximity:

                # find error bars at median
                st = counter.mStats[label]
                outfile_proximity.write("%s\t%i\t%f\t%i\t%i\t%i\t%s\t%i\t%i\n" %
                                        (label,
                                         st.observed *
                                         options.resolution,
                                         st.pvalue,
                                         st.expected *
                                         options.resolution,
                                         st.ci95lower *
                                         options.resolution,
                                         st.ci95upper *
                                         options.resolution,
                                         IOTools.val2str(st.qvalue),
                                         segments_per_label[label],
                                         workspaces_per_label[label],
                                         ))

    if options.plot:

        for counter in counters:
            plotCounts(counter, options, transform)

        # plot summary stats
        plt.figure()
        plt.title("distribution of workspace length")
        data = []
        for contig, segs in workspace.iteritems():
            if len(segs) == 0:
                continue
            data.extend([x[1] - x[0] for x in segs])

        vals, bins = numpy.histogram(
            data, bins=numpy.arange(0, max(data), 100), new=True)

        t = float(sum(vals))
        plt.plot(bins[:-1], numpy.cumsum(vals) / t)
        plt.gca().set_xscale('log')
        plt.legend()
        plt.xlabel("size of workspace")
        plt.ylabel("cumulative relative frequency")
        if options.hardcopy:
            plt.savefig(
                os.path.expanduser(options.hardcopy % "workspace_size"))

        plt.figure()
        plt.title("segments per block")
        vals, bins = numpy.histogram(segments_per_workspace, bins=numpy.arange(
            0, max(segments_per_workspace), 1), new=True)
        plt.plot(bins[:-1], vals)
        plt.xlabel("segments per block")
        plt.ylabel("absolute frequency")
        if options.hardcopy:
            plt.savefig(
                os.path.expanduser(options.hardcopy % "segments_per_block"))

        plt.figure()
        plt.title("workspaces per label")
        plt.barh(
            range(0, len(labels)), [workspaces_per_label[x] for x in labels], height=0.5)
        plt.yticks(range(0, len(labels)), labels)
        plt.ylabel("workspaces per label")
        plt.xlabel("absolute frequency")
        plt.gca().set_xscale('log')

        if options.hardcopy:
            plt.savefig(
                os.path.expanduser(options.hardcopy % "workspaces_per_label"))

        plt.figure()
        plt.title("segments per label")
        plt.barh(range(0, len(labels)), [segments_per_label[x]
                                         for x in labels], height=0.5)
        plt.yticks(range(0, len(labels)), labels)
        plt.ylabel("segments per label")
        plt.xlabel("absolute frequency")
        if options.hardcopy:
            plt.savefig(
                os.path.expanduser(options.hardcopy % "segments_per_label"))

        if not options.hardcopy:
            plt.show()

    E.Stop()
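The FDR block above turns each simulated median into an empirical p-value by ranking it within the full set of simulated medians. A self-contained sketch of that conversion (input values made up for illustration):

import scipy.stats

medians = [3.0, 5.0, 5.0, 7.0, 9.0]  # illustrative simulated medians
# percentileofscore ranks a score within a sample; dividing by 100
# maps the rank to an empirical p-value in [0, 1]
sim_pvalues = sorted(
    float(scipy.stats.percentileofscore(medians, m)) / 100.0 for m in medians)
print(sim_pvalues)  # [0.2, 0.5, 0.5, 0.8, 1.0]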
Example #3
0
def main(argv=sys.argv):

    parser = E.OptionParser(
        version=
        "%prog version: $Id: annotator_distance.py 2861 2010-02-23 17:36:32Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option(
        "-a",
        "--annotations-tsv-file",
        dest="filename_annotations",
        type="string",
        help=
        "filename mapping gene ids to annotations (a tab-separated table with two-columns) [default=%default]."
    )

    parser.add_option("-r",
                      "--resolution",
                      dest="resolution",
                      type="int",
                      help="resolution of count vector [default=%default].")

    parser.add_option(
        "-b",
        "--num-bins",
        dest="num_bins",
        type="int",
        help="number of bins in count vector [default=%default].")

    parser.add_option("-i",
                      "--num-samples",
                      dest="num_samples",
                      type="int",
                      help="sample size to compute [default=%default].")

    parser.add_option(
        "-w",
        "--workspace-bed-file",
        dest="filename_workspace",
        type="string",
        help="filename with workspace information [default=%default].")

    parser.add_option(
        "--workspace-builder",
        dest="workspace_builder",
        type="choice",
        choices=("gff", "gtf-intergenic", "gtf-intronic", "gtf-genic"),
        help="given a gff/gtf file build a workspace [default=%default].")

    parser.add_option(
        "--workspace-labels",
        dest="workspace_labels",
        type="choice",
        choices=("none", "direction", "annotation"),
        help="labels to use for the workspace workspace [default=%default].")

    parser.add_option(
        "--sampler",
        dest="sampler",
        type="choice",
        choices=("permutation", "gaps"),
        help=
        "sampler to use. The sampler determines the null model of how segments are distributed in the workspace  [default=%default]"
    )

    parser.add_option(
        "--counter",
        dest="counters",
        type="choice",
        action="append",
        choices=("transcription", "closest-distance", "all-distances"),
        help=
        "counter to use. The counter computes the quantity of interest [default=%default]"
    )

    parser.add_option("--analysis",
                      dest="analysis",
                      type="choice",
                      action="append",
                      choices=("proximity", "area-under-curve"),
                      help="analysis to perform [default=%default]")

    parser.add_option("--transform-counts",
                      dest="transform_counts",
                      type="choice",
                      choices=("raw", "cumulative"),
                      help="cumulate counts [default=%default].")

    parser.add_option(
        "-s",
        "--segments",
        dest="filename_segments",
        type="string",
        help="filename with segment information [default=%default].")

    parser.add_option("--xrange",
                      dest="xrange",
                      type="string",
                      help="xrange to plot [default=%default]")

    parser.add_option("-o",
                      "--logscale",
                      dest="logscale",
                      type="string",
                      help="use logscale on x, y or xy [default=%default]")

    parser.add_option("-p",
                      "--plot",
                      dest="plot",
                      action="store_true",
                      help="output plots [default=%default]")

    parser.add_option("--hardcopy",
                      dest="hardcopy",
                      type="string",
                      help="output hardcopies to file [default=%default]")

    parser.add_option("--no-fdr",
                      dest="do_fdr",
                      action="store_false",
                      help="do not compute FDR rates [default=%default]")

    parser.add_option("--segments-format",
                      dest="segments_format",
                      type="choice",
                      choices=("gtf", "bed"),
                      help="format of segments file [default=%default].")

    parser.add_option(
        "--truncate",
        dest="truncate",
        action="store_true",
        help="truncate segments extending beyond a workspace [default=%default]"
    )

    parser.add_option(
        "--remove-overhangs",
        dest="remove_overhangs",
        action="store_true",
        help="remove segments extending beyond a workspace[default=%default]")

    parser.add_option(
        "--keep-ambiguous",
        dest="keep_ambiguous",
        action="store_true",
        help=
        "keep segments extending to more than one workspace [default=%default]"
    )

    parser.set_defaults(
        filename_annotations=None,
        filename_workspace="workspace.gff",
        filename_segments="FastDown.gtf",
        filename_annotations_gtf="../data/tg1_territories.gff",
        workspace_builder="gff",
        workspace_labels="none",
        sampler="permutation",
        truncate=False,
        num_bins=10000,
        num_samples=10,
        resolution=100,
        plot_samples=False,
        plot_envelope=True,
        counters=[],
        transform_counts="raw",
        xrange=None,
        plot=False,
        logscale=None,
        output_all=False,
        do_test=False,
        analysis=[],
        do_fdr=True,
        hardcopy="%s.png",
        segments_format="gtf",
        remove_overhangs=False,
    )

    (options, args) = E.Start(parser, argv=argv, add_output_options=True)

    ###########################################
    # setup options
    if options.sampler == "permutation":
        sampler = SamplerPermutation
    elif options.sampler == "gaps":
        sampler = SamplerGaps

    if options.xrange:
        options.xrange = map(float, options.xrange.split(","))

    if len(options.counters) == 0:
        raise ValueError("please specify at least one counter.")

    if len(options.analysis) == 0:
        raise ValueError("please specify at least one analysis.")

    if options.workspace_labels == "annotation" and not options.filename_annotations:
        raise ValueError(
            "please specify --annotations-tsv-file is --workspace-labels=annotations."
        )

    ###########################################
    # read data
    if options.workspace_labels == "annotation":

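        # gene ids without an annotation fall back to the label ("unknown",)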
        def constant_factory(value):
            return itertools.repeat(value).next

        def dicttype():
            return collections.defaultdict(constant_factory(("unknown", )))

        map_id2annotations = IOTools.readMultiMap(open(
            options.filename_annotations, "r"),
                                                  dtype=dicttype)
    else:
        map_id2annotations = {}

    workspace = readWorkspace(open(options.filename_workspace,
                                   "r"), options.workspace_builder,
                              options.workspace_labels, map_id2annotations)

    E.info("read workspace for %i contigs" % (len(workspace)))

    indexed_workspace = indexIntervals(workspace, with_values=True)
    segments = readSegments(open(options.filename_segments, "r"),
                            indexed_workspace,
                            format=options.segments_format,
                            keep_ambiguous=options.keep_ambiguous,
                            truncate=options.truncate,
                            remove_overhangs=options.remove_overhangs)

    nsegments = 0
    for contig, vv in segments.iteritems():
        nsegments += len(vv)

    E.info("read %i segments for %i contigs" % (nsegments, len(workspace)))
    indexed_segments = indexIntervals(segments, with_values=False)

    if nsegments == 0:
        E.warn("no segments read - no computation done.")
        E.Stop()
        return

    # build labels
    labels = collections.defaultdict(int)
    for contig, vv in workspace.iteritems():
        for start, end, v in vv:
            for l in v[0]:
                labels[l] += 1
            for l in v[1]:
                labels[l] += 1

    E.info("found %i workspace labels" % len(labels))

    ###########################################
    # setup counting containers
    counters = []
    for cc in options.counters:

        if cc == "transcription":
            counter = CounterTranscription
        elif cc == "closest-distance":
            counter = CounterClosestDistance
        elif cc == "all-distances":
            counter = CounterAllDistances

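        # choose the smallest unsigned dtype that can hold per-bin segment counts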
        if nsegments < 256:
            dtype = numpy.uint8
        elif nsegments < 65536:
            dtype = numpy.uint16
        elif nsegments < 4294967296:
            dtype = numpy.uint32
        else:
            dtype = numpy.uint64  # numpy.int is builtin int; int() has no .itemsize

        E.debug("choosen dtype %s" % str(dtype))

        E.info("samples space is %i bases: %i bins at %i resolution" % (
            options.num_bins * options.resolution,
            options.num_bins,
            options.resolution,
        ))

        E.info("allocating counts: %i bytes (%i labels, %i samples, %i bins)" %
               (
                   options.num_bins * len(labels) * dtype().itemsize *
                   (options.num_samples + 1),
                   len(labels),
                   options.num_samples,
                   options.num_bins,
               ))

        c = CountingResults(labels)
        c.mObservedCounts = counter(labels,
                                    options.num_bins,
                                    options.resolution,
                                    dtype=dtype)

        simulated_counts = []
        for x in range(options.num_samples):
            simulated_counts.append(
                counter(labels,
                        options.num_bins,
                        options.resolution,
                        dtype=dtype))
        c.mSimulatedCounts = simulated_counts
        c.mName = c.mObservedCounts.mName

        counters.append(c)

        E.info("allocated memory successfully")

    segments_per_workspace = []
    segment_sizes = []
    segments_per_label = collections.defaultdict(int)
    workspaces_per_label = collections.defaultdict(int)

    ############################################
    # get observed and simulated counts
    nworkspaces, nempty_workspaces, nempty_contigs, nmiddle = 0, 0, 0, 0
    iteration2 = 0
    for contig, vv in workspace.iteritems():

        iteration2 += 1
        E.info("counting %i/%i: %s %i segments" %
               (iteration2, len(workspace), contig, len(vv)))

        if len(vv) == 0:
            continue

        iteration1 = 0
        for work_start, work_end, v in vv:

            left_labels, right_labels = v[0], v[1]

            iteration1 += 1

            # ignore empty segments
            if contig not in indexed_segments:
                nempty_contigs += 1
                continue

            r = indexed_segments[contig].find(work_start, work_end)
            segments_per_workspace.append(len(r))  # 0 for workspaces without segments

            if not r:
                nempty_workspaces += 1
                continue

            # collect segments and stats
            nworkspaces += 1
            observed = [(x.start, x.end) for x in r]
            observed.sort()
            segment_sizes.extend([x[1] - x[0] for x in observed])

            # collect basic counts
            for label in list(left_labels) + list(right_labels):
                workspaces_per_label[label] += 1
                segments_per_label[label] += len(observed)

            # add observed counts
            for counter in counters:
                counter.mObservedCounts.addCounts(observed, work_start,
                                                  work_end, left_labels,
                                                  right_labels)

            # create sampler
            s = sampler(observed, work_start, work_end)

            # add simulated counts
            for iteration in range(options.num_samples):
                simulated = s.sample()
                for counter in counters:
                    counter.mSimulatedCounts[iteration].addCounts(
                        simulated, work_start, work_end, left_labels,
                        right_labels)

    E.info("counting finished")
    E.info(
        "nworkspaces=%i, nmiddle=%i, nempty_workspaces=%i, nempty_contigs=%i" %
        (nworkspaces, nmiddle, nempty_workspaces, nempty_contigs))

    ######################################################
    # transform counts

    if options.transform_counts == "cumulative":
        transform = cumulative_transform
    elif options.transform_counts == "raw":
        transform = normalize_transform

    ####################################################
    # analysis

    if "proximity" in options.analysis:
        outfile_proximity = E.openOutputFile("proximity")
        outfile_proximity.write("\t".join(
            ("label", "observed", "pvalue", "expected", "CIlower", "CIupper",
             "qvalue", "segments", "workspaces")) + "\n")
    else:
        outfile_proximity = None

    if "area-under-curve" in options.analysis:
        outfile_auc = E.openOutputFile("auc")
        outfile_auc.write("label\tobserved\texpected\tCIlower\tCIupper\n")
    else:
        outfile_auc = None

    # qvalue: expected proportion of false positives among calls at a given p-value
    if options.do_fdr:
        E.info("computing pvalues for fdr")
        # collect the P-Values of all simulated results to compute FDR
        sim_pvalues = []
        for counter in counters:
            for label in labels:
                E.info("working on counter:%s label:%s" % (counter, label))

                medians = counter.getMedians(label)

                for median in medians:
                    pvalue = float(
                        scipy.stats.percentileofscore(medians, median)) / 100.0
                    sim_pvalues.append(pvalue)

        sim_pvalues.sort()
    else:
        sim_pvalues = []

    # compute observed p-values
    for counter in counters:
        counter.update()

    obs_pvalues = []
    for counter in counters:
        for label in labels:
            obs_pvalues.append(counter.mStats[label].pvalue)
        obs_pvalues.sort()

    # compute q-values (FDR) from observed and simulated p-values
    if options.do_fdr:
        for counter in counters:
            counter.updateFDR(obs_pvalues, sim_pvalues)

    for counter in counters:

        outofbounds_sim, totals_sim = 0, 0
        outofbounds_obs, totals_obs = 0, 0
        for label in labels:
            for sample in range(options.num_samples):
                if counter.mSimulatedCounts[sample].mOutOfBounds[label]:
                    E.debug(
                        "out of bounds: sample %i, label %s, counts=%i" %
                        (sample, label,
                         counter.mSimulatedCounts[sample].mOutOfBounds[label]))
                    outofbounds_sim += counter.mSimulatedCounts[
                        sample].mOutOfBounds[label]
                totals_sim += counter.mSimulatedCounts[sample].mTotals[label]

            outofbounds_obs += counter.mObservedCounts.mOutOfBounds[label]
            totals_obs += counter.mObservedCounts.mTotals[label]

        E.info(
            "out of bounds observations: observed=%i/%i (%5.2f%%), simulations=%i/%i (%5.2f%%)"
            % (
                outofbounds_obs,
                totals_obs,
                100.0 * outofbounds_obs / totals_obs,
                outofbounds_sim,
                totals_sim,
                100.0 * outofbounds_sim / totals_sim,
            ))

        for label in labels:

            if outfile_auc:
                mmin, mmax, mmean = counter.getEnvelope(
                    label, transform=normalize_transform)
                obs = normalize_transform(
                    counter.mObservedCounts[label],
                    counter.mObservedCounts.mOutOfBounds[label])

                def block_iterator(a1, a2, a3, num_bins):
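                    # scan the observed curve (a1) for maximal runs where it
                    # exceeds the envelope maximum (a2) and report each run's
                    # excess area over the envelope mean (a3)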
                    x = 0
                    while x < num_bins:
                        while x < num_bins and a1[x] <= a2[x]:
                            x += 1
                        start = x
                        while x < num_bins and a1[x] > a2[x]:
                            x += 1
                        end = x
                        total_a1 = a1[start:end].sum()
                        total_a3 = a3[start:end].sum()
                        if total_a1 > total_a3:
                            yield (total_a1 - total_a3, start, end, total_a1,
                                   total_a3)

                blocks = list(
                    block_iterator(obs, mmax, mmean, options.num_bins))

                if options.output_all:
                    for delta, start, end, total_obs, total_mean in blocks:
                        if end - start <= 1:
                            continue
                        outfile_auc.write(
                            "%s\t%i\t%i\t%i\t%f\t%f\t%f\t%f\t%f\n" %
                            (label, start * options.resolution,
                             end * options.resolution,
                             (end - start) * options.resolution, total_obs,
                             total_mean, delta, total_obs / total_mean, 100.0 *
                             (total_obs / total_mean - 1.0)))

                # output best block, if any
                if blocks:
                    blocks.sort()
                    delta, start, end, total_obs, total_mean = blocks[-1]

                    outfile_auc.write(
                        "%s\t%i\t%i\t%i\t%f\t%f\t%f\t%f\t%f\n" %
                        (label, start * options.resolution,
                         end * options.resolution,
                         (end - start) * options.resolution, total_obs,
                         total_mean, delta, total_obs / total_mean, 100.0 *
                         (total_obs / total_mean - 1.0)))

            if outfile_proximity:

                # find error bars at median
                st = counter.mStats[label]
                outfile_proximity.write(
                    "%s\t%i\t%f\t%i\t%i\t%i\t%s\t%i\t%i\n" % (
                        label,
                        st.observed * options.resolution,
                        st.pvalue,
                        st.expected * options.resolution,
                        st.ci95lower * options.resolution,
                        st.ci95upper * options.resolution,
                        IOTools.val2str(st.qvalue),
                        segments_per_label[label],
                        workspaces_per_label[label],
                    ))

    if options.plot:

        for counter in counters:
            plotCounts(counter, options, transform)

        # plot summary stats
        plt.figure()
        plt.title("distribution of workspace length")
        data = []
        for contig, segs in workspace.iteritems():
            if len(segs) == 0:
                continue
            data.extend([x[1] - x[0] for x in segs])

        vals, bins = numpy.histogram(data,
                                     bins=numpy.arange(0, max(data), 100),
                                     new=True)

        t = float(sum(vals))
        plt.plot(bins[:-1], numpy.cumsum(vals) / t)
        plt.gca().set_xscale('log')
        plt.legend()
        plt.xlabel("size of workspace")
        plt.ylabel("cumulative relative frequency")
        if options.hardcopy:
            plt.savefig(os.path.expanduser(options.hardcopy %
                                           "workspace_size"))

        plt.figure()
        plt.title("segments per block")
        vals, bins = numpy.histogram(segments_per_workspace,
                                     bins=numpy.arange(
                                         0, max(segments_per_workspace), 1),
                                     new=True)
        plt.plot(bins[:-1], vals)
        plt.xlabel("segments per block")
        plt.ylabel("absolute frequency")
        if options.hardcopy:
            plt.savefig(
                os.path.expanduser(options.hardcopy % "segments_per_block"))

        plt.figure()
        plt.title("workspaces per label")
        plt.barh(range(0, len(labels)),
                 [workspaces_per_label[x] for x in labels],
                 height=0.5)
        plt.yticks(range(0, len(labels)), labels)
        plt.ylabel("workspaces per label")
        plt.xlabel("absolute frequency")
        plt.gca().set_xscale('log')

        if options.hardcopy:
            plt.savefig(
                os.path.expanduser(options.hardcopy % "workspaces_per_label"))

        plt.figure()
        plt.title("segments per label")
        plt.barh(range(0, len(labels)),
                 [segments_per_label[x] for x in labels],
                 height=0.5)
        plt.yticks(range(0, len(labels)), labels)
        plt.ylabel("segments per label")
        plt.xlabel("absolute frequency")
        if options.hardcopy:
            plt.savefig(
                os.path.expanduser(options.hardcopy % "segments_per_label"))

        if not options.hardcopy:
            plt.show()

    E.Stop()
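Both listings select the counts dtype with an explicit uint8/uint16/uint32 ladder. numpy.min_scalar_type expresses the same choice in one call; a short sketch (assuming numpy >= 1.6):

import numpy

def counts_dtype(nsegments):
    # smallest integer dtype that can hold nsegments; mirrors the ladder
    # above, including an unsigned 64-bit fallback for very large inputs
    return numpy.min_scalar_type(nsegments)

print(counts_dtype(200))    # uint8
print(counts_dtype(70000))  # uint32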