Example #1
def main(argv=sys.argv):

    parser = E.OptionParser(version="%prog version: $Id: mali2mali.py 2782 2009-09-10 11:40:29Z andreas $",
                            usage=globals()["__doc__"])

    parser.add_option("-i", "--input-format", dest="input_format", type="choice",
                      choices=(
                          "plain", "fasta", "clustal", "stockholm", "phylip"),
                      help="input format of multiple alignment [default=%default].")

    parser.add_option("-o", "--output-format", dest="output_format", type="choice",
                      choices=(
                          "plain", "fasta", "stockholm", "phylip", "nexus", "plain-fasta"),
                      help="output format of multiple alignment [default=%default].")

    parser.add_option("--with-ranges", dest="with_ranges", action="store_true",
                      help="output alignment ranges (suffix /from-to after identifier) [default=%default].")

    parser.add_option("--without-ranges", dest="with_ranges", action="store_false",
                      help="do not output alignment ranges (suffix /from-to after identifier) [default=%default].")

    parser.add_option("-u", "--allow-duplicates", dest="allow_duplicates", action="store_true",
                      help="permit duplicate entries [default=%default].")

    parser.add_option("-m", "--method", dest="methods", type="string",
                      help="""methods to apply. Several methods can be specified in a ','-separated list [default=%default]."""  )

    parser.add_option("-p", "--parameters", dest="parameters", type="string",
                      help="parameter stack for methods that require one [default=%default].")

    parser.add_option("-a", "--mask-char", dest="mask_char", type="string",
                      help="character to identify/set masked characters [default=%default].")

    parser.set_defaults(
        input_format="fasta",
        output_format="fasta",
        methods="",
        parameters="",
        mask_char="x",
        gap_chars="-.nN",
        with_ranges=True,
        allow_duplicates=False,
    )

    (options, args) = E.Start(parser)

    options.methods = options.methods.split(",")
    options.parameters = options.parameters.split(",")

    # 1. read multiple alignment in various formats
    if options.allow_duplicates:
        mali = Mali.SequenceCollection()
    else:
        mali = Mali.Mali()

    t1 = time.time()

    mali.readFromFile(options.stdin, format=options.input_format)

    E.info("read mali with %i entries in %i seconds." %
           (len(mali), time.time() - t1))

    if len(mali) == 0:
        raise ValueError("empty multiple alignment")

    for method in options.methods:

        t1 = time.time()

        if method == "remove-unaligned-ends":
            mali.removeUnalignedEnds()
        elif method == "remove-end-gaps":
            mali.removeEndGaps()
        elif method == "remove-all-gaps":
            mali.removeGaps(minimum_gaps=len(mali))
        elif method == "remove-any-gaps":
            mali.removeGaps(minimum_gaps=1)
        elif method == "remove-some-gaps":
            minimum_gaps = int(options.parameters[0])
            del options.parameters[0]
            mali.removeGaps(minimum_gaps=minimum_gaps)
        elif method == "remove-empty-sequences":
            mali.removeEmptySequences()
        elif method == "upper":
            mali.upperCase()
        elif method == "lower":
            mali.lowerCase()
        elif method == "mark-codons":
            mali.markCodons()
        elif method == "remove-stops":
            mali.removePattern(lambda x: x.upper() in ("TAG", "TAA", "TGA"),
                               allowed_matches=0,
                               minimum_matches=1,
                               delete_frame=3,
                               search_frame=3)
        elif method == "shift-alignment":
            map_id2offset = IOTools.ReadMap(open(options.parameters[0], "r"),
                                            map_functions=(str, int))
            del options.parameters[0]
            mali.shiftAlignment(map_id2offset)
        elif method == "propagate-masks":
            mali.propagateMasks(mask_char=options.mask_char)

        elif method == "recount":
            mali.recount()

        elif method in ("mark-transitions", "filter-odd-transitions", "filter-even-transitions",
                        "keep-even-segments", "keep-odd-segments"):

            if os.path.exists(options.parameters[0]):
                map_id2transitions = IOTools.readMultiMap(open(options.parameters[0], "r"),
                                                          map_functions=(str, int))
            else:
                map_id2transitions = {}
                r = map(int, options.parameters[0].split(':'))
                r.sort()
                map_id2transitions["mali"] = r

            del options.parameters[0]
            if method == "mark-transitions":
                mali.markTransitions(map_id2transitions)
            elif method in ("filter-odd-transitions", "keep-even-segments"):
                mali.markTransitions(map_id2transitions, mode="keep-odd")
            elif method in ("filter-even-transitions", "keep-odd-segments"):
                mali.markTransitions(map_id2transitions, mode="keep-even")

        elif method == "propagate-transitions":
            mali.propagateTransitions()

        elif method == "map-annotation":
            # map annotations in one mali (stockholm-format) to the annotations in another.
            # Note: the first two sequence identifiers must be shared and the
            # sequences must be of the same length
            other_mali = Mali.Mali()
            other_mali.readFromFile(
                open(options.parameters[0], "r"), format="stockholm")
            del options.parameters[0]
            mali.copyAnnotations(other_mali)

        elif method == "add-annotation":
            annotation_type, annotation_file = options.parameters[:2]
            del options.parameters[:2]
            AddAnnotation(mali, annotation_type, annotation_file)

        elif method == "mask-columns":
            annotation_type, annotation_file = options.parameters[:2]
            del options.parameters[:2]
            maskColumns(mali, annotation_type, annotation_file)

        elif method == "remove-unaligned-pairs":
            removeUnalignedPairs(mali, options)

        elif method == "filter-3rd":
            filterMali(mali, "3rd")

        elif method == "filter-4d":
            filterMali(mali, "4d")

        elif method in ("mask-seg", "mask-bias"):
            a, b = method.split("-")
            maskMali(mali, b)

        elif method == "exclude-with-stop":
            mali.filter(method="with-stop")

        elif method == "exclude-with-stop":
            mali.filter(method="with-frameshift")

        E.info("applied method %s in %i seconds." % (method, time.time() - t1))

    mali.writeToFile(options.stdout,
                     format=options.output_format,
                     write_ranges=options.with_ranges)

    E.Stop()
Example #2
        ninput, nsegments, ndiscarded, ncontigs = \
            PipelineEnrichment.outputSegments( options.stdout,
                                               intervals,
                                               options.section,
                                               outfile_synonyms = outfile_synonyms,
                                               max_length = options.max_length,
                                               remove_regex = options.remove_regex )
            
        if outfile_synonyms:
            outfile_synonyms.close()

    elif options.section == "annotations-go":

        assert options.input_filename_map, "please supply option --input-filename-map" 

        iterator = GTF.iterator_filtered( GTF.iterator( options.stdin ),
                                          feature=options.feature )

        geneid2categories = IOTools.readMultiMap( open( options.input_filename_map, "r") )

        category2segments = collections.defaultdict( list )

        for contig, gffs in GTF.readAsIntervals( iterator, with_gene_id = True ).items():
            if options.remove_regex and options.remove_regex.search( contig ): continue
            
            for start, end, geneid in gffs:
                if geneid not in geneid2categories: continue
                for category in geneid2categories[geneid]:
                    category2segments[category].append(nsegments)

                options.stdout.write( "%s\t%i\t%s\t(%i,%i)\n" % (prefix, nsegments, contig, start, end ) )
                nsegments += 1                        
            
        for category, segments in category2segments.iteritems():
            options.stdout.write( "##Ann\t%s\t%s\n" % (category,
                                                       "\t".join( ["%i" % x for x in segments] ) ) )
            E.info( "set %s annotated with %i segments" % (category, len(segments)) )
Example #3
def main(argv=sys.argv):

    parser = E.OptionParser(
        version=
        "%prog version: $Id: mali2mali.py 2782 2009-09-10 11:40:29Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option(
        "-i",
        "--input-format",
        dest="input_format",
        type="choice",
        choices=("plain", "fasta", "clustal", "stockholm", "phylip"),
        help="input format of multiple alignment [default=%default].")

    parser.add_option(
        "-o",
        "--output-format",
        dest="output_format",
        type="choice",
        choices=("plain", "fasta", "stockholm", "phylip", "nexus",
                 "plain-fasta"),
        help="output format of multiple alignment [default=%default].")

    parser.add_option(
        "--with-ranges",
        dest="with_ranges",
        action="store_true",
        help=
        "output alignment ranges (suffix /from-to after identifier) [default=%default]."
    )

    parser.add_option(
        "--without-ranges",
        dest="with_ranges",
        action="store_false",
        help=
        "do not output alignment ranges (suffix /from-to after identifier) [default=%default]."
    )

    parser.add_option("-u",
                      "--allow-duplicates",
                      dest="allow_duplicates",
                      action="store_true",
                      help="permit duplicate entries [default=%default].")

    parser.add_option(
        "-m",
        "--method",
        dest="methods",
        type="string",
        help=
        """methods to apply. Several methods can be specified in a ','-separated list [default=%default]."""
    )

    parser.add_option(
        "-p",
        "--parameters",
        dest="parameters",
        type="string",
        help="parameter stack for methods that require one [default=%default]."
    )

    parser.add_option(
        "-a",
        "--mask-char",
        dest="mask_char",
        type="string",
        help="character to identify/set masked characters [default=%default].")

    parser.set_defaults(
        input_format="fasta",
        output_format="fasta",
        methods="",
        parameters="",
        mask_char="x",
        gap_chars="-.nN",
        with_ranges=True,
        allow_duplicates=False,
    )

    (options, args) = E.Start(parser)

    options.methods = options.methods.split(",")
    options.parameters = options.parameters.split(",")

    # 1. read multiple alignment in various formats
    if options.allow_duplicates:
        mali = Mali.SequenceCollection()
    else:
        mali = Mali.Mali()

    t1 = time.time()

    mali.readFromFile(options.stdin, format=options.input_format)

    E.info("read mali with %i entries in %i seconds." %
           (len(mali), time.time() - t1))

    if len(mali) == 0:
        raise ValueError("empty multiple alignment")

    for method in options.methods:

        t1 = time.time()

        if method == "remove-unaligned-ends":
            mali.removeUnalignedEnds()
        elif method == "remove-end-gaps":
            mali.removeEndGaps()
        elif method == "remove-all-gaps":
            mali.removeGaps(minimum_gaps=len(mali))
        elif method == "remove-any-gaps":
            mali.removeGaps(minimum_gaps=1)
        elif method == "remove-some-gaps":
            minimum_gaps = int(options.parameters[0])
            del options.parameters[0]
            mali.removeGaps(minimum_gaps=minimum_gaps)
        elif method == "remove-empty-sequences":
            mali.removeEmptySequences()
        elif method == "upper":
            mali.upperCase()
        elif method == "lower":
            mali.lowerCase()
        elif method == "mark-codons":
            mali.markCodons()
        elif method == "remove-stops":
            mali.removePattern(lambda x: x.upper() in ("TAG", "TAA", "TGA"),
                               allowed_matches=0,
                               minimum_matches=1,
                               delete_frame=3,
                               search_frame=3)
        elif method == "shift-alignment":
            map_id2offset = IOTools.ReadMap(open(options.parameters[0], "r"),
                                            map_functions=(str, int))
            del options.parameters[0]
            mali.shiftAlignment(map_id2offset)
        elif method == "propagate-masks":
            mali.propagateMasks(mask_char=options.mask_char)

        elif method == "recount":
            mali.recount()

        elif method in ("mark-transitions", "filter-odd-transitions",
                        "filter-even-transitions", "keep-even-segments",
                        "keep-odd-segments"):

            if os.path.exists(options.parameters[0]):
                map_id2transitions = IOTools.readMultiMap(
                    open(options.parameters[0], "r"), map_functions=(str, int))
            else:
                map_id2transitions = {}
                r = map(int, options.parameters[0].split(':'))
                r.sort()
                map_id2transitions["mali"] = r

            del options.parameters[0]
            if method == "mark-transitions":
                mali.markTransitions(map_id2transitions)
            elif method in ("filter-odd-transitions", "keep-even-segments"):
                mali.markTransitions(map_id2transitions, mode="keep-odd")
            elif method in ("filter-even-transitions", "keep-odd-segments"):
                mali.markTransitions(map_id2transitions, mode="keep-even")

        elif method == "propagate-transitions":
            mali.propagateTransitions()

        elif method == "map-annotation":
            # map annotations in one mali (stockholm-format) to the annotations in another.
            # Note: the first two sequence identifiers must be shared and the
            # sequences must be of the same length
            other_mali = Mali.Mali()
            other_mali.readFromFile(open(options.parameters[0], "r"),
                                    format="stockholm")
            del options.parameters[0]
            mali.copyAnnotations(other_mali)

        elif method == "add-annotation":
            annotation_type, annotation_file = options.parameters[:2]
            del options.parameters[:2]
            AddAnnotation(mali, annotation_type, annotation_file)

        elif method == "mask-columns":
            annotation_type, annotation_file = options.parameters[:2]
            del options.parameters[:2]
            maskColumns(mali, annotation_type, annotation_file)

        elif method == "remove-unaligned-pairs":
            removeUnalignedPairs(mali, options)

        elif method == "filter-3rd":
            filterMali(mali, "3rd")

        elif method == "filter-4d":
            filterMali(mali, "4d")

        elif method in ("mask-seg", "mask-bias"):
            a, b = method.split("-")
            maskMali(mali, b)

        elif method == "exclude-with-stop":
            mali.filter(method="with-stop")

        elif method == "exclude-with-stop":
            mali.filter(method="with-frameshift")

        E.info("applied method %s in %i seconds." % (method, time.time() - t1))

    mali.writeToFile(options.stdout,
                     format=options.output_format,
                     write_ranges=options.with_ranges)

    E.Stop()
Example #4
def main(argv=sys.argv):

    parser = E.OptionParser(
        version="%prog version: $Id: annotator_distance.py 2861 2010-02-23 17:36:32Z andreas $", usage=globals()["__doc__"])

    parser.add_option("-a", "--filename-annotations", dest="filename_annotations", type="string",
                      help="filename mapping gene ids to annotations (a tab-separated table with two-columns) [default=%default].")

    parser.add_option("-r", "--resolution", dest="resolution", type="int",
                      help="resolution of count vector [default=%default].")

    parser.add_option("-b", "--num-bins", dest="num_bins", type="int",
                      help="number of bins in count vector [default=%default].")

    parser.add_option("-i", "--num-samples", dest="num_samples", type="int",
                      help="sample size to compute [default=%default].")

    parser.add_option("-w", "--workspace", dest="filename_workspace", type="string",
                      help="filename with workspace information [default=%default].")

    parser.add_option("--workspace-builder", dest="workspace_builder", type="choice",
                      choices=(
                          "gff", "gtf-intergenic", "gtf-intronic", "gtf-genic"),
                      help="given a gff/gtf file build a workspace [default=%default].")

    parser.add_option("--workspace-labels", dest="workspace_labels", type="choice",
                      choices=("none", "direction", "annotation"),
                      help="labels to use for the workspace workspace [default=%default].")

    parser.add_option("--sampler", dest="sampler", type="choice",
                      choices=("permutation", "gaps"),
                      help="sampler to use. The sampler determines the null model of how segments are distributed in the workspace  [default=%default]")

    parser.add_option("--counter", dest="counters", type="choice", action="append",
                      choices=(
                          "transcription", "closest-distance", "all-distances"),
                      help="counter to use. The counter computes the quantity of interest [default=%default]")

    parser.add_option("--analysis", dest="analysis", type="choice", action="append",
                      choices=("proximity", "area-under-curve"),
                      help="analysis to perform [default=%default]")

    parser.add_option("--transform-counts", dest="transform_counts", type="choice",
                      choices=("raw", "cumulative"),
                      help="cumulate counts [default=%default].")

    parser.add_option("-s", "--segments", dest="filename_segments", type="string",
                      help="filename with segment information [default=%default].")

    parser.add_option("--xrange", dest="xrange", type="string",
                      help="xrange to plot [default=%default]")

    parser.add_option("-o", "--logscale", dest="logscale", type="string",
                      help="use logscale on x, y or xy [default=%default]")

    parser.add_option("-p", "--plot", dest="plot", action="store_true",
                      help="output plots [default=%default]")

    parser.add_option("--hardcopy", dest="hardcopy", type="string",
                      help="output hardcopies to file [default=%default]")

    parser.add_option("--no-fdr", dest="do_fdr", action="store_false",
                      help="do not compute FDR rates [default=%default]")

    parser.add_option("--segments-format", dest="segments_format", type="choice",
                      choices=("gtf", "bed"),
                      help="format of segments file [default=%default].")

    parser.add_option("--truncate", dest="truncate", action="store_true",
                      help="truncate segments extending beyond a workspace [default=%default]")

    parser.add_option("--remove-overhangs", dest="remove_overhangs", action="store_true",
                      help="remove segments extending beyond a workspace[default=%default]")

    parser.add_option("--keep-ambiguous", dest="keep_ambiguous", action="store_true",
                      help="keep segments extending to more than one workspace [default=%default]")

    parser.set_defaults(
        filename_annotations=None,
        filename_workspace="workspace.gff",
        filename_segments="FastDown.gtf",
        filename_annotations_gtf="../data/tg1_territories.gff",
        workspace_builder="gff",
        workspace_labels="none",
        sampler="permutation",
        truncate=False,
        num_bins=10000,
        num_samples=10,
        resolution=100,
        plot_samples=False,
        plot_envelope=True,
        counters=[],
        transform_counts="raw",
        xrange=None,
        plot=False,
        logscale=None,
        output_all=False,
        do_test=False,
        analysis=[],
        do_fdr=True,
        hardcopy="%s.png",
        segments_format="gtf",
        remove_overhangs=False,
    )

    (options, args) = E.Start(parser, argv=argv, add_output_options=True)

    ###########################################
    # setup options
    if options.sampler == "permutation":
        sampler = SamplerPermutation
    elif options.sampler == "gaps":
        sampler = SamplerGaps

    if options.xrange:
        options.xrange = map(float, options.xrange.split(","))

    if len(options.counters) == 0:
        raise ValueError("please specify at least one counter.")

    if len(options.analysis) == 0:
        raise ValueError("please specify at least one analysis.")

    if options.workspace_labels == "annotation" and not options.filename_annotations:
        raise ValueError(
            "please specify --filename-annotations is --workspace-labels=annotations.")

    ###########################################
    # read data
    if options.workspace_labels == "annotation":
        def constant_factory(value):
            return itertools.repeat(value).next

        def dicttype():
            return collections.defaultdict(constant_factory(("unknown",)))

        map_id2annotations = IOTools.readMultiMap(open(options.filename_annotations, "r"),
                                                  dtype=dicttype)
    else:
        map_id2annotations = {}

    workspace = readWorkspace(open(options.filename_workspace, "r"),
                              options.workspace_builder,
                              options.workspace_labels,
                              map_id2annotations)

    E.info("read workspace for %i contigs" % (len(workspace)))

    indexed_workspace = indexIntervals(workspace, with_values=True)
    segments = readSegments(open(options.filename_segments, "r"), indexed_workspace,
                            format=options.segments_format,
                            keep_ambiguous=options.keep_ambiguous,
                            truncate=options.truncate,
                            remove_overhangs=options.remove_overhangs)

    nsegments = 0
    for contig, vv in segments.iteritems():
        nsegments += len(vv)

    E.info("read %i segments for %i contigs" % (nsegments, len(workspace)))
    indexed_segments = indexIntervals(segments, with_values=False)

    if nsegments == 0:
        E.warn("no segments read - no computation done.")
        E.Stop()
        return

    # build labels
    labels = collections.defaultdict(int)
    for contig, vv in workspace.iteritems():
        for start, end, v in vv:
            for l in v[0]:
                labels[l] += 1
            for l in v[1]:
                labels[l] += 1

    E.info("found %i workspace labels" % len(labels))

    ###########################################
    # setup counting containers
    counters = []
    for cc in options.counters:

        if cc == "transcription":
            counter = CounterTranscription
        elif cc == "closest-distance":
            counter = CounterClosestDistance
        elif cc == "all-distances":
            counter = CounterAllDistances

        if nsegments < 256:
            dtype = numpy.uint8
        elif nsegments < 65536:
            dtype = numpy.uint16
        elif nsegments < 4294967296:
            dtype = numpy.uint32
        else:
            dtype = numpy.int

        E.debug("choosen dtype %s" % str(dtype))

        E.info("samples space is %i bases: %i bins at %i resolution" %
               (options.num_bins * options.resolution,
                options.num_bins,
                options.resolution,
                ))

        E.info("allocating counts: %i bytes (%i labels, %i samples, %i bins)" %
               (options.num_bins * len(labels) * dtype().itemsize * (options.num_samples + 1),
                len(labels),
                options.num_samples,
                options.num_bins,
                ))

        c = CountingResults(labels)
        c.mObservedCounts = counter(
            labels, options.num_bins, options.resolution, dtype=dtype)

        simulated_counts = []
        for x in range(options.num_samples):
            simulated_counts.append(
                counter(labels, options.num_bins, options.resolution, dtype=dtype))
        c.mSimulatedCounts = simulated_counts
        c.mName = c.mObservedCounts.mName

        counters.append(c)

        E.info("allocated memory successfully")

    segments_per_workspace = []
    segment_sizes = []
    segments_per_label = collections.defaultdict(int)
    workspaces_per_label = collections.defaultdict(int)

    ############################################
    # get observed and simulated counts
    nworkspaces, nempty_workspaces, nempty_contigs, nmiddle = 0, 0, 0, 0
    iteration2 = 0
    for contig, vv in workspace.iteritems():

        iteration2 += 1
        E.info("counting %i/%i: %s %i segments" %
               (iteration2,
                len(workspace),
                contig,
                len(vv)))

        if len(vv) == 0:
            continue

        iteration1 = 0
        for work_start, work_end, v in vv:

            left_labels, right_labels = v[0], v[1]

            iteration1 += 1

            # ignore empty segments
            if contig not in indexed_segments:
                nempty_contigs += 1
                continue

            r = indexed_segments[contig].find(work_start, work_end)
            segments_per_workspace.append(len(r))

            if not r:
                nempty_workspaces += 1
                continue

            # collect segments and stats
            nworkspaces += 1
            observed = [(x.start, x.end) for x in r]
            observed.sort()
            segment_sizes.extend([x[1] - x[0] for x in observed])

            # collect basic counts
            for label in list(left_labels) + list(right_labels):
                workspaces_per_label[label] += 1
                segments_per_label[label] += len(observed)

            # add observed counts
            for counter in counters:
                counter.mObservedCounts.addCounts(
                    observed, work_start, work_end, left_labels, right_labels)

            # create sampler
            s = sampler(observed, work_start, work_end)

            # add simulated counts
            for iteration in range(options.num_samples):
                simulated = s.sample()
                for counter in counters:
                    counter.mSimulatedCounts[iteration].addCounts(
                        simulated, work_start, work_end, left_labels, right_labels)

    E.info("counting finished")
    E.info("nworkspaces=%i, nmiddle=%i, nempty_workspaces=%i, nempty_contigs=%i" %
           (nworkspaces, nmiddle, nempty_workspaces, nempty_contigs))

    ######################################################
    # transform counts

    if options.transform_counts == "cumulative":
        transform = cumulative_transform
    elif options.transform_counts == "raw":
        transform = normalize_transform

    ####################################################
    # analysis

    if "proximity" in options.analysis:
        outfile_proximity = E.openOutputFile("proximity")
        outfile_proximity.write("\t".join(("label", "observed", "pvalue",
                                           "expected", "CIlower", "CIupper", "qvalue", "segments", "workspaces")) + "\n")
    else:
        outfile_proximity = None

    if "area-under-curve" in options.analysis:
        outfile_auc = E.openOutputFile("auc")
        outfile_auc.write("label\tobserved\texpected\tCIlower\tCIupper\n")
    else:
        outfile_auc = None

    # qvalue: expected false positives at p-value
    # qvalue = expected false positives / number of called positives
    if options.do_fdr:
        E.info("computing pvalues for fdr")
        for counter in counters:
            for label in labels:
                E.info("working on counter:%s label:%s" % (counter, label))

                # collect all P-Values of simulated results to compute FDR
                sim_pvalues = []
                medians = counter.getMedians(label)

                for median in medians:
                    pvalue = float(
                        scipy.stats.percentileofscore(medians, median)) / 100.0
                    sim_pvalues.append(pvalue)

        sim_pvalues.sort()
    else:
        sim_pvalues = []

    # compute observed p-values
    for counter in counters:
        counter.update()

    obs_pvalues = []
    for counter in counters:
        for label in labels:
            obs_pvalues.append(counter.mStats[label].pvalue)
        obs_pvalues.sort()

    # compute observed p-values
    if options.do_fdr:
        for counter in counters:
            counter.updateFDR(obs_pvalues, sim_pvalues)

    for counter in counters:

        outofbounds_sim, totals_sim = 0, 0
        outofbounds_obs, totals_obs = 0, 0
        for label in labels:
            for sample in range(options.num_samples):
                if counter.mSimulatedCounts[sample].mOutOfBounds[label]:
                    E.debug("out of bounds: sample %i, label %s, counts=%i" %
                            (sample, label, counter.mSimulatedCounts[sample].mOutOfBounds[label]))
                    outofbounds_sim += counter.mSimulatedCounts[
                        sample].mOutOfBounds[label]
                totals_sim += counter.mSimulatedCounts[sample].mTotals[label]

            outofbounds_obs += counter.mObservedCounts.mOutOfBounds[label]
            totals_obs += counter.mObservedCounts.mTotals[label]

        E.info("out of bounds observations: observed=%i/%i (%5.2f%%), simulations=%i/%i (%5.2f%%)" %
               (outofbounds_obs, totals_obs,
                100.0 * outofbounds_obs / totals_obs,
                outofbounds_sim, totals_sim,
                100.0 * outofbounds_sim / totals_sim,
                ))

        for label in labels:

            if outfile_auc:
                mmin, mmax, mmean = counter.getEnvelope(
                    label, transform=normalize_transform)
                obs = normalize_transform(
                    counter.mObservedCounts[label], counter.mObservedCounts.mOutOfBounds[label])

                def block_iterator(a1, a2, a3, num_bins):
                    x = 0
                    while x < num_bins:
                        while x < num_bins and a1[x] <= a2[x]:
                            x += 1
                        start = x
                        while x < num_bins and a1[x] > a2[x]:
                            x += 1
                        end = x
                        total_a1 = a1[start:end].sum()
                        total_a3 = a3[start:end].sum()
                        if total_a1 > total_a3:
                            yield (total_a1 - total_a3, start, end, total_a1, total_a3)

                blocks = list(
                    block_iterator(obs, mmax, mmean, options.num_bins))

                if options.output_all:
                    for delta, start, end, total_obs, total_mean in blocks:
                        if end - start <= 1:
                            continue
                        outfile_auc.write("%s\t%i\t%i\t%i\t%f\t%f\t%f\t%f\t%f\n" %
                                          (label,
                                           start * options.resolution,
                                           end * options.resolution,
                                           (end - start) * options.resolution,
                                           total_obs,
                                           total_mean,
                                           delta,
                                           total_obs / total_mean,
                                           100.0 * (total_obs / total_mean - 1.0)))

                # output best block
                blocks.sort()
                delta, start, end, total_obs, total_mean = blocks[-1]

                outfile_auc.write("%s\t%i\t%i\t%i\t%f\t%f\t%f\t%f\t%f\n" %
                                  (label,
                                   start * options.resolution,
                                   end * options.resolution,
                                   (end - start) * options.resolution,
                                   total_obs,
                                   total_mean,
                                   delta,
                                   total_obs / total_mean,
                                   100.0 * (total_obs / total_mean - 1.0)))

            if outfile_proximity:

                # find error bars at median
                st = counter.mStats[label]
                outfile_proximity.write("%s\t%i\t%f\t%i\t%i\t%i\t%s\t%i\t%i\n" %
                                        (label,
                                         st.observed *
                                         options.resolution,
                                         st.pvalue,
                                         st.expected *
                                         options.resolution,
                                         st.ci95lower *
                                         options.resolution,
                                         st.ci95upper *
                                         options.resolution,
                                         IOTools.prettyFloat(st.qvalue),
                                         segments_per_label[label],
                                         workspaces_per_label[label],
                                         ))

    if options.plot:

        for counter in counters:
            plotCounts(counter, options, transform)

        # plot summary stats
        plt.figure()
        plt.title("distribution of workspace length")
        data = []
        for contig, segs in workspace.iteritems():
            if len(segs) == 0:
                continue
            data.extend([x[1] - x[0] for x in segs])

        vals, bins = numpy.histogram(
            data, bins=numpy.arange(0, max(data), 100), new=True)

        t = float(sum(vals))
        plt.plot(bins[:-1], numpy.cumsum(vals) / t)
        plt.gca().set_xscale('log')
        plt.legend()
        t = float(sum(vals))
        plt.xlabel("size of workspace")
        plt.ylabel("cumulative relative frequency")
        if options.hardcopy:
            plt.savefig(
                os.path.expanduser(options.hardcopy % "workspace_size"))

        plt.figure()
        plt.title("segments per block")
        vals, bins = numpy.histogram(segments_per_workspace, bins=numpy.arange(
            0, max(segments_per_workspace), 1), new=True)
        plt.plot(bins[:-1], vals)
        plt.xlabel("segments per block")
        plt.ylabel("absolute frequency")
        if options.hardcopy:
            plt.savefig(
                os.path.expanduser(options.hardcopy % "segments_per_block"))

        plt.figure()
        plt.title("workspaces per label")
        plt.barh(
            range(0, len(labels)), [workspaces_per_label[x] for x in labels], height=0.5)
        plt.yticks(range(0, len(labels)), labels)
        plt.ylabel("workspaces per label")
        plt.xlabel("absolute frequency")
        plt.gca().set_xscale('log')

        if options.hardcopy:
            plt.savefig(
                os.path.expanduser(options.hardcopy % "workspaces_per_label"))

        plt.figure()
        plt.title("segments per label")
        plt.barh(range(0, len(labels)), [segments_per_label[x]
                 for x in labels], height=0.5)
        plt.yticks(range(0, len(labels)), labels)
        plt.ylabel("segments per label")
        plt.xlabel("absolute frequency")
        plt.xticks(range(0, len(labels)), labels)
        if options.hardcopy:
            plt.savefig(
                os.path.expanduser(options.hardcopy % "segments_per_label"))

        if not options.hardcopy:
            plt.show()

    E.Stop()
Example #5
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version=
        "%prog version: $Id: gff2annotator2tsv.py 2861 2010-02-23 17:36:32Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-g",
                      "--genome-file",
                      dest="genome_file",
                      type="string",
                      help="filename with genome.")

    parser.add_option("-f",
                      "--features",
                      dest="features",
                      type="string",
                      help="feature to collect [default=None].")

    parser.add_option("-i",
                      "--files",
                      dest="files",
                      action="append",
                      help="use multiple annotations [default=None].")

    parser.add_option(
        "-a",
        "--annotations",
        dest="annotations",
        type="string",
        help=
        "aggregate name for annotations if only single file is provided from STDIN [default=None]."
    )

    parser.add_option(
        "--input-filename-map",
        dest="input_filename_map",
        type="string",
        help="filename with a map of gene_ids to categories [default=None].")

    parser.add_option(
        "--output-filename-synonyms",
        dest="output_filename_synonyms",
        type="string",
        help=
        "output filename for synonyms. For workspace building, the gff source will be used as the id (instead of the contig) [default=None]."
    )

    parser.add_option("-m",
                      "--max-length",
                      dest="max_length",
                      type="string",
                      help="maximum segment length [default=None].")

    parser.add_option("-s",
                      "--section",
                      dest="section",
                      type="choice",
                      choices=("segments", "annotations", "annotations-genes",
                               "annotations-go", "workspace",
                               "annotations-gff"),
                      help="annotator section [default=None].")

    parser.add_option(
        "--subset",
        dest="subsets",
        type="string",
        action="append",
        help=
        "add filenames to delimit subsets within the gff files. The syntax is filename.gff,label,filename.ids [default=None]."
    )

    parser.add_option(
        "--remove-regex",
        dest="remove_regex",
        type="string",
        help="regular expression of contigs to remove [default=None].")

    parser.set_defaults(
        genome_file=None,
        feature=None,
        section="segments",
        annotations="annotations",
        max_length=100000,
        files=[],
        subsets=[],
        input_filename_map=None,
        output_filename_synonyms=None,
        input_format="gff",
        remove_regex=None,
    )

    (options, args) = E.Start(parser)

    options.files += args
    if len(options.files) == 0:
        options.files.append("-")
    options.files = list(
        itertools.chain(*[re.split("[,; ]+", x) for x in options.files]))

    if options.subsets:
        subsets = collections.defaultdict(list)
        for s in options.subsets:
            filename_gff, label, filename_ids = s.split(",")
            subsets[filename_gff].append((label, filename_ids))
        options.subsets = subsets

    if options.genome_file:
        fasta = IndexedFasta.IndexedFasta(options.genome_file)
    else:
        fasta = None

    if options.section == "segments":
        prefix = "##Segs"
    elif options.section.startswith("annotations"):
        prefix = "##Id"
    elif options.section == "workspace":
        prefix = "##Work"
    else:
        raise ValueError("unknown section %s" % options.section)

    ninput, ncontigs, nsegments, ndiscarded = 0, 0, 0, 0

    if options.remove_regex:
        options.remove_regex = re.compile(options.remove_regex)

    if options.section in ("segments", "workspace"):

        iterator = GTF.iterator_filtered(GFF.iterator(options.stdin),
                                         feature=options.feature)

        if options.output_filename_synonyms:
            outfile_synonyms = open(options.output_filename_synonyms, "w")
            with_records = True
        else:
            outfile_synonyms = None
            with_records = False

        intervals = GTF.readAsIntervals(iterator, with_records=with_records)
        ninput, nsegments, ndiscarded, ncontigs = \
            PipelineEnrichment.outputSegments(options.stdout,
                                              intervals,
                                              options.section,
                                              outfile_synonyms=outfile_synonyms,
                                              max_length=options.max_length,
                                              remove_regex=options.remove_regex)

        if outfile_synonyms:
            outfile_synonyms.close()

    elif options.section == "annotations-go":

        assert options.input_filename_map, "please supply option --input-filename-map"

        iterator = GTF.iterator_filtered(GTF.iterator(options.stdin),
                                         feature=options.feature)

        geneid2categories = IOTools.readMultiMap(
            open(options.input_filename_map, "r"))

        category2segments = collections.defaultdict(list)

        for contig, gffs in GTF.readAsIntervals(iterator,
                                                with_gene_id=True).items():
            if options.remove_regex and options.remove_regex.search(contig):
                continue

            for start, end, geneid in gffs:
                if geneid not in geneid2categories:
                    continue
                for category in geneid2categories[geneid]:
                    category2segments[category].append(nsegments)

                options.stdout.write("%s\t%i\t%s\t(%i,%i)\n" %
                                     (prefix, nsegments, contig, start, end))
                nsegments += 1

        for category, segments in category2segments.iteritems():
            options.stdout.write(
                "##Ann\t%s\t%s\n" %
                (category, "\t".join(["%i" % x for x in segments])))
            E.info("set %s annotated with %i segments" %
                   (category, len(segments)))

    elif options.section == "annotations":

        for filename in options.files:

            E.info("adding filename %s" % filename)

            start = nsegments
            is_gtf = False

            if filename == "-":
                iterator = GTF.iterator_filtered(GFF.iterator(sys.stdin),
                                                 feature=options.feature)
                filename = options.annotations
            elif filename.endswith(".gtf"):
                is_gtf = True
                with open(filename, "r") as infile:
                    iterator = GTF.iterator_filtered(GTF.iterator(infile),
                                                     feature=options.feature)

            else:
                with open(filename, "r") as infile:
                    iterator = GTF.iterator_filtered(GFF.iterator(infile),
                                                     feature=options.feature)

            E.debug("processing %s" % (filename))

            if not options.subsets or filename not in options.subsets:
                for contig, gffs in GTF.readAsIntervals(iterator).items():
                    if options.remove_regex and options.remove_regex.search(
                            contig):
                        continue

                    for x in gffs:
                        options.stdout.write(
                            "%s\t%i\t%s\t(%i,%i)\n" %
                            (prefix, nsegments, contig, x[0], x[1]))
                        nsegments += 1

                options.stdout.write("##Ann\t%s\t%s\n" % (filename, "\t".join(
                    ["%i" % x for x in range(start, nsegments)])))
                E.info("set %s annotated with %i segments" %
                       (filename, nsegments - start))

            else:
                raise ValueError("don't know how to filter %s" % filename)

    elif options.section == "annotations-gff":

        for filename in options.files:
            if filename == "-":
                iterator = GTF.iterator(sys.stdin)
            else:
                iterator = GTF.iterator_filtered(
                    GFF.iterator(open(filename, "r")))

            segments = collections.defaultdict(list)
            for gff in iterator:
                segments[":".join((gff.source, gff.feature))].append(
                    (gff.contig, gff.start, gff.end))

            feature2segments = {}

            for feature, s in segments.iteritems():
                s.sort()

                s1 = nsegments

                for contig, start, end in s:
                    if options.remove_regex and options.remove_regex.search(
                            contig):
                        continue

                    options.stdout.write(
                        "%s\t%i\t%s\t(%i,%i)\n" %
                        (prefix, nsegments, contig, start, end))
                    nsegments += 1

                feature2segments[feature] = (s1, nsegments)

        for feature, id_range in feature2segments.iteritems():
            start, end = id_range
            options.stdout.write(
                "##Ann\t%s\t%s\n" %
                (feature, "\t".join(["%i" % x for x in xrange(start, end)])))
            E.info("set %s annotated with %i segments" %
                   (feature, end - start))

    elif options.section == "annotations-genes":

        for filename in options.files:

            E.info("adding filename %s" % filename)

            start = nsegments

            assert filename.endswith(".gtf") or filename.endswith(".gtf.gz"), \
                "requiring .gtf files for gene list filtering, received %s" % filename

            infile = IOTools.openFile(filename)
            iterator = GTF.iterator_filtered(GTF.iterator(infile),
                                             feature=options.feature)

            E.debug("processing %s" % (filename))

            if not options.subsets or filename not in options.subsets:
                # output all
                for contig, gffs in GTF.readAsIntervals(iterator).items():
                    if options.remove_regex and options.remove_regex.search(
                            contig):
                        continue

                    for x in gffs:
                        options.stdout.write(
                            "%s\t%i\t%s\t(%i,%i)\n" %
                            (prefix, nsegments, contig, x[0], x[1]))
                        nsegments += 1

                options.stdout.write("##Ann\t%s\t%s\n" % (filename, "\t".join(
                    ["%i" % x for x in range(start, nsegments)])))
                E.info("set %s annotated with %i segments" %
                       (filename, nsegments - start))

            else:
                # create subsets
                E.debug("applying subsets for %s" % filename)
                geneid2label, label2segments = collections.defaultdict(
                    list), {}
                for label, filename_ids in options.subsets[filename]:
                    gene_ids = IOTools.readList(open(filename_ids, "r"))
                    for gene_id in gene_ids:
                        geneid2label[gene_id].append(label)
                    label2segments[label] = []

                for contig, gffs in GTF.readAsIntervals(
                        iterator, with_gene_id=True).items():

                    if options.remove_regex and options.remove_regex.search(
                            contig):
                        continue

                    for start, end, gene_id in gffs:
                        if gene_id not in geneid2label:
                            continue
                        for label in geneid2label[gene_id]:
                            label2segments[label].append(nsegments)

                        options.stdout.write(
                            "%s\t%i\t%s\t(%i,%i)\n" %
                            (prefix, nsegments, contig, start, end))
                        nsegments += 1

                for label, segments in label2segments.iteritems():
                    options.stdout.write(
                        "##Ann\t%s\t%s\n" %
                        (label, "\t".join(["%i" % x for x in segments])))
                    E.info("set %s (%s) annotated with %i segments" %
                           (label, filename, len(segments)))

    E.info("ninput=%i, ncontigs=%i, nsegments=%i, ndiscarded=%i" %
           (ninput, ncontigs, nsegments, ndiscarded))

    E.Stop()
Example #6
def main(argv=sys.argv):

    parser = E.OptionParser(
        version=
        "%prog version: $Id: annotator_distance.py 2861 2010-02-23 17:36:32Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option(
        "-a",
        "--annotations-tsv-file",
        dest="filename_annotations",
        type="string",
        help=
        "filename mapping gene ids to annotations (a tab-separated table with two-columns) [default=%default]."
    )

    parser.add_option("-r",
                      "--resolution",
                      dest="resolution",
                      type="int",
                      help="resolution of count vector [default=%default].")

    parser.add_option(
        "-b",
        "--num-bins",
        dest="num_bins",
        type="int",
        help="number of bins in count vector [default=%default].")

    parser.add_option("-i",
                      "--num-samples",
                      dest="num_samples",
                      type="int",
                      help="sample size to compute [default=%default].")

    parser.add_option(
        "-w",
        "--workspace-bed-file",
        dest="filename_workspace",
        type="string",
        help="filename with workspace information [default=%default].")

    parser.add_option(
        "--workspace-builder",
        dest="workspace_builder",
        type="choice",
        choices=("gff", "gtf-intergenic", "gtf-intronic", "gtf-genic"),
        help="given a gff/gtf file build a workspace [default=%default].")

    parser.add_option(
        "--workspace-labels",
        dest="workspace_labels",
        type="choice",
        choices=("none", "direction", "annotation"),
        help="labels to use for the workspace workspace [default=%default].")

    parser.add_option(
        "--sampler",
        dest="sampler",
        type="choice",
        choices=("permutation", "gaps"),
        help=
        "sampler to use. The sampler determines the null model of how segments are distributed in the workspace  [default=%default]"
    )

    parser.add_option(
        "--counter",
        dest="counters",
        type="choice",
        action="append",
        choices=("transcription", "closest-distance", "all-distances"),
        help=
        "counter to use. The counter computes the quantity of interest [default=%default]"
    )

    parser.add_option("--analysis",
                      dest="analysis",
                      type="choice",
                      action="append",
                      choices=("proximity", "area-under-curve"),
                      help="analysis to perform [default=%default]")

    parser.add_option("--transform-counts",
                      dest="transform_counts",
                      type="choice",
                      choices=("raw", "cumulative"),
                      help="cumulate counts [default=%default].")

    parser.add_option(
        "-s",
        "--segments",
        dest="filename_segments",
        type="string",
        help="filename with segment information [default=%default].")

    parser.add_option("--xrange",
                      dest="xrange",
                      type="string",
                      help="xrange to plot [default=%default]")

    parser.add_option("-o",
                      "--logscale",
                      dest="logscale",
                      type="string",
                      help="use logscale on x, y or xy [default=%default]")

    parser.add_option("-p",
                      "--plot",
                      dest="plot",
                      action="store_true",
                      help="output plots [default=%default]")

    parser.add_option("--hardcopy",
                      dest="hardcopy",
                      type="string",
                      help="output hardcopies to file [default=%default]")

    parser.add_option("--no-fdr",
                      dest="do_fdr",
                      action="store_false",
                      help="do not compute FDR rates [default=%default]")

    parser.add_option("--segments-format",
                      dest="segments_format",
                      type="choice",
                      choices=("gtf", "bed"),
                      help="format of segments file [default=%default].")

    parser.add_option(
        "--truncate",
        dest="truncate",
        action="store_true",
        help="truncate segments extending beyond a workspace [default=%default]"
    )

    parser.add_option(
        "--remove-overhangs",
        dest="remove_overhangs",
        action="store_true",
        help="remove segments extending beyond a workspace[default=%default]")

    parser.add_option(
        "--keep-ambiguous",
        dest="keep_ambiguous",
        action="store_true",
        help=
        "keep segments extending to more than one workspace [default=%default]"
    )

    parser.set_defaults(
        filename_annotations=None,
        filename_workspace="workspace.gff",
        filename_segments="FastDown.gtf",
        filename_annotations_gtf="../data/tg1_territories.gff",
        workspace_builder="gff",
        workspace_labels="none",
        sampler="permutation",
        truncate=False,
        num_bins=10000,
        num_samples=10,
        resolution=100,
        plot_samples=False,
        plot_envelope=True,
        counters=[],
        transform_counts="raw",
        xrange=None,
        plot=False,
        logscale=None,
        output_all=False,
        do_test=False,
        analysis=[],
        do_fdr=True,
        hardcopy="%s.png",
        segments_format="gtf",
        remove_overhangs=False,
    )

    (options, args) = E.Start(parser, argv=argv, add_output_options=True)

    ###########################################
    # setup options
    if options.sampler == "permutation":
        sampler = SamplerPermutation
    elif options.sampler == "gaps":
        sampler = SamplerGaps

    if options.xrange:
        options.xrange = map(float, options.xrange.split(","))

    if len(options.counters) == 0:
        raise ValueError("please specify at least one counter.")

    if len(options.analysis) == 0:
        raise ValueError("please specify at least one analysis.")

    if options.workspace_labels == "annotation" and not options.filename_annotations:
        raise ValueError(
            "please specify --annotations-tsv-file is --workspace-labels=annotations."
        )

    ###########################################
    # read data
    if options.workspace_labels == "annotation":

        def constant_factory(value):
            return itertools.repeat(value).next

        def dicttype():
            return collections.defaultdict(constant_factory(("unknown", )))

        map_id2annotations = IOTools.readMultiMap(open(
            options.filename_annotations, "r"),
                                                  dtype=dicttype)
    else:
        map_id2annotations = {}

    workspace = readWorkspace(
        open(options.filename_workspace, "r"),
        options.workspace_builder,
        options.workspace_labels,
        map_id2annotations)

    E.info("read workspace for %i contigs" % (len(workspace)))

    indexed_workspace = indexIntervals(workspace, with_values=True)
    segments = readSegments(open(options.filename_segments, "r"),
                            indexed_workspace,
                            format=options.segments_format,
                            keep_ambiguous=options.keep_ambiguous,
                            truncate=options.truncate,
                            remove_overhangs=options.remove_overhangs)

    nsegments = 0
    for contig, vv in segments.iteritems():
        nsegments += len(vv)

    E.info("read %i segments for %i contigs" % (nsegments, len(workspace)))
    indexed_segments = indexIntervals(segments, with_values=False)

    if nsegments == 0:
        E.warn("no segments read - no computation done.")
        E.Stop()
        return

    # build labels
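    # (v[0] and v[1] hold the labels on the left and right side of each
    # workspace interval)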
    labels = collections.defaultdict(int)
    for contig, vv in workspace.iteritems():
        for start, end, v in vv:
            for l in v[0]:
                labels[l] += 1
            for l in v[1]:
                labels[l] += 1

    E.info("found %i workspace labels" % len(labels))

    ###########################################
    # setup counting containers
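    # each CountingResults bundles one observed counter and num_samples
    # simulated counters of the same shape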
    counters = []
    for cc in options.counters:

        if cc == "transcription":
            counter = CounterTranscription
        elif cc == "closest-distance":
            counter = CounterClosestDistance
        elif cc == "all-distances":
            counter = CounterAllDistances

        # choose the smallest unsigned integer type that can hold the
        # per-bin counts; a bin can never accumulate more than nsegments
        if nsegments < 256:
            dtype = numpy.uint8
        elif nsegments < 65536:
            dtype = numpy.uint16
        elif nsegments < 4294967296:
            dtype = numpy.uint32
        else:
            dtype = numpy.uint64

        E.debug("choosen dtype %s" % str(dtype))

        E.info("samples space is %i bases: %i bins at %i resolution" % (
            options.num_bins * options.resolution,
            options.num_bins,
            options.resolution,
        ))

        E.info("allocating counts: %i bytes (%i labels, %i samples, %i bins)" %
               (
                   options.num_bins * len(labels) * dtype().itemsize *
                   (options.num_samples + 1),
                   len(labels),
                   options.num_samples,
                   options.num_bins,
               ))

        c = CountingResults(labels)
        c.mObservedCounts = counter(labels,
                                    options.num_bins,
                                    options.resolution,
                                    dtype=dtype)

        simulated_counts = []
        for x in range(options.num_samples):
            simulated_counts.append(
                counter(labels,
                        options.num_bins,
                        options.resolution,
                        dtype=dtype))
        c.mSimulatedCounts = simulated_counts
        c.mName = c.mObservedCounts.mName

        counters.append(c)

        E.info("allocated memory successfully")

    segments_per_workspace = []
    segment_sizes = []
    segments_per_label = collections.defaultdict(int)
    workspaces_per_label = collections.defaultdict(int)

    ############################################
    # get observed and simulated counts
    nworkspaces, nempty_workspaces, nempty_contigs, nmiddle = 0, 0, 0, 0
    iteration2 = 0
    for contig, vv in workspace.iteritems():

        iteration2 += 1
        E.info("counting %i/%i: %s %i segments" %
               (iteration2, len(workspace), contig, len(vv)))

        if len(vv) == 0:
            continue

        iteration1 = 0
        for work_start, work_end, v in vv:

            left_labels, right_labels = v[0], v[1]

            iteration1 += 1

            # skip contigs without any segments
            if contig not in indexed_segments:
                nempty_contigs += 1
                continue

            r = indexed_segments[contig].find(work_start, work_end)
            segments_per_workspace.append(len(r))

            if not r:
                nempty_workspaces += 1
                continue

            # collect segments and stats
            nworkspaces += 1
            observed = [(x.start, x.end) for x in r]
            observed.sort()
            segment_sizes.extend([x[1] - x[0] for x in observed])

            # collect basic counts
            for label in list(left_labels) + list(right_labels):
                workspaces_per_label[label] += 1
                segments_per_label[label] += len(observed)

            # add observed counts
            for counter in counters:
                counter.mObservedCounts.addCounts(observed, work_start,
                                                  work_end, left_labels,
                                                  right_labels)

            # create sampler
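            # (each call to s.sample() presumably returns one randomized
            # placement of the observed segments within the workspace)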
            s = sampler(observed, work_start, work_end)

            # add simulated counts
            for iteration in range(options.num_samples):
                simulated = s.sample()
                for counter in counters:
                    counter.mSimulatedCounts[iteration].addCounts(
                        simulated, work_start, work_end, left_labels,
                        right_labels)

    E.info("counting finished")
    E.info(
        "nworkspaces=%i, nmiddle=%i, nempty_workspaces=%i, nempty_contigs=%i" %
        (nworkspaces, nmiddle, nempty_workspaces, nempty_contigs))

    ######################################################
    # transform counts
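    # (the transform is applied when plotting; note that "raw" counts are
    # still normalized before plotting)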

    if options.transform_counts == "cumulative":
        transform = cumulative_transform
    elif options.transform_counts == "raw":
        transform = normalize_transform

    ####################################################
    # analysis

    if "proximity" in options.analysis:
        outfile_proximity = E.openOutputFile("proximity")
        outfile_proximity.write("\t".join(
            ("label", "observed", "pvalue", "expected", "CIlower", "CIupper",
             "qvalue", "segments", "workspaces")) + "\n")
    else:
        outfile_proximity = None

    if "area-under-curve" in options.analysis:
        outfile_auc = E.openOutputFile("auc")
        outfile_auc.write("label\tobserved\texpected\tCIlower\tCIupper\n")
    else:
        outfile_auc = None

    # qvalue: expected proportion of false positives among the results
    # called significant at a given p-value, estimated from the p-values
    # of the simulated data
    if options.do_fdr:
        E.info("computing pvalues for fdr")
        # collect the p-values of all simulated results across counters
        # and labels to compute the FDR
        sim_pvalues = []
        for counter in counters:
            for label in labels:
                E.info("working on counter:%s label:%s" % (counter, label))

                medians = counter.getMedians(label)
                for median in medians:
                    pvalue = float(
                        scipy.stats.percentileofscore(medians, median)) / 100.0
                    sim_pvalues.append(pvalue)

        sim_pvalues.sort()
    else:
        sim_pvalues = []

    # compute observed p-values
    for counter in counters:
        counter.update()

    obs_pvalues = []
    for counter in counters:
        for label in labels:
            obs_pvalues.append(counter.mStats[label].pvalue)
    obs_pvalues.sort()

    # compute q-values from the observed and simulated p-values
    if options.do_fdr:
        for counter in counters:
            counter.updateFDR(obs_pvalues, sim_pvalues)

    for counter in counters:

        outofbounds_sim, totals_sim = 0, 0
        outofbounds_obs, totals_obs = 0, 0
        for label in labels:
            for sample in range(options.num_samples):
                if counter.mSimulatedCounts[sample].mOutOfBounds[label]:
                    E.debug(
                        "out of bounds: sample %i, label %s, counts=%i" %
                        (sample, label,
                         counter.mSimulatedCounts[sample].mOutOfBounds[label]))
                    outofbounds_sim += counter.mSimulatedCounts[
                        sample].mOutOfBounds[label]
                totals_sim += counter.mSimulatedCounts[sample].mTotals[label]

            outofbounds_obs += counter.mObservedCounts.mOutOfBounds[label]
            totals_obs += counter.mObservedCounts.mTotals[label]

        E.info(
            "out of bounds observations: observed=%i/%i (%5.2f%%), simulations=%i/%i (%5.2f%%)"
            % (
                outofbounds_obs,
                totals_obs,
                100.0 * outofbounds_obs / totals_obs,
                outofbounds_sim,
                totals_sim,
                100.0 * outofbounds_sim / totals_sim,
            ))

        for label in labels:

            if outfile_auc:
                # mmin/mmax/mmean form the envelope of the simulations
                mmin, mmax, mmean = counter.getEnvelope(
                    label, transform=normalize_transform)
                obs = normalize_transform(
                    counter.mObservedCounts[label],
                    counter.mObservedCounts.mOutOfBounds[label])

                def block_iterator(a1, a2, a3, num_bins):
                    """yield maximal runs of bins in which a1 exceeds a2,
                    reporting the excess of a1 over a3 within each run."""
                    x = 0
                    while x < num_bins:
                        while x < num_bins and a1[x] <= a2[x]:
                            x += 1
                        start = x
                        while x < num_bins and a1[x] > a2[x]:
                            x += 1
                        end = x
                        total_a1 = a1[start:end].sum()
                        total_a3 = a3[start:end].sum()
                        if total_a1 > total_a3:
                            yield (total_a1 - total_a3, start, end, total_a1,
                                   total_a3)

                blocks = list(
                    block_iterator(obs, mmax, mmean, options.num_bins))

                if options.output_all:
                    for delta, start, end, total_obs, total_mean in blocks:
                        if end - start <= 1:
                            continue
                        outfile_auc.write(
                            "%s\t%i\t%i\t%i\t%f\t%f\t%f\t%f\t%f\n" %
                            (label, start * options.resolution,
                             end * options.resolution,
                             (end - start) * options.resolution, total_obs,
                             total_mean, delta, total_obs / total_mean, 100.0 *
                             (total_obs / total_mean - 1.0)))

                # output best block
                if blocks:
                    blocks.sort()
                    delta, start, end, total_obs, total_mean = blocks[-1]

                    outfile_auc.write(
                        "%s\t%i\t%i\t%i\t%f\t%f\t%f\t%f\t%f\n" %
                        (label, start * options.resolution,
                         end * options.resolution,
                         (end - start) * options.resolution, total_obs,
                         total_mean, delta, total_obs / total_mean, 100.0 *
                         (total_obs / total_mean - 1.0)))

            if outfile_proximity:

                # find error bars at median
                st = counter.mStats[label]
                outfile_proximity.write(
                    "%s\t%i\t%f\t%i\t%i\t%i\t%s\t%i\t%i\n" % (
                        label,
                        st.observed * options.resolution,
                        st.pvalue,
                        st.expected * options.resolution,
                        st.ci95lower * options.resolution,
                        st.ci95upper * options.resolution,
                        IOTools.val2str(st.qvalue),
                        segments_per_label[label],
                        workspaces_per_label[label],
                    ))

    if options.plot:

        for counter in counters:
            plotCounts(counter, options, transform)

        # plot summary stats
        plt.figure()
        plt.title("distribution of workspace length")
        data = []
        for contig, segs in workspace.iteritems():
            if len(segs) == 0:
                continue
            data.extend([x[1] - x[0] for x in segs])

        vals, bins = numpy.histogram(data,
                                     bins=numpy.arange(0, max(data), 100))

        t = float(sum(vals))
        plt.plot(bins[:-1], numpy.cumsum(vals) / t)
        plt.gca().set_xscale('log')
        plt.legend()
        plt.xlabel("size of workspace")
        plt.ylabel("cumulative relative frequency")
        if options.hardcopy:
            plt.savefig(os.path.expanduser(options.hardcopy %
                                           "workspace_size"))

        plt.figure()
        plt.title("segments per block")
        vals, bins = numpy.histogram(segments_per_workspace,
                                     bins=numpy.arange(
                                         0, max(segments_per_workspace), 1))
        plt.plot(bins[:-1], vals)
        plt.xlabel("segments per block")
        plt.ylabel("absolute frequency")
        if options.hardcopy:
            plt.savefig(
                os.path.expanduser(options.hardcopy % "segments_per_block"))

        plt.figure()
        plt.title("workspaces per label")
        plt.barh(range(0, len(labels)),
                 [workspaces_per_label[x] for x in labels],
                 height=0.5)
        plt.yticks(range(0, len(labels)), labels)
        plt.ylabel("workspaces per label")
        plt.xlabel("absolute frequency")
        plt.gca().set_xscale('log')

        if options.hardcopy:
            plt.savefig(
                os.path.expanduser(options.hardcopy % "workspaces_per_label"))

        plt.figure()
        plt.title("segments per label")
        plt.barh(range(0, len(labels)),
                 [segments_per_label[x] for x in labels],
                 height=0.5)
        plt.yticks(range(0, len(labels)), labels)
        plt.ylabel("segments per label")
        plt.xlabel("absolute frequency")
        plt.xticks(range(0, len(labels)), labels)
        if options.hardcopy:
            plt.savefig(
                os.path.expanduser(options.hardcopy % "segments_per_label"))

        if not options.hardcopy:
            plt.show()

    E.Stop()

def main( argv = None ):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None: argv = sys.argv

    parser = E.OptionParser( version = "%prog version: $Id: gff2annotator2tsv.py 2861 2010-02-23 17:36:32Z andreas $", usage = globals()["__doc__"])

        
    parser.add_option( "-g", "--genome-file", dest="genome_file", type="string",
                       help="filename with genome."  )

    parser.add_option( "-f", "--features", dest="features", type="string", 
                       help="feature to collect [default=None]."  )

    parser.add_option( "-i", "--files", dest="files", action="append",
                       help="use multiple annotations [default=None]."  )

    parser.add_option(  "-a", "--annotations", dest="annotations", type="string", 
                       help="aggregate name for annotations if only single file is provided from STDIN [default=None]."  )

    parser.add_option(  "--input-filename-map", dest="input_filename_map", type="string", 
                       help="filename with a map of gene_ids to categories [default=None]."  )

    parser.add_option(  "--output-filename-synonyms", dest="output_filename_synonyms", type="string", 
                       help="output filename for synonyms. For workspace building, the gff source will be used as the id (instead of the contig) [default=None]."  )

    parser.add_option( "-m", "--max-length", dest="max_length", type="string", 
                       help="maximum segment length [default=None]."  )

    parser.add_option( "-s", "--section", dest="section", type="choice", 
                       choices=("segments", "annotations", "annotations-genes", "annotations-go", "workspace", "annotations-gff" ),
                       help="annotator section [default=None]."  )

    parser.add_option( "--subset", dest="subsets", type="string", action="append",
                       help="add filenames to delimit subsets within the gff files. The syntax is filename.gff,label,filename.ids [default=None]."  )

    parser.add_option( "--remove-regex", dest="remove_regex", type="string", 
                       help="regular expression of contigs to remove [default=None]."  )

    parser.set_defaults(
        genome_file = None,
        feature = None,
        section = "segments",
        annotations = "annotations",
        max_length = 100000,
        files = [],
        subsets = [],
        input_filename_map = None,
        output_filename_synonyms = None,
        input_format = "gff",
        remove_regex = None,
        )

    (options, args) = E.Start( parser )

    options.files += args
    if len(options.files) == 0: options.files.append("-")
    options.files = list( itertools.chain( *[ re.split( "[,; ]+", x) for x in options.files ] ) )

    if options.subsets:
        subsets = collections.defaultdict( list )
        for s in options.subsets: 
            filename_gff,label,filename_ids = s.split( "," )
            subsets[filename_gff].append( (label,filename_ids) )
        options.subsets = subsets

    if options.genome_file:
        fasta = IndexedFasta.IndexedFasta( options.genome_file )
    else:
        fasta = None
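
    # `prefix` selects the annotator section written below; every segment
    # line has the form "<prefix>\t<id>\t<contig>\t(start,end)", and "##Ann"
    # lines list the segment ids belonging to each label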

    if options.section == "segments":
        prefix = "##Segs"
    elif options.section.startswith( "annotations" ):
        prefix = "##Id"
    elif options.section == "workspace":
        prefix = "##Work"
    else:
        raise ValueError("unknown section %s" % options.section)
        
    ninput, ncontigs, nsegments, ndiscarded = 0, 0, 0, 0

    if options.remove_regex:
        options.remove_regex = re.compile( options.remove_regex )

    if options.section in ("segments", "workspace"):

        iterator = GTF.iterator_filtered( GFF.iterator( options.stdin ),
                                          feature=options.feature )

        if options.output_filename_synonyms:
            outfile_synonyms = open(options.output_filename_synonyms, "w")
            with_records = True
        else:
            outfile_synonyms = None
            with_records = False

        # records are needed so that the gff source field can be used as the
        # id when writing synonyms
        intervals = GTF.readAsIntervals( iterator, with_records = with_records )
        ninput, nsegments, ndiscarded, ncontigs = \
            PipelineEnrichment.outputSegments( options.stdout,
                                               intervals,
                                               options.section,
                                               outfile_synonyms = outfile_synonyms,
                                               max_length = options.max_length,
                                               remove_regex = options.remove_regex )
            
        if outfile_synonyms:
            outfile_synonyms.close()

    elif options.section == "annotations-go":

        assert options.input_filename_map, "please supply option --input-filename-map" 

        iterator = GTF.iterator_filtered( GTF.iterator( options.stdin ),
                                          feature=options.feature )

        geneid2categories = IOTools.readMultiMap( open( options.input_filename_map, "r") )

        category2segments = collections.defaultdict( list )

        for contig, gffs in GTF.readAsIntervals( iterator, with_gene_id = True ).items():
            if options.remove_regex and options.remove_regex.search( contig ): continue
            
            for start, end, geneid in gffs:
                if geneid not in geneid2categories: continue
                for category in geneid2categories[geneid]:
                    category2segments[category].append(nsegments)

                options.stdout.write( "%s\t%i\t%s\t(%i,%i)\n" % (prefix, nsegments, contig, start, end ) )
                nsegments += 1                        
            
        for category, segments in category2segments.iteritems():
            options.stdout.write("##Ann\t%s\t%s\n" % (category, "\t".join( ["%i" % x for x in segments ] ) ) )
            E.info( "set %s annotated with %i segments" % (category, len(segments)) )

    elif options.section == "annotations":

        for filename in options.files:

            E.info( "adding filename %s" % filename )

            start = nsegments
            is_gtf = False

            if filename == "-":
                iterator = GTF.iterator_filtered( GFF.iterator( sys.stdin ),
                                                  feature=options.feature )
                filename = options.annotations
            elif filename.endswith(".gtf"):
                is_gtf = True
                with open( filename, "r") as infile:
                    iterator = GTF.iterator_filtered( GTF.iterator( infile ),
                                                      feature=options.feature )
                
            else:
                with open( filename, "r") as infile:
                    iterator = GTF.iterator_filtered( GFF.iterator( infile ),
                                                      feature=options.feature )
           
            E.debug("processing %s" % (filename))

            if not options.subsets or filename not in options.subsets:
                for contig, gffs in GTF.readAsIntervals( iterator ).items():
                    if options.remove_regex and options.remove_regex.search( contig ): continue

                    for x in gffs:
                        options.stdout.write( "%s\t%i\t%s\t(%i,%i)\n" % (prefix, nsegments, contig, x[0], x[1] ) )
                        nsegments += 1

                options.stdout.write("##Ann\t%s\t%s\n" % (filename, "\t".join( ["%i" % x for x in range(start, nsegments) ] ) ) )
                E.info( "set %s annotated with %i segments" % (filename, nsegments - start) )

            else:
                raise ValueError("don't know how to filter %s" % filename )

    elif options.section == "annotations-gff":

        feature2segments = {}

        for filename in options.files:
            if filename == "-":
                iterator = GTF.iterator( sys.stdin )
            else:
                iterator = GTF.iterator_filtered( GFF.iterator( open( filename, "r") ) )

            # group intervals by "source:feature"; each combination becomes
            # one annotation set in the output
            segments = collections.defaultdict( list )
            for gff in iterator:
                segments[":".join((gff.source,gff.feature))].append( (gff.contig,gff.start, gff.end) )

            for feature, s in segments.iteritems():
                s.sort()

                s1 = nsegments

                for contig, start, end in s:
                    if options.remove_regex and options.remove_regex.search( contig ): continue

                    options.stdout.write( "%s\t%i\t%s\t(%i,%i)\n" % (prefix, nsegments, contig, start, end ) )
                    nsegments += 1

                feature2segments[feature] = (s1, nsegments)
            
        for feature, id_range in feature2segments.iteritems():
            start, end = id_range
            options.stdout.write("##Ann\t%s\t%s\n" % (feature, "\t".join( ["%i" % x for x in xrange( start,end) ] ) ) )
            E.info( "set %s annotated with %i segments" % (feature, end-start) )

    elif options.section == "annotations-genes":

        for filename in options.files:

            E.info( "adding filename %s" % filename )

            start = nsegments

            assert filename.endswith(".gtf") or filename.endswith(".gtf.gz"), \
                "requiring .gtf files for gene list filtering, received %s" % filename

            infile = IOTools.openFile( filename )
            iterator = GTF.iterator_filtered( GTF.iterator( infile ),
                                              feature=options.feature )
                
            E.debug("processing %s" % (filename))
            
            if not options.subsets or filename not in options.subsets:
                ## output all
                for contig, gffs in GTF.readAsIntervals( iterator ).items():
                    if options.remove_regex and options.remove_regex.search( contig ): continue

                    for x in gffs:
                        options.stdout.write( "%s\t%i\t%s\t(%i,%i)\n" % (prefix, nsegments, contig, x[0],x[1] ) )
                        nsegments += 1

                options.stdout.write("##Ann\t%s\t%s\n" % (filename, "\t".join( ["%i" % x for x in range(start, nsegments) ] ) ) )
                E.info( "set %s annotated with %i segments" % (filename, nsegments - start) )

            else:
                ## create subsets
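                ## (each subset file restricts the annotation to the listed
                ## gene ids; a gene id may carry several labels)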
                E.debug("applying subsets for %s" % filename )
                geneid2label, label2segments = collections.defaultdict(list), {}
                for label, filename_ids in options.subsets[filename]:
                    gene_ids = IOTools.readList( open(filename_ids, "r") )
                    for gene_id in gene_ids: geneid2label[gene_id].append( label )
                    label2segments[label] = []

                for contig, gffs in GTF.readAsIntervals( iterator, with_gene_id = True ).items():

                    if options.remove_regex and options.remove_regex.search( contig ): continue

                    for start, end, gene_id in gffs:
                        if gene_id not in geneid2label: continue
                        for label in geneid2label[gene_id]:
                            label2segments[label].append(nsegments)
                            
                        options.stdout.write( "%s\t%i\t%s\t(%i,%i)\n" % (prefix, nsegments, contig, start, end ) )
                        nsegments += 1                        
                        
                for label, segments in label2segments.iteritems():
                    options.stdout.write("##Ann\t%s\t%s\n" % (label, "\t".join( ["%i" % x for x in segments ] ) ) )
                    E.info( "set %s (%s) annotated with %i segments" % (label, filename, len(segments)) )

    E.info( "ninput=%i, ncontigs=%i, nsegments=%i, ndiscarded=%i" % (ninput, ncontigs, nsegments, ndiscarded))

    E.Stop()