Example #1
def main(argv=sys.argv):

    parser = E.OptionParser(version="%prog version: $Id: mali2mali.py 2782 2009-09-10 11:40:29Z andreas $",
                            usage=globals()["__doc__"])

    parser.add_option("-i", "--input-format", dest="input_format", type="choice",
                      choices=(
                          "plain", "fasta", "clustal", "stockholm", "phylip"),
                      help="input format of multiple alignment [default=%default].")

    parser.add_option("-o", "--output-format", dest="output_format", type="choice",
                      choices=(
                          "plain", "fasta", "stockholm", "phylip", "nexus", "plain-fasta"),
                      help="output format of multiple alignment [default=%default].")

    parser.add_option("--with-ranges", dest="with_ranges", action="store_true",
                      help="output alignment ranges (suffix /from-to after identifier) [default=%default].")

    parser.add_option("--without-ranges", dest="with_ranges", action="store_false",
                      help="do not output alignment ranges (suffix /from-to after identifier) [default=%default].")

    parser.add_option("-u", "--allow-duplicates", dest="allow_duplicates", action="store_true",
                      help="permit duplicate entries [default=%default].")

    parser.add_option("-m", "--method", dest="methods", type="string",
                      help="""methods to apply. Several methods can be specified in a ','-separated list [default=%default]."""  )

    parser.add_option("-p", "--parameters", dest="parameters", type="string",
                      help="parameter stack for methods that require one [default=%default].")

    parser.add_option("-a", "--mask-char", dest="mask_char", type="string",
                      help="character to identify/set masked characters [default=%default].")

    parser.set_defaults(
        input_format="fasta",
        output_format="fasta",
        methods="",
        parameters="",
        mask_char="x",
        gap_chars="-.nN",
        with_ranges=True,
        allow_duplicates=False,
    )

    (options, args) = E.Start(parser)

    options.methods = options.methods.split(",")
    options.parameters = options.parameters.split(",")

    # 1. read multiple alignment in various formats
    if options.allow_duplicates:
        mali = Mali.SequenceCollection()
    else:
        mali = Mali.Mali()

    t1 = time.time()

    mali.readFromFile(options.stdin, format=options.input_format)

    E.info("read mali with %i entries in %i seconds." %
           (len(mali), time.time() - t1))

    if len(mali) == 0:
        raise ValueError("empty multiple alignment")

    for method in options.methods:

        t1 = time.time()

        if method == "remove-unaligned-ends":
            mali.removeUnalignedEnds()
        elif method == "remove-end-gaps":
            mali.removeEndGaps()
        elif method == "remove-all-gaps":
            mali.removeGaps(minimum_gaps=len(mali))
        elif method == "remove-any-gaps":
            mali.removeGaps(minimum_gaps=1)
        elif method == "remove-some-gaps":
            minimum_gaps = int(options.parameters[0])
            del options.parameters[0]
            mali.removeGaps(minimum_gaps=minimum_gaps)
        elif method == "remove-empty-sequences":
            mali.removeEmptySequences()
        elif method == "upper":
            mali.upperCase()
        elif method == "lower":
            mali.lowerCase()
        elif method == "mark-codons":
            mali.markCodons()
        elif method == "remove-stops":
            mali.removePattern(lambda x: x.upper() in ("TAG", "TAA", "TGA"),
                               allowed_matches=0,
                               minimum_matches=1,
                               delete_frame=3,
                               search_frame=3)
        elif method == "shift-alignment":
            map_id2offset = IOTools.ReadMap(open(options.parameters[0], "r"),
                                            map_functions=(str, int))
            del options.parameters[0]
            mali.shiftAlignment(map_id2offset)
        elif method == "propagate-masks":
            mali.propagateMasks(mask_char=options.mask_char)

        elif method == "recount":
            mali.recount()

        elif method in ("mark-transitions", "filter-odd-transitions", "filter-even-transitions",
                        "keep-even-segments", "keep-odd-segments"):

            if os.path.exists(options.parameters[0]):
                map_id2transitions = IOTools.readMultiMap(open(options.parameters[0], "r"),
                                                          map_functions=(str, int))
            else:
                map_id2transitions = {}
                r = map(int, options.parameters[0].split(':'))
                r.sort()
                map_id2transitions["mali"] = r

            del options.parameters[0]
            if method == "mark-transitions":
                mali.markTransitions(map_id2transitions)
            elif method in ("filter-odd-transitions", "keep-even-segments"):
                mali.markTransitions(map_id2transitions, mode="keep-odd")
            elif method in ("filter-even-transitions", "keep-odd-segments"):
                mali.markTransitions(map_id2transitions, mode="keep-even")

        elif method == "propagate-transitions":
            mali.propagateTransitions()

        elif method == "map-annotation":
            # map annotations in one mali (stockholm-format) to the annotations in another.
            # Note: the first two sequence identifiers must be shared and the
            # sequences must be of the same length
            other_mali = Mali.Mali()
            other_mali.readFromFile(
                open(options.parameters[0], "r"), format="stockholm")
            del options.parameters[0]
            mali.copyAnnotations(other_mali)

        elif method == "add-annotation":
            annotation_type, annotation_file = options.parameters[:2]
            del options.parameters[:2]
            AddAnnotation(mali, annotation_type, annotation_file)

        elif method == "mask-columns":
            annotation_type, annotation_file = options.parameters[:2]
            del options.parameters[:2]
            maskColumns(mali, annotation_type, annotation_file)

        elif method == "remove-unaligned-pairs":
            removeUnalignedPairs(mali, options)

        elif method == "filter-3rd":
            filterMali(mali, "3rd")

        elif method == "filter-4d":
            filterMali(mali, "4d")

        elif method in ("mask-seg", "mask-bias"):
            a, b = method.split("-")
            maskMali(mali, b)

        elif method == "exclude-with-stop":
            mali.filter(method="with-stop")

        elif method == "exclude-with-stop":
            mali.filter(method="with-frameshift")

        E.info("applied method %s in %i seconds." % (method, time.time() - t1))

    mali.writeToFile(options.stdout,
                     format=options.output_format,
                     write_ranges=options.with_ranges)

    E.Stop()
Example #2
        ninput, nsegments, ndiscarded, ncontigs = \
            PipelineEnrichment.outputSegments( options.stdout,
                                               intervals,
                                               options.section,
                                               outfile_synonyms = outfile_synonyms,
                                               max_length = options.max_length,
                                               remove_regex = options.remove_regex )
            
        if outfile_synonyms:
            outfile_synonyms.close()

    elif options.section == "annotations-go":

        assert options.input_filename_map, "please supply option --input-filename-map" 

        iterator = GTF.iterator_filtered( GTF.iterator( options.stdin ),
                                          feature=options.feature )

        geneid2categories = IOTools.readMultiMap( open( options.input_filename_map, "r") )

        category2segments = collections.defaultdict( list )

        for contig, gffs in GTF.readAsIntervals( iterator, with_gene_id = True ).items():
            if options.remove_regex and options.remove_regex.search( contig ): continue
            
            for start, end, geneid in gffs:
                if geneid not in geneid2categories: continue
                for category in geneid2categories[geneid]:
                    category2segments[category].append(nsegments)

                options.stdout.write( "%s\t%i\t%s\t(%i,%i)\n" % (prefix, nsegments, contig, start, end ) )
                nsegments += 1                        
            
        for category, segments in category2segments.iteritems():
            options.stdout.write( "##Ann\t%s\t%s\n" % (category,
                                                       "\t".join( ["%i" % x for x in segments] ) ) )
            E.info( "set %s annotated with %i segments" % (category, len(segments)) )
Example #3
def main(argv=sys.argv):

    parser = E.OptionParser(
        version=
        "%prog version: $Id: mali2mali.py 2782 2009-09-10 11:40:29Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option(
        "-i",
        "--input-format",
        dest="input_format",
        type="choice",
        choices=("plain", "fasta", "clustal", "stockholm", "phylip"),
        help="input format of multiple alignment [default=%default].")

    parser.add_option(
        "-o",
        "--output-format",
        dest="output_format",
        type="choice",
        choices=("plain", "fasta", "stockholm", "phylip", "nexus",
                 "plain-fasta"),
        help="output format of multiple alignment [default=%default].")

    parser.add_option(
        "--with-ranges",
        dest="with_ranges",
        action="store_true",
        help=
        "output alignment ranges (suffix /from-to after identifier) [default=%default]."
    )

    parser.add_option(
        "--without-ranges",
        dest="with_ranges",
        action="store_false",
        help=
        "do not output alignment ranges (suffix /from-to after identifier) [default=%default]."
    )

    parser.add_option("-u",
                      "--allow-duplicates",
                      dest="allow_duplicates",
                      action="store_true",
                      help="permit duplicate entries [default=%default].")

    parser.add_option(
        "-m",
        "--method",
        dest="methods",
        type="string",
        help=
        """methods to apply. Several methods can be specified in a ','-separated list [default=%default]."""
    )

    parser.add_option(
        "-p",
        "--parameters",
        dest="parameters",
        type="string",
        help="parameter stack for methods that require one [default=%default]."
    )

    parser.add_option(
        "-a",
        "--mask-char",
        dest="mask_char",
        type="string",
        help="character to identify/set masked characters [default=%default].")

    parser.set_defaults(
        input_format="fasta",
        output_format="fasta",
        methods="",
        parameters="",
        mask_char="x",
        gap_chars="-.nN",
        with_ranges=True,
        allow_duplicates=False,
    )

    (options, args) = E.Start(parser)

    options.methods = options.methods.split(",")
    options.parameters = options.parameters.split(",")

    # 1. read multiple alignment in various formats
    if options.allow_duplicates:
        mali = Mali.SequenceCollection()
    else:
        mali = Mali.Mali()

    t1 = time.time()

    mali.readFromFile(options.stdin, format=options.input_format)

    E.info("read mali with %i entries in %i seconds." %
           (len(mali), time.time() - t1))

    if len(mali) == 0:
        raise ValueError("empty multiple alignment")

    for method in options.methods:

        t1 = time.time()

        if method == "remove-unaligned-ends":
            mali.removeUnalignedEnds()
        elif method == "remove-end-gaps":
            mali.removeEndGaps()
        elif method == "remove-all-gaps":
            mali.removeGaps(minimum_gaps=len(mali))
        elif method == "remove-any-gaps":
            mali.removeGaps(minimum_gaps=1)
        elif method == "remove-some-gaps":
            minimum_gaps = int(options.parameters[0])
            del options.parameters[0]
            mali.removeGaps(minimum_gaps=minimum_gaps)
        elif method == "remove-empty-sequences":
            mali.removeEmptySequences()
        elif method == "upper":
            mali.upperCase()
        elif method == "lower":
            mali.lowerCase()
        elif method == "mark-codons":
            mali.markCodons()
        elif method == "remove-stops":
            mali.removePattern(lambda x: x.upper() in ("TAG", "TAA", "TGA"),
                               allowed_matches=0,
                               minimum_matches=1,
                               delete_frame=3,
                               search_frame=3)
        elif method == "shift-alignment":
            map_id2offset = IOTools.ReadMap(open(options.parameters[0], "r"),
                                            map_functions=(str, int))
            del options.parameters[0]
            mali.shiftAlignment(map_id2offset)
        elif method == "propagate-masks":
            mali.propagateMasks(mask_char=options.mask_char)

        elif method == "recount":
            mali.recount()

        elif method in ("mark-transitions", "filter-odd-transitions",
                        "filter-even-transitions", "keep-even-segments",
                        "keep-odd-segments"):

            if os.path.exists(options.parameters[0]):
                map_id2transitions = IOTools.readMultiMap(
                    open(options.parameters[0], "r"), map_functions=(str, int))
            else:
                map_id2transitions = {}
                r = map(int, options.parameters[0].split(':'))
                r.sort()
                map_id2transitions["mali"] = r

            del options.parameters[0]
            if method == "mark-transitions":
                mali.markTransitions(map_id2transitions)
            elif method in ("filter-odd-transitions", "keep-even-segments"):
                mali.markTransitions(map_id2transitions, mode="keep-odd")
            elif method in ("filter-even-transitions", "keep-odd-segments"):
                mali.markTransitions(map_id2transitions, mode="keep-even")

        elif method == "propagate-transitions":
            mali.propagateTransitions()

        elif method == "map-annotation":
            # map annotations in one mali (stockholm-format) to the annotations in another.
            # Note: the first two sequence identifiers must be shared and the
            # sequences must be of the same length
            other_mali = Mali.Mali()
            other_mali.readFromFile(open(options.parameters[0], "r"),
                                    format="stockholm")
            del options.parameters[0]
            mali.copyAnnotations(other_mali)

        elif method == "add-annotation":
            annotation_type, annotation_file = options.parameters[:2]
            del options.parameters[:2]
            AddAnnotation(mali, annotation_type, annotation_file)

        elif method == "mask-columns":
            annotation_type, annotation_file = options.parameters[:2]
            del options.parameters[:2]
            maskColumns(mali, annotation_type, annotation_file)

        elif method == "remove-unaligned-pairs":
            removeUnalignedPairs(mali, options)

        elif method == "filter-3rd":
            filterMali(mali, "3rd")

        elif method == "filter-4d":
            filterMali(mali, "4d")

        elif method in ("mask-seg", "mask-bias"):
            a, b = method.split("-")
            maskMali(mali, b)

        elif method == "exclude-with-stop":
            mali.filter(method="with-stop")

        elif method == "exclude-with-stop":
            mali.filter(method="with-frameshift")

        E.info("applied method %s in %i seconds." % (method, time.time() - t1))

    mali.writeToFile(options.stdout,
                     format=options.output_format,
                     write_ranges=options.with_ranges)

    E.Stop()
Example #4
def main(argv=sys.argv):

    parser = E.OptionParser(
        version="%prog version: $Id: annotator_distance.py 2861 2010-02-23 17:36:32Z andreas $", usage=globals()["__doc__"])

    parser.add_option("-a", "--filename-annotations", dest="filename_annotations", type="string",
                      help="filename mapping gene ids to annotations (a tab-separated table with two-columns) [default=%default].")

    parser.add_option("-r", "--resolution", dest="resolution", type="int",
                      help="resolution of count vector [default=%default].")

    parser.add_option("-b", "--num-bins", dest="num_bins", type="int",
                      help="number of bins in count vector [default=%default].")

    parser.add_option("-i", "--num-samples", dest="num_samples", type="int",
                      help="sample size to compute [default=%default].")

    parser.add_option("-w", "--workspace", dest="filename_workspace", type="string",
                      help="filename with workspace information [default=%default].")

    parser.add_option("--workspace-builder", dest="workspace_builder", type="choice",
                      choices=(
                          "gff", "gtf-intergenic", "gtf-intronic", "gtf-genic"),
                      help="given a gff/gtf file build a workspace [default=%default].")

    parser.add_option("--workspace-labels", dest="workspace_labels", type="choice",
                      choices=("none", "direction", "annotation"),
                      help="labels to use for the workspace workspace [default=%default].")

    parser.add_option("--sampler", dest="sampler", type="choice",
                      choices=("permutation", "gaps"),
                      help="sampler to use. The sampler determines the null model of how segments are distributed in the workspace  [default=%default]")

    parser.add_option("--counter", dest="counters", type="choice", action="append",
                      choices=(
                          "transcription", "closest-distance", "all-distances"),
                      help="counter to use. The counter computes the quantity of interest [default=%default]")

    parser.add_option("--analysis", dest="analysis", type="choice", action="append",
                      choices=("proximity", "area-under-curve"),
                      help="analysis to perform [default=%default]")

    parser.add_option("--transform-counts", dest="transform_counts", type="choice",
                      choices=("raw", "cumulative"),
                      help="cumulate counts [default=%default].")

    parser.add_option("-s", "--segments", dest="filename_segments", type="string",
                      help="filename with segment information [default=%default].")

    parser.add_option("--xrange", dest="xrange", type="string",
                      help="xrange to plot [default=%default]")

    parser.add_option("-o", "--logscale", dest="logscale", type="string",
                      help="use logscale on x, y or xy [default=%default]")

    parser.add_option("-p", "--plot", dest="plot", action="store_true",
                      help="output plots [default=%default]")

    parser.add_option("--hardcopy", dest="hardcopy", type="string",
                      help="output hardcopies to file [default=%default]")

    parser.add_option("--no-fdr", dest="do_fdr", action="store_false",
                      help="do not compute FDR rates [default=%default]")

    parser.add_option("--segments-format", dest="segments_format", type="choice",
                      choices=("gtf", "bed"),
                      help="format of segments file [default=%default].")

    parser.add_option("--truncate", dest="truncate", action="store_true",
                      help="truncate segments extending beyond a workspace [default=%default]")

    parser.add_option("--remove-overhangs", dest="remove_overhangs", action="store_true",
                      help="remove segments extending beyond a workspace[default=%default]")

    parser.add_option("--keep-ambiguous", dest="keep_ambiguous", action="store_true",
                      help="keep segments extending to more than one workspace [default=%default]")

    parser.set_defaults(
        filename_annotations=None,
        filename_workspace="workspace.gff",
        filename_segments="FastDown.gtf",
        filename_annotations_gtf="../data/tg1_territories.gff",
        workspace_builder="gff",
        workspace_labels="none",
        sampler="permutation",
        truncate=False,
        num_bins=10000,
        num_samples=10,
        resolution=100,
        plot_samples=False,
        plot_envelope=True,
        counters=[],
        transform_counts="raw",
        xrange=None,
        plot=False,
        logscale=None,
        output_all=False,
        do_test=False,
        analysis=[],
        do_fdr=True,
        hardcopy="%s.png",
        segments_format="gtf",
        remove_overhangs=False,
    )

    (options, args) = E.Start(parser, argv=argv, add_output_options=True)

    ###########################################
    # setup options
    if options.sampler == "permutation":
        sampler = SamplerPermutation
    elif options.sampler == "gaps":
        sampler = SamplerGaps

    if options.xrange:
        options.xrange = map(float, options.xrange.split(","))

    if len(options.counters) == 0:
        raise ValueError("please specify at least one counter.")

    if len(options.analysis) == 0:
        raise ValueError("please specify at least one analysis.")

    if options.workspace_labels == "annotation" and not options.filename_annotations:
        raise ValueError(
            "please specify --filename-annotations is --workspace-labels=annotations.")

    ###########################################
    # read data
    if options.workspace_labels == "annotation":
        def constant_factory(value):
            return itertools.repeat(value).next

        def dicttype():
            return collections.defaultdict(constant_factory(("unknown",)))

        map_id2annotations = IOTools.readMultiMap(open(options.filename_annotations, "r"),
                                                  dtype=dicttype)
    else:
        map_id2annotations = {}

    workspace = readWorkspace(open(options.filename_workspace, "r"),
                              options.workspace_builder,
                              options.workspace_labels,
                              map_id2annotations)

    E.info("read workspace for %i contigs" % (len(workspace)))

    indexed_workspace = indexIntervals(workspace, with_values=True)
    segments = readSegments(open(options.filename_segments, "r"), indexed_workspace,
                            format=options.segments_format,
                            keep_ambiguous=options.keep_ambiguous,
                            truncate=options.truncate,
                            remove_overhangs=options.remove_overhangs)

    nsegments = 0
    for contig, vv in segments.iteritems():
        nsegments += len(vv)

    E.info("read %i segments for %i contigs" % (nsegments, len(workspace)))
    indexed_segments = indexIntervals(segments, with_values=False)

    if nsegments == 0:
        E.warn("no segments read - no computation done.")
        E.Stop()
        return

    # build labels
    labels = collections.defaultdict(int)
    for contig, vv in workspace.iteritems():
        for start, end, v in vv:
            for l in v[0]:
                labels[l] += 1
            for l in v[1]:
                labels[l] += 1

    E.info("found %i workspace labels" % len(labels))

    ###########################################
    # setup counting containers
    counters = []
    for cc in options.counters:

        if cc == "transcription":
            counter = CounterTranscription
        elif cc == "closest-distance":
            counter = CounterClosestDistance
        elif cc == "all-distances":
            counter = CounterAllDistances

        if nsegments < 256:
            dtype = numpy.uint8
        elif nsegments < 65536:
            dtype = numpy.uint16
        elif nsegments < 4294967296:
            dtype = numpy.uint32
        else:
            dtype = numpy.int

        E.debug("choosen dtype %s" % str(dtype))

        E.info("samples space is %i bases: %i bins at %i resolution" %
               (options.num_bins * options.resolution,
                options.num_bins,
                options.resolution,
                ))

        E.info("allocating counts: %i bytes (%i labels, %i samples, %i bins)" %
               (options.num_bins * len(labels) * dtype().itemsize * (options.num_samples + 1),
                len(labels),
                options.num_samples,
                options.num_bins,
                ))

        c = CountingResults(labels)
        c.mObservedCounts = counter(
            labels, options.num_bins, options.resolution, dtype=dtype)

        simulated_counts = []
        for x in range(options.num_samples):
            simulated_counts.append(
                counter(labels, options.num_bins, options.resolution, dtype=dtype))
        c.mSimulatedCounts = simulated_counts
        c.mName = c.mObservedCounts.mName

        counters.append(c)

        E.info("allocated memory successfully")

    segments_per_workspace = []
    segment_sizes = []
    segments_per_label = collections.defaultdict(int)
    workspaces_per_label = collections.defaultdict(int)

    ############################################
    # get observed and simulated counts
    nworkspaces, nempty_workspaces, nempty_contigs, nmiddle = 0, 0, 0, 0
    iteration2 = 0
    for contig, vv in workspace.iteritems():

        iteration2 += 1
        E.info("counting %i/%i: %s %i segments" %
               (iteration2,
                len(workspace),
                contig,
                len(vv)))

        if len(vv) == 0:
            continue

        iteration1 = 0
        for work_start, work_end, v in vv:

            left_labels, right_labels = v[0], v[1]

            iteration1 += 1

            # ignore empty segments
            if contig not in indexed_segments:
                nempty_contigs += 1
                continue

            r = indexed_segments[contig].find(work_start, work_end)
            segments_per_workspace.append(len(r))

            if not r:
                nempty_workspaces += 1
                continue

            # collect segments and stats
            nworkspaces += 1
            observed = [(x.start, x.end) for x in r]
            observed.sort()
            segment_sizes.extend([x[1] - x[0] for x in observed])

            # collect basic counts
            for label in list(left_labels) + list(right_labels):
                workspaces_per_label[label] += 1
                segments_per_label[label] += len(observed)

            # add observed counts
            for counter in counters:
                counter.mObservedCounts.addCounts(
                    observed, work_start, work_end, left_labels, right_labels)

            # create sampler
            s = sampler(observed, work_start, work_end)

            # add simulated counts
            for iteration in range(options.num_samples):
                simulated = s.sample()
                for counter in counters:
                    counter.mSimulatedCounts[iteration].addCounts(
                        simulated, work_start, work_end, left_labels, right_labels)

    E.info("counting finished")
    E.info("nworkspaces=%i, nmiddle=%i, nempty_workspaces=%i, nempty_contigs=%i" %
           (nworkspaces, nmiddle, nempty_workspaces, nempty_contigs))

    ######################################################
    # transform counts

    if options.transform_counts == "cumulative":
        transform = cumulative_transform
    elif options.transform_counts == "raw":
        transform = normalize_transform

    ####################################################
    # analysis

    if "proximity" in options.analysis:
        outfile_proximity = E.openOutputFile("proximity")
        outfile_proximity.write("\t".join(("label", "observed", "pvalue",
                                           "expected", "CIlower", "CIupper", "qvalue", "segments", "workspaces")) + "\n")
    else:
        outfile_proximity = None

    if "area-under-curve" in options.analysis:
        outfile_auc = E.openOutputFile("auc")
        outfile_auc.write("label\tobserved\texpected\tCIlower\tCIupper\n")
    else:
        outfile_auc = None

    # qvalue: expected false positives at p-value
    # qvalue = expected false positives / number of called positives
    if options.do_fdr:
        E.info("computing pvalues for fdr")
        for counter in counters:
            for label in labels:
                E.info("working on counter:%s label:%s" % (counter, label))

                # collect all P-Values of simulated results to compute FDR
                sim_pvalues = []
                medians = counter.getMedians(label)

                for median in medians:
                    pvalue = float(
                        scipy.stats.percentileofscore(medians, median)) / 100.0
                    sim_pvalues.append(pvalue)

        sim_pvalues.sort()
    else:
        sim_pvalues = []

    # compute observed p-values
    for counter in counters:
        counter.update()

    obs_pvalues = []
    for counter in counters:
        for label in labels:
            obs_pvalues.append(counter.mStats[label].pvalue)
        obs_pvalues.sort()

    # compute observed p-values
    if options.do_fdr:
        for counter in counters:
            counter.updateFDR(obs_pvalues, sim_pvalues)

    for counter in counters:

        outofbounds_sim, totals_sim = 0, 0
        outofbounds_obs, totals_obs = 0, 0
        for label in labels:
            for sample in range(options.num_samples):
                if counter.mSimulatedCounts[sample].mOutOfBounds[label]:
                    E.debug("out of bounds: sample %i, label %s, counts=%i" %
                            (sample, label, counter.mSimulatedCounts[sample].mOutOfBounds[label]))
                    outofbounds_sim += counter.mSimulatedCounts[
                        sample].mOutOfBounds[label]
                totals_sim += counter.mSimulatedCounts[sample].mTotals[label]

            outofbounds_obs += counter.mObservedCounts.mOutOfBounds[label]
            totals_obs += counter.mObservedCounts.mTotals[label]

        E.info("out of bounds observations: observed=%i/%i (%5.2f%%), simulations=%i/%i (%5.2f%%)" %
               (outofbounds_obs, totals_obs,
                100.0 * outofbounds_obs / totals_obs,
                outofbounds_sim, totals_sim,
                100.0 * outofbounds_sim / totals_sim,
                ))

        for label in labels:

            if outfile_auc:
                mmin, mmax, mmean = counter.getEnvelope(
                    label, transform=normalize_transform)
                obs = normalize_transform(
                    counter.mObservedCounts[label], counter.mObservedCounts.mOutOfBounds[label])

                def block_iterator(a1, a2, a3, num_bins):
                    x = 0
                    while x < num_bins:
                        while x < num_bins and a1[x] <= a2[x]:
                            x += 1
                        start = x
                        while x < num_bins and a1[x] > a2[x]:
                            x += 1
                        end = x
                        total_a1 = a1[start:end].sum()
                        total_a3 = a3[start:end].sum()
                        if total_a1 > total_a3:
                            yield (total_a1 - total_a3, start, end, total_a1, total_a3)

                blocks = list(
                    block_iterator(obs, mmax, mmean, options.num_bins))

                if options.output_all:
                    for delta, start, end, total_obs, total_mean in blocks:
                        if end - start <= 1:
                            continue
                        outfile_auc.write("%s\t%i\t%i\t%i\t%f\t%f\t%f\t%f\t%f\n" %
                                          (label,
                                           start * options.resolution,
                                           end * options.resolution,
                                           (end - start) * options.resolution,
                                           total_obs,
                                           total_mean,
                                           delta,
                                           total_obs / total_mean,
                                           100.0 * (total_obs / total_mean - 1.0)))

                # output best block
                blocks.sort()
                delta, start, end, total_obs, total_mean = blocks[-1]

                outfile_auc.write("%s\t%i\t%i\t%i\t%f\t%f\t%f\t%f\t%f\n" %
                                  (label,
                                   start * options.resolution,
                                   end * options.resolution,
                                   (end - start) * options.resolution,
                                   total_obs,
                                   total_mean,
                                   delta,
                                   total_obs / total_mean,
                                   100.0 * (total_obs / total_mean - 1.0)))

            if outfile_proximity:

                # find error bars at median
                st = counter.mStats[label]
                outfile_proximity.write("%s\t%i\t%f\t%i\t%i\t%i\t%s\t%i\t%i\n" %
                                        (label,
                                         st.observed *
                                         options.resolution,
                                         st.pvalue,
                                         st.expected *
                                         options.resolution,
                                         st.ci95lower *
                                         options.resolution,
                                         st.ci95upper *
                                         options.resolution,
                                         IOTools.prettyFloat(st.qvalue),
                                         segments_per_label[label],
                                         workspaces_per_label[label],
                                         ))

    if options.plot:

        for counter in counters:
            plotCounts(counter, options, transform)

        # plot summary stats
        plt.figure()
        plt.title("distribution of workspace length")
        data = []
        for contig, segs in workspace.iteritems():
            if len(segs) == 0:
                continue
            data.extend([x[1] - x[0] for x in segs])

        vals, bins = numpy.histogram(
            data, bins=numpy.arange(0, max(data), 100), new=True)

        t = float(sum(vals))
        plt.plot(bins[:-1], numpy.cumsum(vals) / t)
        plt.gca().set_xscale('log')
        plt.legend()
        t = float(sum(vals))
        plt.xlabel("size of workspace")
        plt.ylabel("cumulative relative frequency")
        if options.hardcopy:
            plt.savefig(
                os.path.expanduser(options.hardcopy % "workspace_size"))

        plt.figure()
        plt.title("segments per block")
        vals, bins = numpy.histogram(segments_per_workspace, bins=numpy.arange(
            0, max(segments_per_workspace), 1), new=True)
        plt.plot(bins[:-1], vals)
        plt.xlabel("segments per block")
        plt.ylabel("absolute frequency")
        if options.hardcopy:
            plt.savefig(
                os.path.expanduser(options.hardcopy % "segments_per_block"))

        plt.figure()
        plt.title("workspaces per label")
        plt.barh(
            range(0, len(labels)), [workspaces_per_label[x] for x in labels], height=0.5)
        plt.yticks(range(0, len(labels)), labels)
        plt.ylabel("workspaces per label")
        plt.xlabel("absolute frequency")
        plt.gca().set_xscale('log')

        if options.hardcopy:
            plt.savefig(
                os.path.expanduser(options.hardcopy % "workspaces_per_label"))

        plt.figure()
        plt.title("segments per label")
        plt.barh(range(0, len(labels)), [segments_per_label[x]
                 for x in labels], height=0.5)
        plt.yticks(range(0, len(labels)), labels)
        plt.ylabel("segments per label")
        plt.xlabel("absolute frequency")
        plt.xticks(range(0, len(labels)), labels)
        if options.hardcopy:
            plt.savefig(
                os.path.expanduser(options.hardcopy % "segments_per_label"))

        if not options.hardcopy:
            plt.show()

    E.Stop()
Example #5
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version=
        "%prog version: $Id: gff2annotator2tsv.py 2861 2010-02-23 17:36:32Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-g",
                      "--genome-file",
                      dest="genome_file",
                      type="string",
                      help="filename with genome.")

    parser.add_option("-f",
                      "--features",
                      dest="features",
                      type="string",
                      help="feature to collect [default=None].")

    parser.add_option("-i",
                      "--files",
                      dest="files",
                      action="append",
                      help="use multiple annotations [default=None].")

    parser.add_option(
        "-a",
        "--annotations",
        dest="annotations",
        type="string",
        help=
        "aggregate name for annotations if only single file is provided from STDIN [default=None]."
    )

    parser.add_option(
        "--input-filename-map",
        dest="input_filename_map",
        type="string",
        help="filename with a map of gene_ids to categories [default=None].")

    parser.add_option(
        "--output-filename-synonyms",
        dest="output_filename_synonyms",
        type="string",
        help=
        "output filename for synonyms. For workspace building, the gff source will be used as the id (instead of the contig) [default=None]."
    )

    parser.add_option("-m",
                      "--max-length",
                      dest="max_length",
                      type="string",
                      help="maximum segment length [default=None].")

    parser.add_option("-s",
                      "--section",
                      dest="section",
                      type="choice",
                      choices=("segments", "annotations", "annotations-genes",
                               "annotations-go", "workspace",
                               "annotations-gff"),
                      help="annotator section [default=None].")

    parser.add_option(
        "--subset",
        dest="subsets",
        type="string",
        action="append",
        help=
        "add filenames to delimit subsets within the gff files. The syntax is filename.gff,label,filename.ids [default=None]."
    )

    parser.add_option(
        "--remove-regex",
        dest="remove_regex",
        type="string",
        help="regular expression of contigs to remove [default=None].")

    parser.set_defaults(
        genome_file=None,
        feature=None,
        section="segments",
        annotations="annotations",
        max_length=100000,
        files=[],
        subsets=[],
        input_filename_map=None,
        output_filename_synonyms=None,
        input_format="gff",
        remove_regex=None,
    )

    (options, args) = E.Start(parser)

    options.files += args
    if len(options.files) == 0:
        options.files.append("-")
    options.files = list(
        itertools.chain(*[re.split("[,; ]+", x) for x in options.files]))

    if options.subsets:
        subsets = collections.defaultdict(list)
        for s in options.subsets:
            filename_gff, label, filename_ids = s.split(",")
            subsets[filename_gff].append((label, filename_ids))
        options.subsets = subsets

    if options.genome_file:
        fasta = IndexedFasta.IndexedFasta(options.genome_file)
    else:
        fasta = None

    if options.section == "segments":
        prefix = "##Segs"
    elif options.section.startswith("annotations"):
        prefix = "##Id"
    elif options.section == "workspace":
        prefix = "##Work"
    else:
        raise ValueError("unknown section %s" % options.section)

    ninput, ncontigs, nsegments, ndiscarded = 0, 0, 0, 0

    if options.remove_regex:
        options.remove_regex = re.compile(options.remove_regex)

    if options.section in ("segments", "workspace"):

        iterator = GTF.iterator_filtered(GFF.iterator(options.stdin),
                                         feature=options.feature)

        if options.output_filename_synonyms:
            outfile_synonyms = open(options.output_filename_synonyms, "w")
            with_records = True
        else:
            outfile_synonyms = None
            with_records = False

        intervals = GTF.readAsIntervals(iterator, with_records=with_records)
        ninput, nsegments, ndiscarded, ncontigs = \
            PipelineEnrichment.outputSegments(options.stdout,
                                              intervals,
                                              options.section,
                                              outfile_synonyms=outfile_synonyms,
                                              max_length=options.max_length,
                                              remove_regex=options.remove_regex)

        if outfile_synonyms:
            outfile_synonyms.close()

    elif options.section == "annotations-go":

        assert options.input_filename_map, "please supply option --input-filename-map"

        iterator = GTF.iterator_filtered(GTF.iterator(options.stdin),
                                         feature=options.feature)

        geneid2categories = IOTools.readMultiMap(
            open(options.input_filename_map, "r"))

        category2segments = collections.defaultdict(list)

        for contig, gffs in GTF.readAsIntervals(iterator,
                                                with_gene_id=True).items():
            if options.remove_regex and options.remove_regex.search(contig):
                continue

            for start, end, geneid in gffs:
                if geneid not in geneid2categories:
                    continue
                for category in geneid2categories[geneid]:
                    category2segments[category].append(nsegments)

                options.stdout.write("%s\t%i\t%s\t(%i,%i)\n" %
                                     (prefix, nsegments, contig, start, end))
                nsegments += 1

        for category, segments in category2segments.iteritems():
            options.stdout.write(
                "##Ann\t%s\t%s\n" %
                (category, "\t".join(["%i" % x for x in segments])))
            E.info("set %s annotated with %i segments" %
                   (category, len(segments)))

    elif options.section == "annotations":

        for filename in options.files:

            E.info("adding filename %s" % filename)

            start = nsegments
            is_gtf = False

            if filename == "-":
                iterator = GTF.iterator_filtered(GFF.iterator(sys.stdin),
                                                 feature=options.feature)
                filename = options.annotations
            elif filename.endswith(".gtf"):
                is_gtf = True
                with open(filename, "r") as infile:
                    iterator = GTF.iterator_filtered(GTF.iterator(infile),
                                                     feature=options.feature)

            else:
                with open(filename, "r") as infile:
                    iterator = GTF.iterator_filtered(GFF.iterator(infile),
                                                     feature=options.feature)

            E.debug("processing %s" % (filename))

            if not options.subsets or filename not in options.subsets:
                for contig, gffs in GTF.readAsIntervals(iterator).items():
                    if options.remove_regex and options.remove_regex.search(
                            contig):
                        continue

                    for x in gffs:
                        options.stdout.write(
                            "%s\t%i\t%s\t(%i,%i)\n" %
                            (prefix, nsegments, contig, x[0], x[1]))
                        nsegments += 1

                options.stdout.write("##Ann\t%s\t%s\n" % (filename, "\t".join(
                    ["%i" % x for x in range(start, nsegments)])))
                E.info("set %s annotated with %i segments" %
                       (filename, nsegments - start))

            else:
                raise ValueError("don't know how to filter %s" % filename)

    elif options.section == "annotations-gff":

        for filename in options.files:
            if filename == "-":
                iterator = GTF.iterator(sys.stdin)
            else:
                iterator = GTF.iterator_filtered(
                    GFF.iterator(open(filename, "r")))

            segments = collections.defaultdict(list)
            for gff in iterator:
                segments[":".join((gff.source, gff.feature))].append(
                    (gff.contig, gff.start, gff.end))

            feature2segments = {}

            for feature, s in segments.iteritems():
                s.sort()

                s1 = nsegments

                for contig, start, end in s:
                    if options.remove_regex and options.remove_regex.search(
                            contig):
                        continue

                    options.stdout.write(
                        "%s\t%i\t%s\t(%i,%i)\n" %
                        (prefix, nsegments, contig, start, end))
                    nsegments += 1

                feature2segments[feature] = (s1, nsegments)

        for feature, id_range in feature2segments.iteritems():
            start, end = id_range
            options.stdout.write(
                "##Ann\t%s\t%s\n" %
                (feature, "\t".join(["%i" % x for x in xrange(start, end)])))
            E.info("set %s annotated with %i segments" %
                   (feature, end - start))

    elif options.section == "annotations-genes":

        for filename in options.files:

            E.info("adding filename %s" % filename)

            start = nsegments

            assert filename.endswith(".gtf") or filename.endswith(".gtf.gz"), \
                "requiring .gtf files for gene list filtering, received %s" % filename

            infile = IOTools.openFile(filename)
            iterator = GTF.iterator_filtered(GTF.iterator(infile),
                                             feature=options.feature)

            E.debug("processing %s" % (filename))

            if not options.subsets or filename not in options.subsets:
                # output all
                for contig, gffs in GTF.readAsIntervals(iterator).items():
                    if options.remove_regex and options.remove_regex.search(
                            contig):
                        continue

                    for x in gffs:
                        options.stdout.write(
                            "%s\t%i\t%s\t(%i,%i)\n" %
                            (prefix, nsegments, contig, x[0], x[1]))
                        nsegments += 1

                options.stdout.write("##Ann\t%s\t%s\n" % (filename, "\t".join(
                    ["%i" % x for x in range(start, nsegments)])))
                E.info("set %s annotated with %i segments" %
                       (filename, nsegments - start))

            else:
                # create subsets
                E.debug("applying subsets for %s" % filename)
                geneid2label, label2segments = collections.defaultdict(
                    list), {}
                for label, filename_ids in options.subsets[filename]:
                    gene_ids = IOTools.readList(open(filename_ids, "r"))
                    for gene_id in gene_ids:
                        geneid2label[gene_id].append(label)
                    label2segments[label] = []

                for contig, gffs in GTF.readAsIntervals(
                        iterator, with_gene_id=True).items():

                    if options.remove_regex and options.remove_regex.search(
                            contig):
                        continue

                    for start, end, gene_id in gffs:
                        if gene_id not in geneid2label:
                            continue
                        for label in geneid2label[gene_id]:
                            label2segments[label].append(nsegments)

                        options.stdout.write(
                            "%s\t%i\t%s\t(%i,%i)\n" %
                            (prefix, nsegments, contig, start, end))
                        nsegments += 1

                for label, segments in label2segments.iteritems():
                    options.stdout.write(
                        "##Ann\t%s\t%s\n" %
                        (label, "\t".join(["%i" % x for x in segments])))
                    E.info("set %s (%s) annotated with %i segments" %
                           (label, filename, len(segments)))

    E.info("ninput=%i, ncontigs=%i, nsegments=%i, ndiscarded=%i" %
           (ninput, ncontigs, nsegments, ndiscarded))

    E.Stop()
Example #6
def main(argv=sys.argv):

    parser = E.OptionParser(
        version=
        "%prog version: $Id: annotator_distance.py 2861 2010-02-23 17:36:32Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option(
        "-a",
        "--annotations-tsv-file",
        dest="filename_annotations",
        type="string",
        help=
        "filename mapping gene ids to annotations (a tab-separated table with two-columns) [default=%default]."
    )

    parser.add_option("-r",
                      "--resolution",
                      dest="resolution",
                      type="int",
                      help="resolution of count vector [default=%default].")

    parser.add_option(
        "-b",
        "--num-bins",
        dest="num_bins",
        type="int",
        help="number of bins in count vector [default=%default].")

    parser.add_option("-i",
                      "--num-samples",
                      dest="num_samples",
                      type="int",
                      help="sample size to compute [default=%default].")

    parser.add_option(
        "-w",
        "--workspace-bed-file",
        dest="filename_workspace",
        type="string",
        help="filename with workspace information [default=%default].")

    parser.add_option(
        "--workspace-builder",
        dest="workspace_builder",
        type="choice",
        choices=("gff", "gtf-intergenic", "gtf-intronic", "gtf-genic"),
        help="given a gff/gtf file build a workspace [default=%default].")

    parser.add_option(
        "--workspace-labels",
        dest="workspace_labels",
        type="choice",
        choices=("none", "direction", "annotation"),
        help="labels to use for the workspace workspace [default=%default].")

    parser.add_option(
        "--sampler",
        dest="sampler",
        type="choice",
        choices=("permutation", "gaps"),
        help=
        "sampler to use. The sampler determines the null model of how segments are distributed in the workspace  [default=%default]"
    )

    parser.add_option(
        "--counter",
        dest="counters",
        type="choice",
        action="append",
        choices=("transcription", "closest-distance", "all-distances"),
        help=
        "counter to use. The counter computes the quantity of interest [default=%default]"
    )

    parser.add_option("--analysis",
                      dest="analysis",
                      type="choice",
                      action="append",
                      choices=("proximity", "area-under-curve"),
                      help="analysis to perform [default=%default]")

    parser.add_option("--transform-counts",
                      dest="transform_counts",
                      type="choice",
                      choices=("raw", "cumulative"),
                      help="cumulate counts [default=%default].")

    parser.add_option(
        "-s",
        "--segments",
        dest="filename_segments",
        type="string",
        help="filename with segment information [default=%default].")

    parser.add_option("--xrange",
                      dest="xrange",
                      type="string",
                      help="xrange to plot [default=%default]")

    parser.add_option("-o",
                      "--logscale",
                      dest="logscale",
                      type="string",
                      help="use logscale on x, y or xy [default=%default]")

    parser.add_option("-p",
                      "--plot",
                      dest="plot",
                      action="store_true",
                      help="output plots [default=%default]")

    parser.add_option("--hardcopy",
                      dest="hardcopy",
                      type="string",
                      help="output hardcopies to file [default=%default]")

    parser.add_option("--no-fdr",
                      dest="do_fdr",
                      action="store_false",
                      help="do not compute FDR rates [default=%default]")

    parser.add_option("--segments-format",
                      dest="segments_format",
                      type="choice",
                      choices=("gtf", "bed"),
                      help="format of segments file [default=%default].")

    parser.add_option(
        "--truncate",
        dest="truncate",
        action="store_true",
        help="truncate segments extending beyond a workspace [default=%default]"
    )

    parser.add_option(
        "--remove-overhangs",
        dest="remove_overhangs",
        action="store_true",
        help="remove segments extending beyond a workspace[default=%default]")

    parser.add_option(
        "--keep-ambiguous",
        dest="keep_ambiguous",
        action="store_true",
        help=
        "keep segments extending to more than one workspace [default=%default]"
    )

    parser.set_defaults(
        filename_annotations=None,
        filename_workspace="workspace.gff",
        filename_segments="FastDown.gtf",
        filename_annotations_gtf="../data/tg1_territories.gff",
        workspace_builder="gff",
        workspace_labels="none",
        sampler="permutation",
        truncate=False,
        num_bins=10000,
        num_samples=10,
        resolution=100,
        plot_samples=False,
        plot_envelope=True,
        counters=[],
        transform_counts="raw",
        xrange=None,
        plot=False,
        logscale=None,
        output_all=False,
        do_test=False,
        analysis=[],
        do_fdr=True,
        hardcopy="%s.png",
        segments_format="gtf",
        remove_overhangs=False,
    )

    (options, args) = E.Start(parser, argv=argv, add_output_options=True)

    ###########################################
    # setup options
    if options.sampler == "permutation":
        sampler = SamplerPermutation
    elif options.sampler == "gaps":
        sampler = SamplerGaps

    if options.xrange:
        options.xrange = map(float, options.xrange.split(","))

    if len(options.counters) == 0:
        raise ValueError("please specify at least one counter.")

    if len(options.analysis) == 0:
        raise ValueError("please specify at least one analysis.")

    if options.workspace_labels == "annotation" and not options.filename_annotations:
        raise ValueError(
            "please specify --annotations-tsv-file is --workspace-labels=annotations."
        )

    ###########################################
    # read data
    if options.workspace_labels == "annotation":

        def constant_factory(value):
            return itertools.repeat(value).next

        def dicttype():
            return collections.defaultdict(constant_factory(("unknown", )))

        map_id2annotations = IOTools.readMultiMap(open(
            options.filename_annotations, "r"),
                                                  dtype=dicttype)
    else:
        map_id2annotations = {}

    workspace = readWorkspace(
        open(options.filename_workspace, "r"),
        options.workspace_builder,
        options.workspace_labels,
        map_id2annotations)

    E.info("read workspace for %i contigs" % (len(workspace)))

    indexed_workspace = indexIntervals(workspace, with_values=True)
    segments = readSegments(open(options.filename_segments, "r"),
                            indexed_workspace,
                            format=options.segments_format,
                            keep_ambiguous=options.keep_ambiguous,
                            truncate=options.truncate,
                            remove_overhangs=options.remove_overhangs)

    nsegments = 0
    for contig, vv in segments.iteritems():
        nsegments += len(vv)

    E.info("read %i segments for %i contigs" % (nsegments, len(workspace)))
    indexed_segments = indexIntervals(segments, with_values=False)

    if nsegments == 0:
        E.warn("no segments read - no computation done.")
        E.Stop()
        return

    # build labels
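    # (v[0] and v[1] hold the labels on the left and right side of each
    # workspace interval)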
    labels = collections.defaultdict(int)
    for contig, vv in workspace.iteritems():
        for start, end, v in vv:
            for l in v[0]:
                labels[l] += 1
            for l in v[1]:
                labels[l] += 1

    E.info("found %i workspace labels" % len(labels))

    ###########################################
    # setup counting containers
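    # each CountingResults bundles one observed counter and num_samples
    # simulated counters of the same shape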
    counters = []
    for cc in options.counters:

        if cc == "transcription":
            counter = CounterTranscription
        elif cc == "closest-distance":
            counter = CounterClosestDistance
        elif cc == "all-distances":
            counter = CounterAllDistances

        # choose the smallest unsigned integer type that can hold the
        # per-bin counts; a bin can never accumulate more than nsegments
        if nsegments < 256:
            dtype = numpy.uint8
        elif nsegments < 65536:
            dtype = numpy.uint16
        elif nsegments < 4294967296:
            dtype = numpy.uint32
        else:
            dtype = numpy.uint64

        E.debug("choosen dtype %s" % str(dtype))

        E.info("samples space is %i bases: %i bins at %i resolution" % (
            options.num_bins * options.resolution,
            options.num_bins,
            options.resolution,
        ))

        E.info("allocating counts: %i bytes (%i labels, %i samples, %i bins)" %
               (
                   options.num_bins * len(labels) * dtype().itemsize *
                   (options.num_samples + 1),
                   len(labels),
                   options.num_samples,
                   options.num_bins,
               ))

        c = CountingResults(labels)
        c.mObservedCounts = counter(labels,
                                    options.num_bins,
                                    options.resolution,
                                    dtype=dtype)

        simulated_counts = []
        for x in range(options.num_samples):
            simulated_counts.append(
                counter(labels,
                        options.num_bins,
                        options.resolution,
                        dtype=dtype))
        c.mSimulatedCounts = simulated_counts
        c.mName = c.mObservedCounts.mName

        counters.append(c)

        E.info("allocated memory successfully")

    segments_per_workspace = []
    segment_sizes = []
    segments_per_label = collections.defaultdict(int)
    workspaces_per_label = collections.defaultdict(int)

    ############################################
    # get observed and simulated counts
    nworkspaces, nempty_workspaces, nempty_contigs, nmiddle = 0, 0, 0, 0
    iteration2 = 0
    for contig, vv in workspace.iteritems():

        iteration2 += 1
        E.info("counting %i/%i: %s %i segments" %
               (iteration2, len(workspace), contig, len(vv)))

        if len(vv) == 0:
            continue

        iteration1 = 0
        for work_start, work_end, v in vv:

            left_labels, right_labels = v[0], v[1]

            iteration1 += 1

            # skip contigs without any segments
            if contig not in indexed_segments:
                nempty_contigs += 1
                continue

            r = indexed_segments[contig].find(work_start, work_end)
            segments_per_workspace.append(len(r))

            if not r:
                nempty_workspaces += 1
                continue

            # collect segments and stats
            nworkspaces += 1
            observed = [(x.start, x.end) for x in r]
            observed.sort()
            segment_sizes.extend([x[1] - x[0] for x in observed])

            # collect basic counts
            for label in list(left_labels) + list(right_labels):
                workspaces_per_label[label] += 1
                segments_per_label[label] += len(observed)

            # add observed counts
            for counter in counters:
                counter.mObservedCounts.addCounts(observed, work_start,
                                                  work_end, left_labels,
                                                  right_labels)

            # create sampler
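            # (each call to s.sample() presumably returns one randomized
            # placement of the observed segments within the workspace)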
            s = sampler(observed, work_start, work_end)

            # add simulated counts
            for iteration in range(options.num_samples):
                simulated = s.sample()
                for counter in counters:
                    counter.mSimulatedCounts[iteration].addCounts(
                        simulated, work_start, work_end, left_labels,
                        right_labels)

    E.info("counting finished")
    E.info(
        "nworkspaces=%i, nmiddle=%i, nempty_workspaces=%i, nempty_contigs=%i" %
        (nworkspaces, nmiddle, nempty_workspaces, nempty_contigs))

    ######################################################
    # transform counts
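    # (the transform is applied when plotting; note that "raw" counts are
    # still normalized before plotting)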

    if options.transform_counts == "cumulative":
        transform = cumulative_transform
    elif options.transform_counts == "raw":
        transform = normalize_transform

    ####################################################
    # analysis

    if "proximity" in options.analysis:
        outfile_proximity = E.openOutputFile("proximity")
        outfile_proximity.write("\t".join(
            ("label", "observed", "pvalue", "expected", "CIlower", "CIupper",
             "qvalue", "segments", "workspaces")) + "\n")
    else:
        outfile_proximity = None

    if "area-under-curve" in options.analysis:
        outfile_auc = E.openOutputFile("auc")
        outfile_auc.write("label\tobserved\texpected\tCIlower\tCIupper\n")
    else:
        outfile_auc = None

    # qvalue: expected proportion of false positives among the results
    # called significant at a given p-value, estimated from the p-values
    # of the simulated data
    if options.do_fdr:
        E.info("computing pvalues for fdr")
        # collect the p-values of all simulated results across counters
        # and labels to compute the FDR
        sim_pvalues = []
        for counter in counters:
            for label in labels:
                E.info("working on counter:%s label:%s" % (counter, label))

                medians = counter.getMedians(label)
                for median in medians:
                    pvalue = float(
                        scipy.stats.percentileofscore(medians, median)) / 100.0
                    sim_pvalues.append(pvalue)

        sim_pvalues.sort()
    else:
        sim_pvalues = []

    # compute observed p-values
    for counter in counters:
        counter.update()

    obs_pvalues = []
    for counter in counters:
        for label in labels:
            obs_pvalues.append(counter.mStats[label].pvalue)
    obs_pvalues.sort()

    # compute q-values from the observed and simulated p-values
    if options.do_fdr:
        for counter in counters:
            counter.updateFDR(obs_pvalues, sim_pvalues)

    for counter in counters:

        outofbounds_sim, totals_sim = 0, 0
        outofbounds_obs, totals_obs = 0, 0
        for label in labels:
            for sample in range(options.num_samples):
                if counter.mSimulatedCounts[sample].mOutOfBounds[label]:
                    E.debug(
                        "out of bounds: sample %i, label %s, counts=%i" %
                        (sample, label,
                         counter.mSimulatedCounts[sample].mOutOfBounds[label]))
                    outofbounds_sim += counter.mSimulatedCounts[
                        sample].mOutOfBounds[label]
                totals_sim += counter.mSimulatedCounts[sample].mTotals[label]

            outofbounds_obs += counter.mObservedCounts.mOutOfBounds[label]
            totals_obs += counter.mObservedCounts.mTotals[label]

        E.info(
            "out of bounds observations: observed=%i/%i (%5.2f%%), simulations=%i/%i (%5.2f%%)"
            % (
                outofbounds_obs,
                totals_obs,
                100.0 * outofbounds_obs / totals_obs,
                outofbounds_sim,
                totals_sim,
                100.0 * outofbounds_sim / totals_sim,
            ))

        for label in labels:

            if outfile_auc:
                # mmin/mmax/mmean form the envelope of the simulations
                mmin, mmax, mmean = counter.getEnvelope(
                    label, transform=normalize_transform)
                obs = normalize_transform(
                    counter.mObservedCounts[label],
                    counter.mObservedCounts.mOutOfBounds[label])

                def block_iterator(a1, a2, a3, num_bins):
                    """yield maximal runs of bins in which a1 exceeds a2,
                    reporting the excess of a1 over a3 within each run."""
                    x = 0
                    while x < num_bins:
                        while x < num_bins and a1[x] <= a2[x]:
                            x += 1
                        start = x
                        while x < num_bins and a1[x] > a2[x]:
                            x += 1
                        end = x
                        total_a1 = a1[start:end].sum()
                        total_a3 = a3[start:end].sum()
                        if total_a1 > total_a3:
                            yield (total_a1 - total_a3, start, end, total_a1,
                                   total_a3)

                blocks = list(
                    block_iterator(obs, mmax, mmean, options.num_bins))

                if options.output_all:
                    for delta, start, end, total_obs, total_mean in blocks:
                        if end - start <= 1:
                            continue
                        outfile_auc.write(
                            "%s\t%i\t%i\t%i\t%f\t%f\t%f\t%f\t%f\n" %
                            (label, start * options.resolution,
                             end * options.resolution,
                             (end - start) * options.resolution, total_obs,
                             total_mean, delta, total_obs / total_mean, 100.0 *
                             (total_obs / total_mean - 1.0)))

                # output best block
                if blocks:
                    blocks.sort()
                    delta, start, end, total_obs, total_mean = blocks[-1]

                    outfile_auc.write(
                        "%s\t%i\t%i\t%i\t%f\t%f\t%f\t%f\t%f\n" %
                        (label, start * options.resolution,
                         end * options.resolution,
                         (end - start) * options.resolution, total_obs,
                         total_mean, delta, total_obs / total_mean, 100.0 *
                         (total_obs / total_mean - 1.0)))

            if outfile_proximity:

                # find error bars at median
                st = counter.mStats[label]
                outfile_proximity.write(
                    "%s\t%i\t%f\t%i\t%i\t%i\t%s\t%i\t%i\n" % (
                        label,
                        st.observed * options.resolution,
                        st.pvalue,
                        st.expected * options.resolution,
                        st.ci95lower * options.resolution,
                        st.ci95upper * options.resolution,
                        IOTools.val2str(st.qvalue),
                        segments_per_label[label],
                        workspaces_per_label[label],
                    ))

    if options.plot:

        for counter in counters:
            plotCounts(counter, options, transform)

        # plot summary stats
        plt.figure()
        plt.title("distribution of workspace length")
        data = []
        for contig, segs in workspace.iteritems():
            if len(segs) == 0:
                continue
            data.extend([x[1] - x[0] for x in segs])

        vals, bins = numpy.histogram(data,
                                     bins=numpy.arange(0, max(data), 100))

        t = float(sum(vals))
        plt.plot(bins[:-1], numpy.cumsum(vals) / t)
        plt.gca().set_xscale('log')
        plt.legend()
        plt.xlabel("size of workspace")
        plt.ylabel("cumulative relative frequency")
        if options.hardcopy:
            plt.savefig(os.path.expanduser(options.hardcopy %
                                           "workspace_size"))

        plt.figure()
        plt.title("segments per block")
        vals, bins = numpy.histogram(segments_per_workspace,
                                     bins=numpy.arange(
                                         0, max(segments_per_workspace), 1))
        plt.plot(bins[:-1], vals)
        plt.xlabel("segments per block")
        plt.ylabel("absolute frequency")
        if options.hardcopy:
            plt.savefig(
                os.path.expanduser(options.hardcopy % "segments_per_block"))

        plt.figure()
        plt.title("workspaces per label")
        plt.barh(range(0, len(labels)),
                 [workspaces_per_label[x] for x in labels],
                 height=0.5)
        plt.yticks(range(0, len(labels)), labels)
        plt.ylabel("workspaces per label")
        plt.xlabel("absolute frequency")
        plt.gca().set_xscale('log')

        if options.hardcopy:
            plt.savefig(
                os.path.expanduser(options.hardcopy % "workspaces_per_label"))

        plt.figure()
        plt.title("segments per label")
        plt.barh(range(0, len(labels)),
                 [segments_per_label[x] for x in labels],
                 height=0.5)
        plt.yticks(range(0, len(labels)), labels)
        plt.ylabel("segments per label")
        plt.xlabel("absolute frequency")
        plt.xticks(range(0, len(labels)), labels)
        if options.hardcopy:
            plt.savefig(
                os.path.expanduser(options.hardcopy % "segments_per_label"))

        if not options.hardcopy:
            plt.show()

    E.Stop()

def main( argv = None ):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None: argv = sys.argv

    parser = E.OptionParser( version = "%prog version: $Id: gff2annotator2tsv.py 2861 2010-02-23 17:36:32Z andreas $", usage = globals()["__doc__"])

        
    parser.add_option( "-g", "--genome-file", dest="genome_file", type="string",
                       help="filename with genome."  )

    parser.add_option( "-f", "--features", dest="features", type="string", 
                       help="feature to collect [default=None]."  )

    parser.add_option( "-i", "--files", dest="files", action="append",
                       help="use multiple annotations [default=None]."  )

    parser.add_option(  "-a", "--annotations", dest="annotations", type="string", 
                       help="aggregate name for annotations if only single file is provided from STDIN [default=None]."  )

    parser.add_option(  "--input-filename-map", dest="input_filename_map", type="string", 
                       help="filename with a map of gene_ids to categories [default=None]."  )

    parser.add_option(  "--output-filename-synonyms", dest="output_filename_synonyms", type="string", 
                       help="output filename for synonyms. For workspace building, the gff source will be used as the id (instead of the contig) [default=None]."  )

    parser.add_option( "-m", "--max-length", dest="max_length", type="string", 
                       help="maximum segment length [default=None]."  )

    parser.add_option( "-s", "--section", dest="section", type="choice", 
                       choices=("segments", "annotations", "annotations-genes", "annotations-go", "workspace", "annotations-gff" ),
                       help="annotator section [default=None]."  )

    parser.add_option( "--subset", dest="subsets", type="string", action="append",
                       help="add filenames to delimit subsets within the gff files. The syntax is filename.gff,label,filename.ids [default=None]."  )

    parser.add_option( "--remove-regex", dest="remove_regex", type="string", 
                       help="regular expression of contigs to remove [default=None]."  )

    parser.set_defaults(
        genome_file = None,
        feature = None,
        section = "segments",
        annotations = "annotations",
        max_length = 100000,
        files = [],
        subsets = [],
        input_filename_map = None,
        output_filename_synonyms = None,
        input_format = "gff",
        remove_regex = None,
        )

    (options, args) = E.Start( parser )

    options.files += args
    if len(options.files) == 0: options.files.append("-")
    options.files = list( itertools.chain( *[ re.split( "[,; ]+", x) for x in options.files ] ) )

    if options.subsets:
        subsets = collections.defaultdict( list )
        for s in options.subsets: 
            filename_gff,label,filename_ids = s.split( "," )
            subsets[filename_gff].append( (label,filename_ids) )
        options.subsets = subsets

    if options.genome_file:
        fasta = IndexedFasta.IndexedFasta( options.genome_file )
    else:
        fasta = None
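
    # `prefix` selects the annotator section written below; every segment
    # line has the form "<prefix>\t<id>\t<contig>\t(start,end)", and "##Ann"
    # lines list the segment ids belonging to each label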

    if options.section == "segments":
        prefix = "##Segs"
    elif options.section.startswith( "annotations" ):
        prefix = "##Id"
    elif options.section == "workspace":
        prefix = "##Work"
    else:
        raise ValueError("unknown section %s" % options.section)
        
    ninput, ncontigs, nsegments, ndiscarded = 0, 0, 0, 0

    if options.remove_regex:
        options.remove_regex = re.compile( options.remove_regex )

    if options.section in ("segments", "workspace"):

        iterator = GTF.iterator_filtered( GFF.iterator( options.stdin ),
                                          feature=options.feature )

        if options.output_filename_synonyms:
            outfile_synonyms = open(options.output_filename_synonyms, "w")
            with_records = True
        else:
            outfile_synonyms = None
            with_records = False

        # records are needed so that the gff source field can be used as the
        # id when writing synonyms
        intervals = GTF.readAsIntervals( iterator, with_records = with_records )
        ninput, nsegments, ndiscarded, ncontigs = \
            PipelineEnrichment.outputSegments( options.stdout,
                                               intervals,
                                               options.section,
                                               outfile_synonyms = outfile_synonyms,
                                               max_length = options.max_length,
                                               remove_regex = options.remove_regex )
            
        if outfile_synonyms:
            outfile_synonyms.close()

    elif options.section == "annotations-go":

        assert options.input_filename_map, "please supply option --input-filename-map" 

        iterator = GTF.iterator_filtered( GTF.iterator( options.stdin ),
                                          feature=options.feature )

        geneid2categories = IOTools.readMultiMap( open( options.input_filename_map, "r") )

        category2segments = collections.defaultdict( list )

        for contig, gffs in GTF.readAsIntervals( iterator, with_gene_id = True ).items():
            if options.remove_regex and options.remove_regex.search( contig ): continue
            
            for start, end, geneid in gffs:
                if geneid not in geneid2categories: continue
                for category in geneid2categories[geneid]:
                    category2segments[category].append(nsegments)

                options.stdout.write( "%s\t%i\t%s\t(%i,%i)\n" % (prefix, nsegments, contig, start, end ) )
                nsegments += 1                        
            
        for category, segments in category2segments.iteritems():
            options.stdout.write("##Ann\t%s\t%s\n" % (category, "\t".join( ["%i" % x for x in segments ] ) ) )
            E.info( "set %s annotated with %i segments" % (category, len(segments)) )

    elif options.section == "annotations":

        for filename in options.files:

            E.info( "adding filename %s" % filename )

            start = nsegments
            is_gtf = False

            if filename == "-":
                iterator = GTF.iterator_filtered( GFF.iterator( sys.stdin ),
                                                  feature=options.feature )
                filename = options.annotations
            elif filename.endswith(".gtf"):
                is_gtf = True
                with open( filename, "r") as infile:
                    iterator = GTF.iterator_filtered( GTF.iterator( infile ),
                                                      feature=options.feature )
                
            else:
                with open( filename, "r") as infile:
                    iterator = GTF.iterator_filtered( GFF.iterator( infile ),
                                                      feature=options.feature )
           
            E.debug("processing %s" % (filename))

            if not options.subsets or filename not in options.subsets:
                for contig, gffs in GTF.readAsIntervals( iterator ).items():
                    if options.remove_regex and options.remove_regex.search( contig ): continue

                    for x in gffs:
                        options.stdout.write( "%s\t%i\t%s\t(%i,%i)\n" % (prefix, nsegments, contig, x[0], x[1] ) )
                        nsegments += 1

                options.stdout.write("##Ann\t%s\t%s\n" % (filename, "\t".join( ["%i" % x for x in range(start, nsegments) ] ) ) )
                E.info( "set %s annotated with %i segments" % (filename, nsegments - start) )

            else:
                raise ValueError("don't know how to filter %s" % filename )

    elif options.section == "annotations-gff":

        feature2segments = {}

        for filename in options.files:
            if filename == "-":
                iterator = GTF.iterator( sys.stdin )
            else:
                iterator = GTF.iterator_filtered( GFF.iterator( open( filename, "r") ) )

            # group intervals by "source:feature"; each combination becomes
            # one annotation set in the output
            segments = collections.defaultdict( list )
            for gff in iterator:
                segments[":".join((gff.source,gff.feature))].append( (gff.contig,gff.start, gff.end) )

            for feature, s in segments.iteritems():
                s.sort()

                s1 = nsegments

                for contig, start, end in s:
                    if options.remove_regex and options.remove_regex.search( contig ): continue

                    options.stdout.write( "%s\t%i\t%s\t(%i,%i)\n" % (prefix, nsegments, contig, start, end ) )
                    nsegments += 1

                feature2segments[feature] = (s1, nsegments)
            
        for feature, id_range in feature2segments.iteritems():
            start, end = id_range
            options.stdout.write("##Ann\t%s\t%s\n" % (feature, "\t".join( ["%i" % x for x in xrange( start,end) ] ) ) )
            E.info( "set %s annotated with %i segments" % (feature, end-start) )

    elif options.section == "annotations-genes":

        for filename in options.files:

            E.info( "adding filename %s" % filename )

            start = nsegments

            assert filename.endswith(".gtf") or filename.endswith(".gtf.gz"), \
                "requiring .gtf files for gene list filtering, received %s" % filename

            infile = IOTools.openFile( filename )
            iterator = GTF.iterator_filtered( GTF.iterator( infile ),
                                              feature=options.feature )
                
            E.debug("processing %s" % (filename))
            
            if not options.subsets or filename not in options.subsets:
                ## output all
                for contig, gffs in GTF.readAsIntervals( iterator ).items():
                    if options.remove_regex and options.remove_regex.search( contig ): continue

                    for x in gffs:
                        options.stdout.write( "%s\t%i\t%s\t(%i,%i)\n" % (prefix, nsegments, contig, x[0],x[1] ) )
                        nsegments += 1

                options.stdout.write("##Ann\t%s\t%s\n" % (filename, "\t".join( ["%i" % x for x in range(start, nsegments) ] ) ) )
                E.info( "set %s annotated with %i segments" % (filename, nsegments - start) )

            else:
                ## create subsets
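                ## (each subset file restricts the annotation to the listed
                ## gene ids; a gene id may carry several labels)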
                E.debug("applying subsets for %s" % filename )
                geneid2label, label2segments = collections.defaultdict(list), {}
                for label, filename_ids in options.subsets[filename]:
                    gene_ids = IOTools.readList( open(filename_ids, "r") )
                    for gene_id in gene_ids: geneid2label[gene_id].append( label )
                    label2segments[label] = []

                for contig, gffs in GTF.readAsIntervals( iterator, with_gene_id = True ).items():

                    if options.remove_regex and options.remove_regex.search( contig ): continue

                    for start, end, gene_id in gffs:
                        if gene_id not in geneid2label: continue
                        for label in geneid2label[gene_id]:
                            label2segments[label].append(nsegments)
                            
                        options.stdout.write( "%s\t%i\t%s\t(%i,%i)\n" % (prefix, nsegments, contig, start, end ) )
                        nsegments += 1                        
                        
                for label, segments in label2segments.iteritems():
                    options.stdout.write("##Ann\t%s\t%s\n" % (label, "\t".join( ["%i" % x for x in segments ] ) ) )
                    E.info( "set %s (%s) annotated with %i segments" % (label, filename, len(segments)) )

    E.info( "ninput=%i, ncontigs=%i, nsegments=%i, ndiscarded=%i" % (ninput, ncontigs, nsegments, ndiscarded))

    E.Stop()