Python GTF.readAsIntervals Examples

Programming Language: Python

Namespace/Package Name: cgat

Class/Type: GTF

Method/Function: readAsIntervals

Examples at hotexamples.com: 3

Python GTF.readAsIntervals - 3 examples found. These are the top rated real world Python examples of cgat.GTF.readAsIntervals extracted from open source projects. You can rate examples to help us improve the quality of examples.

Frequently Used Methods

Show Hide

iterator(30)

Entry(18)

transcript_iterator(9)

gene_iterator(6)

readFromFile(4)

asRanges(3)

iterator_filtered(3)

joined_iterator(3)

readAsIntervals(3)

flat_gene_iterator(2)

readAndIndex(2)

iterator_overlaps(2)

chunk_iterator(1)

SortPerContig(1)

Overlap(1)

iterator_sorted(1)

quote(1)

toIntronIntervals(1)

Example #1

Show file

def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    parser = E.ArgumentParser(description=__doc__)

    parser.add_argument("--version", action='version', version="1.0")

    parser.add_argument("-g",
                        "--genome-file",
                        dest="genome_file",
                        type=str,
                        help="filename with genome (indexed).")

    parser.add_argument("-w",
                        "--windows-bed-file",
                        dest="filename_windows",
                        type=str,
                        help="gff file with windows to use.")

    parser.add_argument("-d",
                        "--filename-data",
                        dest="filename_data",
                        type=str,
                        help="gff file with data to use.")

    parser.add_argument("--is-gtf",
                        dest="is_gtf",
                        action="store_true",
                        help="filename-data is gtf file")

    parser.add_argument("-f",
                        "--features",
                        dest="features",
                        type=str,
                        action="append",
                        choices=("GC", ),
                        help="features to compute.")

    parser.add_argument("-c",
                        "--decorator",
                        dest="decorator",
                        type=str,
                        choices=("counts", "gc", "gc3", "mean-length",
                                 "median-length", "percent-coverage",
                                 "median-score", "mean-score", "stddev-score",
                                 "min-score", "max-score"),
                        help="decorators to use.")

    parser.add_argument("-e",
                        "--skip-empty",
                        dest="skip_empty",
                        action="store_true",
                        help="skip empty windows.")

    parser.add_argument(
        "-t",
        "--transform=",
        dest="transform",
        type=str,
        choices=("none", "overlap", "complement", "third_codon"),
        help="transform to use when mapping overlapping regions onto window.")

    parser.set_defaults(
        genome_file=None,
        filename_windows=None,
        filename_data=None,
        features=[],
        skip_empty=False,
        decorator="counts",
        transform="none",
        is_gtf=False,
    )

    (args) = E.start(parser)

    #    test_transform_third_codon()

    if not args.filename_windows:
        raise ValueError("please supply a gff file with window information.")

    if args.loglevel >= 1:
        args.stdlog.write("# reading windows...")
        args.stdlog.flush()

    windows = GTF.readAsIntervals(
        GTF.iterator(iotools.open_file(args.filename_windows, "r")))

    if args.loglevel >= 1:
        args.stdlog.write("done\n")
        args.stdlog.flush()

    if args.filename_data:
        if args.loglevel >= 1:
            args.stdlog.write("# reading data...")
            args.stdlog.flush()

        if args.is_gtf:
            gff_data = GTF.readFromFile(
                iotools.open_file(args.filename_data, "r"))
        else:
            gff_data = GTF.readFromFile(
                IOTOols.open_file(args.filename_data, "r"))

        if args.loglevel >= 1:
            args.stdlog.write("done\n")
            args.stdlog.flush()

        data_ranges = GTF.SortPerContig(gff_data)
    else:
        # use windows to compute properties
        # by supplying no data and asking for the complement = original window
        gff_data = None
        data_ranges = None
        args.transform = "complement"

    map_contig2size = {}

    if args.genome_file:
        fasta = IndexedFasta.IndexedFasta(args.genome_file)
        map_contig2size = fasta.getContigSizes()
    else:
        for contig, values in list(windows.items()):
            map_contig2size[contig] = max(lambda x: x[1], values)
        fasta = None

    contigs = list(map_contig2size.keys())
    contigs.sort()

    # proceed contig wise
    noutput_contigs, ncontigs_skipped_windows, ncontigs_skipped_data = 0, 0, 0

    args.stdout.write("\t".join(
        map(str, ("contig", "start", "end", "ngenes", "ntranscripts", "n1",
                  "l1", "n2", "l2", "score", "extra_info"))) + "\n")

    for contig in contigs:

        skip = False
        if contig not in windows:
            ncontigs_skipped_windows += 1
            skip = True

        if data_ranges and contig not in data_ranges:
            ncontigs_skipped_data += 1
            skip = True

        if skip:
            continue

        noutput_contigs += 1
        if data_ranges:
            annotateWindows(
                contig, windows[contig],
                gff_data[data_ranges[contig][0]:data_ranges[contig][1]], fasta,
                args)
        else:
            annotateWindows(contig, windows[contig], [], fasta, args)

    E.info(
        "ninput_windows=%i, noutput_contigs=%i, ninput_contigs=%i, nskipped_windows=%i, nskipped_data=%i"
        % (len(windows), noutput_contigs, len(contigs),
           ncontigs_skipped_windows, ncontigs_skipped_data))

    E.stop()

Example #2

Show file

File: gff2fasta.py Project: alphaneer/cgat-apps

def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("--is-gtf",
                      dest="is_gtf",
                      action="store_true",
                      help="input is gtf instead of gff.")

    parser.add_option("-g",
                      "--genome-file",
                      dest="genome_file",
                      type="string",
                      help="filename with genome [default=%default].")

    parser.add_option("-m",
                      "--merge-adjacent",
                      dest="merge",
                      action="store_true",
                      help="merge adjacent intervals with the same attributes."
                      " [default=%default]")

    parser.add_option("-e",
                      "--feature",
                      dest="feature",
                      type="string",
                      help="filter by a feature, for example 'exon', 'CDS'."
                      " If set to the empty string, all entries are output "
                      "[%default].")

    parser.add_option("-f",
                      "--maskregions-bed-file",
                      dest="filename_masks",
                      type="string",
                      metavar="gff",
                      help="mask sequences with regions given in gff file "
                      "[%default].")

    parser.add_option("--remove-masked-regions",
                      dest="remove_masked_regions",
                      action="store_true",
                      help="remove regions instead of masking [%default].")

    parser.add_option("--min-interval-length",
                      dest="min_length",
                      type="int",
                      help="set minimum length for sequences output "
                      "[%default]")

    parser.add_option("--max-length",
                      dest="max_length",
                      type="int",
                      help="set maximum length for sequences output "
                      "[%default]")

    parser.add_option("--extend-at",
                      dest="extend_at",
                      type="choice",
                      choices=("none", "3", "5", "both", "3only", "5only"),
                      help="extend at no end, 3', 5' or both ends. If "
                      "3only or 5only are set, only the added sequence "
                      "is returned [default=%default]")

    parser.add_option("--header-attributes",
                      dest="header_attr",
                      action="store_true",
                      help="add GFF entry attributes to the FASTA record"
                      " header section")

    parser.add_option("--extend-by",
                      dest="extend_by",
                      type="int",
                      help="extend by # bases [default=%default]")

    parser.add_option("--extend-with",
                      dest="extend_with",
                      type="string",
                      help="extend using base [default=%default]")

    parser.add_option("--masker",
                      dest="masker",
                      type="choice",
                      choices=("dust", "dustmasker", "softmask", "none"),
                      help="apply masker [%default].")

    parser.add_option("--fold-at",
                      dest="fold_at",
                      type="int",
                      help="fold sequence every n bases[%default].")

    parser.add_option(
        "--fasta-name-attribute",
        dest="naming_attribute",
        type="string",
        help="use attribute to name fasta entry. Currently only compatable"
        " with gff format [%default].")

    parser.set_defaults(
        is_gtf=False,
        genome_file=None,
        merge=False,
        feature=None,
        filename_masks=None,
        remove_masked_regions=False,
        min_length=0,
        max_length=0,
        extend_at=None,
        extend_by=100,
        extend_with=None,
        masker=None,
        fold_at=None,
        naming_attribute=False,
        header_attr=False,
    )

    (options, args) = E.start(parser)

    if options.genome_file:
        fasta = IndexedFasta.IndexedFasta(options.genome_file)
        contigs = fasta.getContigSizes()

    if options.is_gtf:
        iterator = GTF.transcript_iterator(GTF.iterator(options.stdin))
    else:
        gffs = GTF.iterator(options.stdin)
        if options.merge:
            iterator = GTF.joined_iterator(gffs)
        else:
            iterator = GTF.chunk_iterator(gffs)

    masks = None
    if options.filename_masks:
        masks = {}
        with iotools.open_file(options.filename_masks, "r") as infile:
            e = GTF.readAsIntervals(GTF.iterator(infile))

        # convert intervals to intersectors
        for contig in list(e.keys()):
            intersector = quicksect.IntervalTree()
            for start, end in e[contig]:
                intersector.add(start, end)
            masks[contig] = intersector

    ninput, noutput, nmasked, nskipped_masked = 0, 0, 0, 0
    nskipped_length = 0
    nskipped_noexons = 0

    feature = options.feature

    # iterator is a list containing groups (lists) of features.
    # Each group of features have in common the same transcript ID, in case of
    # GTF files.
    for ichunk in iterator:

        ninput += 1

        if feature:
            chunk = [x for x in ichunk if x.feature == feature]
        else:
            chunk = ichunk

        if len(chunk) == 0:
            nskipped_noexons += 1
            E.info("no features in entry from "
                   "%s:%i..%i - %s" % (ichunk[0].contig, ichunk[0].start,
                                       ichunk[0].end, str(ichunk[0])))
            continue

        contig, strand = chunk[0].contig, chunk[0].strand

        if options.is_gtf:
            name = chunk[0].transcript_id
        else:
            if options.naming_attribute:
                attr_dict = {
                    x.split("=")[0]: x.split("=")[1]
                    for x in chunk[0].attributes.split(";")
                }
                name = attr_dict[options.naming_attribute]
            else:
                name = str(chunk[0].attributes)

        lcontig = contigs[contig]
        positive = Genomics.IsPositiveStrand(strand)
        intervals = [(x.start, x.end) for x in chunk]
        intervals.sort()

        if masks:
            if contig in masks:
                masked_regions = []
                for start, end in intervals:
                    masked_regions += [(x.start, x.end)
                                       for x in masks[contig].find(
                                           quicksect.Interval(start, end))]

                masked_regions = Intervals.combine(masked_regions)
                if len(masked_regions):
                    nmasked += 1

                if options.remove_masked_regions:
                    intervals = Intervals.truncate(intervals, masked_regions)
                else:
                    raise NotImplementedError("unimplemented")

                if len(intervals) == 0:
                    nskipped_masked += 1
                    if options.loglevel >= 1:
                        options.stdlog.write(
                            "# skipped because fully masked: "
                            "%s: regions=%s masks=%s\n" %
                            (name, str([(x.start, x.end)
                                        for x in chunk]), masked_regions))
                    continue

        out = intervals

        if options.extend_at and not options.extend_with:
            if options.extend_at == "5only":
                intervals = [(max(0, intervals[0][0] - options.extend_by),
                              intervals[0][0])]
            elif options.extend_at == "3only":
                intervals = [(intervals[-1][1],
                              min(lcontig,
                                  intervals[-1][1] + options.extend_by))]
            else:
                if options.extend_at in ("5", "both"):
                    intervals[0] = (max(0,
                                        intervals[0][0] - options.extend_by),
                                    intervals[0][1])
                if options.extend_at in ("3", "both"):
                    intervals[-1] = (intervals[-1][0],
                                     min(lcontig,
                                         intervals[-1][1] + options.extend_by))

        if not positive:
            intervals = [(lcontig - x[1], lcontig - x[0])
                         for x in intervals[::-1]]
            out.reverse()

        s = [
            fasta.getSequence(contig, strand, start, end)
            for start, end in intervals
        ]
        # IMS: allow for masking of sequences
        s = Masker.maskSequences(s, options.masker)
        l = sum([len(x) for x in s])
        if (l < options.min_length
                or (options.max_length and l > options.max_length)):
            nskipped_length += 1
            if options.loglevel >= 1:
                options.stdlog.write("# skipped because length out of bounds "
                                     "%s: regions=%s len=%i\n" %
                                     (name, str(intervals), l))
                continue

        if options.extend_at and options.extend_with:
            extension = "".join((options.extend_with, ) * options.extend_by)

            if options.extend_at in ("5", "both"):
                s[1] = extension + s[1]
            if options.extend_at in ("3", "both"):
                s[-1] = s[-1] + extension

        if options.fold_at:
            n = options.fold_at
            s = "".join(s)
            seq = "\n".join([s[i:i + n] for i in range(0, len(s), n)])
        else:
            seq = "\n".join(s)

        if options.header_attr:
            attributes = " ".join(
                [":".join([ax, ay]) for ax, ay in chunk[0].asDict().items()])
            options.stdout.write(
                ">%s %s:%s:%s feature:%s %s\n%s\n" %
                (name, contig, strand, ";".join(
                    ["%i-%i" % x
                     for x in out]), chunk[0].feature, attributes, seq))
        else:
            options.stdout.write(
                ">%s %s:%s:%s\n%s\n" %
                (name, contig, strand, ";".join(["%i-%i" % x
                                                 for x in out]), seq))

        noutput += 1

    E.info("ninput=%i, noutput=%i, nmasked=%i, nskipped_noexons=%i, "
           "nskipped_masked=%i, nskipped_length=%i" %
           (ninput, noutput, nmasked, nskipped_noexons, nskipped_masked,
            nskipped_length))

    E.stop()

Example #3

Show file

def cropGFF(gffs, filename_gff):
    """crop intervals in gff file."""

    # read regions to crop with and convert intervals to intersectors
    E.info("reading gff for cropping: started.")

    other_gffs = GTF.iterator(iotools.open_file(filename_gff, "r"))

    cropper = GTF.readAsIntervals(other_gffs)

    ntotal = 0
    for contig in list(cropper.keys()):
        intersector = quicksect.IntervalTree()
        for start, end in cropper[contig]:
            intersector.add(start, end)
            ntotal += 1
        cropper[contig] = intersector

    E.info("reading gff for cropping: finished.")
    E.info("reading gff for cropping: %i contigs with %i intervals." %
           (len(cropper), ntotal))

    ninput, noutput, ncropped, ndeleted = 0, 0, 0, 0

    # do the actual cropping
    for gff in gffs:

        ninput += 1

        if gff.contig in cropper:

            start, end = gff.start, gff.end
            overlaps = cropper[gff.contig].find(quicksect.Interval(start, end))

            if overlaps:

                l = end - start
                a = numpy.ones(l)
                for i in overlaps:
                    s = max(0, i.start - start)
                    e = min(l, i.end - start)
                    a[s:e] = 0

                segments = Intervals.fromArray(a)
                if len(segments) == 0:
                    ndeleted += 1
                else:
                    ncropped += 1

                for s, e in segments:
                    gff.start, gff.end = s + start, e + start
                    noutput += 1
                    yield (gff)

                continue

        noutput += 1

        yield (gff)

    E.info("ninput=%i, noutput=%i, ncropped=%i, ndeleted=%i" %
           (ninput, noutput, ncropped, ndeleted))