Python GTF.transcript_iterator Beispiele

Programmiersprache: Python

Namespace / Paketname: cgat

Klasse / Typ: GTF

Methode / Funktion: transcript_iterator

Beispiele auf hotexamples.com: 9

Python GTF.transcript_iterator - 9 Beispiele gefunden. Dies sind die am besten bewerteten Python Beispiele für die cgat.GTF.transcript_iterator, die aus Open Source-Projekten extrahiert wurden. Sie können Beispiele bewerten, um die Qualität der Beispiele zu verbessern.

Häufig verwendete Methoden

Anzeigen Verbergen

iterator(30)

Entry(18)

transcript_iterator(9)

gene_iterator(6)

readFromFile(4)

asRanges(3)

iterator_filtered(3)

joined_iterator(3)

readAsIntervals(3)

flat_gene_iterator(2)

readAndIndex(2)

iterator_overlaps(2)

chunk_iterator(1)

SortPerContig(1)

Overlap(1)

iterator_sorted(1)

quote(1)

toIntronIntervals(1)

Beispiel #1

Datei anzeigen

def loadLncRNAClass(infile, outfile):
    '''
    load the lncRNA classifications
    '''

    # just load each transcript with its classification
    temp = P.getTempFile(".")
    inf = iotools.openFile(infile)
    for transcript in GTF.transcript_iterator(GTF.iterator(inf)):
        temp.write("%s\t%s\t%s\n" %
                   (transcript[0].transcript_id, transcript[0].gene_id,
                    transcript[0].source))
    temp.close()

    P.load(temp.name,
           outfile,
           options="--header-names=transcript_id,gene_id,class "
           "--add-index=transcript_id "
           "--add-index=gene_id")

    os.unlink(temp.name)

Beispiel #2

Datei anzeigen

def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(
        version=
        "%prog version: $Id: gtf2fasta.py 2861 2010-02-23 17:36:32Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-g",
                      "--genome-file",
                      dest="genome_file",
                      type="string",
                      help="filename with genome [default=%default].")

    parser.add_option(
        "-i",
        "--ignore-missing",
        dest="ignore_missing",
        action="store_true",
        help=
        "Ignore transcripts on contigs that are not in the genome-file [default=%default]."
    )

    parser.add_option(
        "--min-intron-length",
        dest="min_intron_length",
        type="int",
        help=
        "minimum intron length. If the distance between two consecutive exons is smaller, the region will be marked 'unknown' [default=%default]."
    )

    parser.add_option("-m",
                      "--method",
                      dest="method",
                      type="choice",
                      choices=("full", ),
                      help="method to apply [default=%default].")

    parser.set_defaults(
        genome_file=None,
        flank=1000,
        max_frameshift_length=4,
        min_intron_length=30,
        ignore_missing=False,
        restrict_source=None,
        method="full",
        report_step=1000,
    )

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.start(parser, argv=argv, add_output_options=True)

    if not options.genome_file:
        raise ValueError("an indexed genome is required.")

    fasta = IndexedFasta.IndexedFasta(options.genome_file)

    iterator = GTF.transcript_iterator(GTF.iterator(options.stdin))

    annotateGenome(iterator, fasta, options)

    # write footer and output benchmark information.
    E.stop()

Beispiel #3

Datei anzeigen

Datei: bams2bam.py Projekt: alphaneer/cgat-apps

def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option(
        "-g", "--gtf-file", dest="filename_gtf", type="string",
        help="filename with gene models in gtf format [%default]")

    parser.add_option(
        "-m", "--filename-mismapped", dest="filename_mismapped", type="string",
        help="output bam file for mismapped reads [%default]")

    parser.add_option(
        "-j", "--junctions-bed-file", dest="filename_junctions", type="string",
        help="bam file with reads mapped across junctions [%default]")

    parser.add_option(
        "-r", "--filename-regions", dest="filename_regions", type="string",
        help="filename with regions to remove in bed format [%default]")

    parser.add_option(
        "-t", "--transcripts-gtf-file", dest="filename_transcriptome",
        type="string",
        help="bam file with reads mapped against transcripts [%default]")

    parser.add_option(
        "-p", "--map-tsv-file", dest="filename_map", type="string",
        help="filename mapping transcript numbers (used by "
        "--filename-transciptome) to transcript names "
        "(used by --filename-gtf) [%default]")

    parser.add_option(
        "-s", "--filename-stats", dest="filename_stats", type="string",
        help="filename to output stats to [%default]")

    parser.add_option(
        "-o", "--colour",
        dest="colour_mismatches", action="store_true",
        help="mismatches will use colour differences (CM tag) [%default]")

    parser.add_option(
        "-i", "--ignore-mismatches",
        dest="ignore_mismatches", action="store_true",
        help="ignore mismatches [%default]")

    parser.add_option(
        "-c", "--remove-contigs", dest="remove_contigs", type="string",
        help="','-separated list of contigs to remove [%default]")

    parser.add_option(
        "-f", "--force-output", dest="force", action="store_true",
        help="force overwriting of existing files [%default]")

    parser.add_option("-u", "--unique", dest="unique", action="store_true",
                      help="remove reads not matching uniquely [%default]")

    parser.add_option("--output-sam", dest="output_sam", action="store_true",
                      help="output in sam format [%default]")

    parser.set_defaults(
        filename_gtf=None,
        filename_mismapped=None,
        filename_junctions=None,
        filename_transcriptome=None,
        filename_map=None,
        remove_contigs=None,
        force=False,
        unique=False,
        colour_mismatches=False,
        ignore_mismatches=False,
        output_sam=False,
        filename_table=None,
    )

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.start(parser, argv=argv)

    if len(args) != 1:
        raise ValueError("please supply one bam file")

    bamfile_genome = args[0]
    genome_samfile = pysam.AlignmentFile(bamfile_genome, "rb")

    if options.remove_contigs:
        options.remove_contigs = options.remove_contigs.split(",")

    if options.filename_map:
        E.info("reading map")
        id_map = iotools.read_map(
            iotools.open_file(options.filename_map), has_header=True)
        id_map = dict([(y, x) for x, y in id_map.items()])
    else:
        id_map = None

    transcripts = {}
    if options.filename_gtf:
        E.info("indexing geneset")
        mapped, missed = 0, 0
        for gtf in GTF.transcript_iterator(
                GTF.iterator(iotools.open_file(options.filename_gtf))):
            gtf.sort(key=lambda x: x.start)
            transcript_id = gtf[0].transcript_id
            if id_map:
                try:
                    transcript_id = id_map[transcript_id]
                    mapped += 1
                except KeyError:
                    missed += 1
                    continue
            transcripts[transcript_id] = gtf

        E.info("read %i transcripts from geneset (%i mapped, %i missed)" %
               (len(transcripts), mapped, missed))

    regions_to_remove = None
    if options.filename_regions:
        E.info("indexing regions")
        regions_to_remove = IndexedGenome.Simple()
        for bed in Bed.iterator(iotools.open_file(options.filename_regions)):
            regions_to_remove.add(bed.contig, bed.start, bed.end)
        E.info("read %i regions" % len(regions_to_remove))

    if options.filename_transcriptome:
        transcripts_samfile = pysam.AlignmentFile(options.filename_transcriptome,
                                                  "rb")
    else:
        transcripts_samfile = None

    if options.output_sam:
        output_samfile = pysam.AlignmentFile("-", "wh", template=genome_samfile)
    else:
        output_samfile = pysam.AlignmentFile("-", "wb", template=genome_samfile)

    if options.filename_mismapped:
        if not options.force and os.path.exists(options.filename_mismapped):
            raise IOError("output file %s already exists" %
                          options.filename_mismapped)
        output_mismapped = pysam.AlignmentFile(options.filename_mismapped,
                                               "wb",
                                               template=genome_samfile)
    else:
        output_mismapped = None

    if options.filename_junctions:
        junctions_samfile = pysam.AlignmentFile(options.filename_junctions,
                                                "rb")
    else:
        junctions_samfile = None

    c = bams2bam_filter(genome_samfile,
                        output_samfile,
                        output_mismapped,
                        transcripts_samfile,
                        junctions_samfile,
                        transcripts,
                        regions=regions_to_remove,
                        unique=options.unique,
                        remove_contigs=options.remove_contigs,
                        colour_mismatches=options.colour_mismatches,
                        ignore_mismatches=options.ignore_mismatches,
                        ignore_transcripts=transcripts_samfile is None,
                        ignore_junctions=junctions_samfile is None)

    if options.filename_stats:
        outf = iotools.open_file(options.filename_stats, "w")
        outf.write("category\tcounts\n%s\n" % c.asTable())
        outf.close()

    if options.filename_transcriptome:
        transcripts_samfile.close()

    genome_samfile.close()
    output_samfile.close()
    if output_mismapped:
        output_mismapped.close()

    # write footer and output benchmark information.
    E.stop()

Beispiel #4

Datei anzeigen

Datei: gff2bed.py Projekt: harmeet1990/cgat-apps

def main(argv=sys.argv):

    parser = E.ArgumentParser(description=__doc__)

    parser.add_argument("--is-gtf",
                        dest="is_gtf",
                        action="store_true",
                        help="input file is in gtf format")

    parser.add_argument("--set-name",
                        dest="name",
                        type=str,
                        help="field from the GFF/GTF file to use as the "
                        "name field in the BED file ",
                        choices=("gene_id", "transcript_id", "class", "family",
                                 "feature", "source", "repName",
                                 "gene_biotype"))

    parser.add_argument("--track",
                        dest="track",
                        type=str,
                        choices=("feature", "source", None),
                        help="use feature/source field to define BED tracks ")

    parser.add_argument(
        "--bed12-from-transcripts",
        dest="bed12",
        action="store_true",
        default=False,
        help="Process GTF file into Bed12 entries, with blocks as exons"
        "and thick/thin as coding/non-coding")

    parser.set_defaults(track=None, name="gene_id", is_gtf=False)

    (args) = E.start(parser, add_pipe_options=True)

    ninput, noutput = 0, 0

    iterator = GTF.iterator(args.stdin)

    if args.bed12:
        iterator = GTF.transcript_iterator(iterator)

    if args.track:
        all_input = list(iterator)

        if args.track == "feature":
            grouper = lambda x: x.feature
        elif args.track == "source":
            grouper = lambda x: x.source

        all_input.sort(key=grouper)

        bed = Bed.Bed()
        for key, vals in itertools.groupby(all_input, grouper):
            args.stdout.write("track name=%s\n" % key)
            for gff in vals:
                ninput += 1

                if args.bed12:
                    bed = transcript2bed12(gff)
                else:
                    bed.fromGTF(gff, name=args.name)

                args.stdout.write(str(bed) + "\n")
                noutput += 1

    else:
        bed = Bed.Bed()
        for gff in iterator:
            ninput += 1

            if args.bed12:
                bed = transcript2bed12(gff)
            else:
                bed.fromGTF(gff, name=args.name)

            args.stdout.write(str(bed) + "\n")

            noutput += 1

    E.info("ninput=%i, noutput=%i" % (ninput, noutput))
    E.stop()

Beispiel #5

Datei anzeigen

Datei: gff2fasta.py Projekt: alphaneer/cgat-apps

def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("--is-gtf",
                      dest="is_gtf",
                      action="store_true",
                      help="input is gtf instead of gff.")

    parser.add_option("-g",
                      "--genome-file",
                      dest="genome_file",
                      type="string",
                      help="filename with genome [default=%default].")

    parser.add_option("-m",
                      "--merge-adjacent",
                      dest="merge",
                      action="store_true",
                      help="merge adjacent intervals with the same attributes."
                      " [default=%default]")

    parser.add_option("-e",
                      "--feature",
                      dest="feature",
                      type="string",
                      help="filter by a feature, for example 'exon', 'CDS'."
                      " If set to the empty string, all entries are output "
                      "[%default].")

    parser.add_option("-f",
                      "--maskregions-bed-file",
                      dest="filename_masks",
                      type="string",
                      metavar="gff",
                      help="mask sequences with regions given in gff file "
                      "[%default].")

    parser.add_option("--remove-masked-regions",
                      dest="remove_masked_regions",
                      action="store_true",
                      help="remove regions instead of masking [%default].")

    parser.add_option("--min-interval-length",
                      dest="min_length",
                      type="int",
                      help="set minimum length for sequences output "
                      "[%default]")

    parser.add_option("--max-length",
                      dest="max_length",
                      type="int",
                      help="set maximum length for sequences output "
                      "[%default]")

    parser.add_option("--extend-at",
                      dest="extend_at",
                      type="choice",
                      choices=("none", "3", "5", "both", "3only", "5only"),
                      help="extend at no end, 3', 5' or both ends. If "
                      "3only or 5only are set, only the added sequence "
                      "is returned [default=%default]")

    parser.add_option("--header-attributes",
                      dest="header_attr",
                      action="store_true",
                      help="add GFF entry attributes to the FASTA record"
                      " header section")

    parser.add_option("--extend-by",
                      dest="extend_by",
                      type="int",
                      help="extend by # bases [default=%default]")

    parser.add_option("--extend-with",
                      dest="extend_with",
                      type="string",
                      help="extend using base [default=%default]")

    parser.add_option("--masker",
                      dest="masker",
                      type="choice",
                      choices=("dust", "dustmasker", "softmask", "none"),
                      help="apply masker [%default].")

    parser.add_option("--fold-at",
                      dest="fold_at",
                      type="int",
                      help="fold sequence every n bases[%default].")

    parser.add_option(
        "--fasta-name-attribute",
        dest="naming_attribute",
        type="string",
        help="use attribute to name fasta entry. Currently only compatable"
        " with gff format [%default].")

    parser.set_defaults(
        is_gtf=False,
        genome_file=None,
        merge=False,
        feature=None,
        filename_masks=None,
        remove_masked_regions=False,
        min_length=0,
        max_length=0,
        extend_at=None,
        extend_by=100,
        extend_with=None,
        masker=None,
        fold_at=None,
        naming_attribute=False,
        header_attr=False,
    )

    (options, args) = E.start(parser)

    if options.genome_file:
        fasta = IndexedFasta.IndexedFasta(options.genome_file)
        contigs = fasta.getContigSizes()

    if options.is_gtf:
        iterator = GTF.transcript_iterator(GTF.iterator(options.stdin))
    else:
        gffs = GTF.iterator(options.stdin)
        if options.merge:
            iterator = GTF.joined_iterator(gffs)
        else:
            iterator = GTF.chunk_iterator(gffs)

    masks = None
    if options.filename_masks:
        masks = {}
        with iotools.open_file(options.filename_masks, "r") as infile:
            e = GTF.readAsIntervals(GTF.iterator(infile))

        # convert intervals to intersectors
        for contig in list(e.keys()):
            intersector = quicksect.IntervalTree()
            for start, end in e[contig]:
                intersector.add(start, end)
            masks[contig] = intersector

    ninput, noutput, nmasked, nskipped_masked = 0, 0, 0, 0
    nskipped_length = 0
    nskipped_noexons = 0

    feature = options.feature

    # iterator is a list containing groups (lists) of features.
    # Each group of features have in common the same transcript ID, in case of
    # GTF files.
    for ichunk in iterator:

        ninput += 1

        if feature:
            chunk = [x for x in ichunk if x.feature == feature]
        else:
            chunk = ichunk

        if len(chunk) == 0:
            nskipped_noexons += 1
            E.info("no features in entry from "
                   "%s:%i..%i - %s" % (ichunk[0].contig, ichunk[0].start,
                                       ichunk[0].end, str(ichunk[0])))
            continue

        contig, strand = chunk[0].contig, chunk[0].strand

        if options.is_gtf:
            name = chunk[0].transcript_id
        else:
            if options.naming_attribute:
                attr_dict = {
                    x.split("=")[0]: x.split("=")[1]
                    for x in chunk[0].attributes.split(";")
                }
                name = attr_dict[options.naming_attribute]
            else:
                name = str(chunk[0].attributes)

        lcontig = contigs[contig]
        positive = Genomics.IsPositiveStrand(strand)
        intervals = [(x.start, x.end) for x in chunk]
        intervals.sort()

        if masks:
            if contig in masks:
                masked_regions = []
                for start, end in intervals:
                    masked_regions += [(x.start, x.end)
                                       for x in masks[contig].find(
                                           quicksect.Interval(start, end))]

                masked_regions = Intervals.combine(masked_regions)
                if len(masked_regions):
                    nmasked += 1

                if options.remove_masked_regions:
                    intervals = Intervals.truncate(intervals, masked_regions)
                else:
                    raise NotImplementedError("unimplemented")

                if len(intervals) == 0:
                    nskipped_masked += 1
                    if options.loglevel >= 1:
                        options.stdlog.write(
                            "# skipped because fully masked: "
                            "%s: regions=%s masks=%s\n" %
                            (name, str([(x.start, x.end)
                                        for x in chunk]), masked_regions))
                    continue

        out = intervals

        if options.extend_at and not options.extend_with:
            if options.extend_at == "5only":
                intervals = [(max(0, intervals[0][0] - options.extend_by),
                              intervals[0][0])]
            elif options.extend_at == "3only":
                intervals = [(intervals[-1][1],
                              min(lcontig,
                                  intervals[-1][1] + options.extend_by))]
            else:
                if options.extend_at in ("5", "both"):
                    intervals[0] = (max(0,
                                        intervals[0][0] - options.extend_by),
                                    intervals[0][1])
                if options.extend_at in ("3", "both"):
                    intervals[-1] = (intervals[-1][0],
                                     min(lcontig,
                                         intervals[-1][1] + options.extend_by))

        if not positive:
            intervals = [(lcontig - x[1], lcontig - x[0])
                         for x in intervals[::-1]]
            out.reverse()

        s = [
            fasta.getSequence(contig, strand, start, end)
            for start, end in intervals
        ]
        # IMS: allow for masking of sequences
        s = Masker.maskSequences(s, options.masker)
        l = sum([len(x) for x in s])
        if (l < options.min_length
                or (options.max_length and l > options.max_length)):
            nskipped_length += 1
            if options.loglevel >= 1:
                options.stdlog.write("# skipped because length out of bounds "
                                     "%s: regions=%s len=%i\n" %
                                     (name, str(intervals), l))
                continue

        if options.extend_at and options.extend_with:
            extension = "".join((options.extend_with, ) * options.extend_by)

            if options.extend_at in ("5", "both"):
                s[1] = extension + s[1]
            if options.extend_at in ("3", "both"):
                s[-1] = s[-1] + extension

        if options.fold_at:
            n = options.fold_at
            s = "".join(s)
            seq = "\n".join([s[i:i + n] for i in range(0, len(s), n)])
        else:
            seq = "\n".join(s)

        if options.header_attr:
            attributes = " ".join(
                [":".join([ax, ay]) for ax, ay in chunk[0].asDict().items()])
            options.stdout.write(
                ">%s %s:%s:%s feature:%s %s\n%s\n" %
                (name, contig, strand, ";".join(
                    ["%i-%i" % x
                     for x in out]), chunk[0].feature, attributes, seq))
        else:
            options.stdout.write(
                ">%s %s:%s:%s\n%s\n" %
                (name, contig, strand, ";".join(["%i-%i" % x
                                                 for x in out]), seq))

        noutput += 1

    E.info("ninput=%i, noutput=%i, nmasked=%i, nskipped_noexons=%i, "
           "nskipped_masked=%i, nskipped_length=%i" %
           (ninput, noutput, nmasked, nskipped_noexons, nskipped_masked,
            nskipped_length))

    E.stop()

Beispiel #6

Datei anzeigen

Datei: gtf2fasta.py Projekt: harmeet1990/cgat-apps

def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.ArgumentParser(description=__doc__)

    parser.add_argument("--version", action='version', version="1.0")

    parser.add_argument("-g",
                        "--genome-file",
                        dest="genome_file",
                        type=str,
                        help="filename with genome")

    parser.add_argument(
        "-i",
        "--ignore-missing",
        dest="ignore_missing",
        action="store_true",
        help="Ignore transcripts on contigs that are not in the genome-file.")

    parser.add_argument(
        "--min-intron-length",
        dest="min_intron_length",
        type=int,
        help=
        "minimum intron length. If the distance between two consecutive exons is smaller, the region will be marked 'unknown"
    )

    parser.add_argument("-m",
                        "--method",
                        dest="method",
                        type=str,
                        choices=["full"],
                        help="method to apply")

    parser.set_defaults(
        genome_file=None,
        flank=1000,
        max_frameshift_length=4,
        min_intron_length=30,
        ignore_missing=False,
        restrict_source=None,
        method="full",
        report_step=1000,
    )

    # add common options (-h/--help, ...) and parse command line
    (args) = E.start(parser, argv=argv, add_output_options=True)

    if not args.genome_file:
        raise ValueError("an indexed genome is required.")

    fasta = IndexedFasta.IndexedFasta(args.genome_file)

    iterator = GTF.transcript_iterator(GTF.iterator(args.stdin))

    annotateGenome(iterator, fasta, args)

    # write footer and output benchmark information.
    E.stop()

Beispiel #7

Datei anzeigen

Datei: find_utrons.py Projekt: jjriley1/pipeline_utrons

def main(argv=None):
    """script main.
    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $1.0$",
                            usage=globals()["__doc__"])

    parser.add_option("-r",
                      "--reffile",
                      dest="reffile",
                      type="string",
                      help="Supply reference gtf file name")

    parser.add_option("-d",
                      "--class-file",
                      dest="classfile",
                      type="string",
                      help="Supply database name")

    parser.add_option("-o",
                      "--outfile",
                      dest="outfile",
                      type="string",
                      help="Supply output bed file name")

    parser.add_option("-u",
                      "--indivfile",
                      dest="indivfile",
                      type="string",
                      help="Supply output bed file name for individual utrons")

    parser.add_option("-p",
                      "--partfile",
                      dest="partfile",
                      type="string",
                      help="Supply output bed file name for partnered utrons")
    parser.add_option(
        "-q",
        "--indivpartfile",
        dest="indivpartfile",
        type="string",
        help="Supply output bed file name for individual partnered utrons")
    parser.add_option("-n",
                      "--novel-file",
                      dest="novelfile",
                      type="string",
                      help="Supply output bed file name for novel introns")
    parser.add_option(
        "--novel-transcript",
        dest="novel_id",
        type="string",
        help="DEBUG: Output info for this transcript from the STDIN")
    parser.add_option(
        "--target-transcript",
        dest="target_id",
        type="string",
        help="DEBUG: Output info for this transcript from ref-file")

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.start(parser, argv=argv)

    outlines = []
    individuals = []
    partnered = []
    individualpartnered = []
    novel = []

    db = pandas.read_csv(options.classfile, sep="\t")

    # This keeps just one entry per-transcript - why?
    #db = db.groupby("transcript_id").first()
    db = db.set_index("transcript_id")
    enshashtable = getGeneTable(options.reffile)

    for novel_transcript in GTF.transcript_iterator(GTF.iterator(
            options.stdin)):

        # Why do it on a gene by gene basis rather than transcript by transcript basis?
        transcript_id = novel_transcript[0].transcript_id

        if transcript_id == options.novel_id:
            output_novel = True
        else:
            output_novel = False

        try:
            geneid = db.loc[transcript_id].match_gene_id
        except KeyError:
            if output_novel:
                E.debug("Transcript %s not in class table" % transcript_id)
            continue

        if pandas.isnull(geneid):
            if output_novel:
                E.debug("Transcript %s matches no gene in class table" %
                        transcript_id)
            continue

        ens_gene = enshashtable[geneid]

        all_ref_introns = set()
        novel_transcript_exons = GTF.asRanges(novel_transcript, "exon")
        novel_transcript_introns = GTF.toIntronIntervals(novel_transcript)
        for ref_transcript in ens_gene["models"].values():
            ref_introns = GTF.toIntronIntervals(ref_transcript)
            all_ref_introns.update(ref_introns)

        #Identify comparison set
        def _in_exon(position, exons):
            return any(e[0] <= position <= e[1] for e in exons)

        # check if this ever gets the wrong start_codon.
        filtered_starts = [
            s for s in ens_gene["start_codons"]
            if _in_exon(s, novel_transcript_exons)
        ]

        if len(filtered_starts) == 0:
            if output_novel:
                E.debug("No starts found for %s" % transcript_id)
            continue

        #if novel_transcript[0].strand == "-":
        #    selected_start = max(filtered_starts)
        #else:
        #    selected_start = min(filtered_starts)

        selected_models = list()
        for startc in filtered_starts:
            selected_models.extend(ens_gene["start_codons"][startc])

        if output_novel:
            E.debug("Transcripts with compatible starts are %s" %
                    selected_models)

        for ref_transcript_id in selected_models:

            if output_novel and ref_transcript_id == options.target_id:
                output_ref = True
            else:
                output_ref = False

            second = ens_gene["models"][ref_transcript_id]
            ens_CDS = GTF.asRanges(second, "CDS")

            if len(ens_CDS) == 0:
                if output_ref:
                    E.debug("%s is not coding"
                            )  # ensure only protein-coding transcripts
                continue

            ens_exons = GTF.asRanges(second, "exon")

            first_introns = set(novel_transcript_introns)
            second_introns = set(GTF.toIntronIntervals(second))

            first_CDSintrons = [
                intron for intron in first_introns
                if (intron[0] > ens_CDS[0][0] and intron[1] < ens_CDS[-1][1])
            ]

            second_CDSintrons = [
                intron for intron in second_introns
                if (intron[0] > ens_CDS[0][0] and intron[1] < ens_CDS[-1][1])
            ]

            first_CDSintrons = set(first_CDSintrons)
            second_CDSintrons = set(second_CDSintrons)

            if not first_CDSintrons == second_CDSintrons:
                if output_ref:
                    E.debug("CDS chains do not match. Chains are:")
                    first_CDSintrons = sorted(list(first_CDSintrons))
                    second_CDSintrons = sorted(list(second_CDSintrons))
                    output = "\n".join(
                        map(str, zip(first_CDSintrons, second_CDSintrons)))
                    E.debug(output)
                continue  # match CDS intron chain

            firstUTRintrons = first_introns - first_CDSintrons

            if len(firstUTRintrons) == 0:
                if output_ref:
                    E.debug("No UTR introns")
                continue

            secondUTRintrons = second_introns - second_CDSintrons

            found = False
            for intron in first_introns:
                if (intron[0] < ens_CDS[-1][1] and
                    intron[1] > ens_CDS[-1][1]) or \
                    (intron[0] < ens_CDS[0][0] and
                     intron[1] > ens_CDS[0][0]):

                    found = True
                    break  # ensure pruned transcript doesn't have
                    # introns overlapping start or stop codons in ensembl
                    # transcript
            if found:
                if output_ref:
                    E.debug("Start or stop in intron")
                continue

            if second[0].strand == "+":
                ens_stop = ens_CDS[-1][1]
                UTR3introns = [
                    intron for intron in firstUTRintrons
                    if intron[0] >= ens_CDS[-1][1]
                    and intron[1] < ens_exons[-1][1]
                ]
                secondUTR3introns = [
                    intron for intron in secondUTRintrons
                    if intron[0] >= ens_CDS[-1][1]
                    and intron[1] < ens_exons[-1][1]
                ]
            else:
                ens_stop = ens_CDS[0][0]
                UTR3introns = [
                    intron for intron in firstUTRintrons if
                    intron[1] <= ens_CDS[0][0] and intron[0] > ens_exons[0][0]
                ]
                secondUTR3introns = [
                    intron for intron in secondUTRintrons if
                    intron[1] <= ens_CDS[0][0] and intron[0] > ens_exons[0][0]
                ]

            if len(UTR3introns) == 0:
                if output_ref:
                    E.debug("No UTR introns")
                continue

            outbed = Bed.Bed()
            outbed.fields = ['.', '.', '.', '.', '.', '.', '.', '.', '.']
            outbed.fromIntervals(UTR3introns)
            outbed.contig = novel_transcript[0].contig
            outbed["name"] = novel_transcript[0].transcript_id
            outbed["strand"] = novel_transcript[0].strand
            outlines.append(outbed)  # get output for each transcript

            for item in UTR3introns:
                outbed2 = Bed.Bed()
                outbed2.fields = ['.', '.', '.', '.']
                outbed2.fromIntervals([item])
                outbed2.contig = novel_transcript[0].contig
                outbed2['name'] = novel_transcript[0].transcript_id
                outbed2["strand"] = novel_transcript[0].strand
                outbed2["thickStart"] = ens_stop
                individuals.append(outbed2)  # get output for each intron

            UTR3introns = set(UTR3introns)
            secondUTR3introns = set(secondUTR3introns)
            extraUTR3introns = list(UTR3introns - secondUTR3introns)

            if output_ref and len(secondUTR3introns - UTR3introns) > 0:
                E.debug("Following introns in UTR of %s but not %s" %
                        (options.target_id, options.novel_id))
                E.debug(secondUTRintrons - UTR3introns)

            # get only introns that are not in matched transcript
            if len(extraUTR3introns) != 0 and len(secondUTR3introns -
                                                  UTR3introns) == 0:
                outbed3 = Bed.Bed()
                outbed3.fields = ['.'] * 9
                outbed3.fromIntervals(extraUTR3introns)
                outbed3.contig = novel_transcript[0].contig
                outbed3["name"] = novel_transcript[
                    0].transcript_id + ":" + second[0].transcript_id
                outbed3["strand"] = novel_transcript[0].strand
                partnered.append(outbed3)

                for item in extraUTR3introns:
                    outbed4 = Bed.Bed()
                    outbed4.fields = ['.', '.', '.', '.']
                    outbed4.fromIntervals([item])
                    outbed4.contig = novel_transcript[0].contig
                    outbed4["name"] = novel_transcript[
                        0].transcript_id + ":" + second[0].transcript_id
                    outbed4["strand"] = novel_transcript[0].strand
                    outbed4["thickStart"] = ens_stop
                    individualpartnered.append(outbed4)

            if len(all_ref_introns) == 0:
                ens_starts, ens_ends = [], []
            else:
                ens_starts, ens_ends = zip(*all_ref_introns)

            novelEvents = [
                i for i in UTR3introns
                if i[0] not in ens_starts and i[1] not in ens_ends
            ]

            for item in novelEvents:
                outbed5 = Bed.Bed()
                outbed5.fields = ['.'] * 4
                outbed5.fromIntervals([item])
                outbed5.contig = novel_transcript[0].contig
                outbed5["name"] = novel_transcript[
                    0].transcript_id + ":" + second[0].transcript_id
                outbed5["strand"] = novel_transcript[0].strand
                outbed5["thickStart"] = ens_stop
                novel.append(outbed5)

    with IOTools.open_file(options.outfile, "w") as outf:
        for line in outlines:
            outf.write(str(line) + "\n")

    if options.indivfile is not None:
        with IOTools.open_file(options.indivfile, "w") as outf2:
            for line in individuals:
                outf2.write(str(line) + "\n")

    if options.partfile is not None:
        with IOTools.open_file(options.partfile, "w") as outf3:
            for line in partnered:
                outf3.write(str(line) + "\n")

    if options.indivpartfile is not None:
        with IOTools.open_file(options.indivpartfile, "w") as outf4:
            for line in individualpartnered:
                outf4.write(str(line) + "\n")

    if options.novelfile is not None:
        with IOTools.open_file(options.novelfile, "w") as outf5:
            for line in novel:
                outf5.write(str(line) + "\n")
    # write footer and output benchmark information.
    E.stop()

Beispiel #8

Datei anzeigen

Datei: gff2psl.py Projekt: alphaneer/cgat-apps

def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id: gff2psl.py 2781 2009-09-10 11:33:14Z andreas $", usage=globals()["__doc__"])

    parser.add_option("--is-gtf", dest="is_gtf", action="store_true",
                      help="input is gtf.")

    parser.add_option("--no-header", dest="with_header", action="store_false",
                      help="do not output BLAT header [default=%default].")

    parser.add_option("-g", "--genome-file", dest="genome_file", type="string",
                      help="filename with genome.")

    parser.add_option("--queries-tsv-file", dest="input_filename_queries", type="string",
                      help="fasta filename with queries [default=%default].")

    parser.add_option("--allow-duplicates", dest="allow_duplicates", action="store_true",
                      help="""permit duplicate entries. Adjacent exons of a transcript will still be merged [default=%default]."""  )

    parser.set_defaults(is_gtf=False,
                        genome_file=None,
                        with_header=True,
                        allow_duplicates=False,
                        test=None)

    (options, args) = E.start(parser, add_pipe_options=True)

    if options.genome_file:
        genome_fasta = IndexedFasta.IndexedFasta(options.genome_file)
    else:
        genome_fasta = None

    if options.input_filename_queries:
        queries_fasta = IndexedFasta.IndexedFasta(
            options.input_filename_queries)
    else:
        queries_fasta = None

    ninput, noutput, nskipped = 0, 0, 0

    if options.is_gtf:
        iterator = GTF.transcript_iterator(GTF.iterator_filtered(GTF.iterator(sys.stdin),
                                                                 feature="exon"),
                                           strict=not options.allow_duplicates)
    else:
        iterator = GTF.joined_iterator(GTF.iterator(sys.stdin))

    if options.with_header:
        options.stdout.write(Blat.Match().getHeader() + "\n")

    for gffs in iterator:

        if options.test and ninput >= options.test:
            break

        ninput += 1

        result = alignlib_lite.py_makeAlignmentBlocks()

        xstart = 0

        intervals = Intervals.combine([(gff.start, gff.end) for gff in gffs])

        for start, end in intervals:
            xend = xstart + end - start

            result.addDiagonal(xstart, xend,
                               start - xstart)
            xstart = xend

        entry = Blat.Match()
        entry.mQueryId = gffs[0].transcript_id
        entry.mSbjctId = gffs[0].contig
        entry.strand = gffs[0].strand

        if genome_fasta:
            if entry.mSbjctId in genome_fasta:
                entry.mSbjctLength = genome_fasta.getLength(entry.mSbjctId)
            else:
                entry.mSbjctLength = result.getColTo()

        if queries_fasta:
            if entry.mQueryId in queries_fasta:
                entry.mQueryLength = queries_fasta.getLength(entry.mQueryId)
        else:
            entry.mQueryLength = result.getRowTo()

        entry.fromMap(result)

        options.stdout.write(str(entry) + "\n")
        noutput += 1

    E.info("ninput=%i, noutput=%i, nskipped=%i" % (ninput, noutput, nskipped))

    E.stop()

Beispiel #9

Datei anzeigen

def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
        """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-m",
                      "--method",
                      dest="methods",
                      type="choice",
                      action="append",
                      choices=(
                          "geneprofile",
                          "tssprofile",
                          "utrprofile",
                          "intervalprofile",
                          "midpointprofile",
                          "geneprofilewithintrons",
                          "geneprofileabsolutedistancefromthreeprimeend",
                          "separateexonprofile",
                          "separateexonprofilewithintrons",
                      ),
                      help='counters to use. Counters describe the '
                      'meta-gene structure to use. '
                      'Note using geneprofilewithintrons, or '
                      'geneprofileabsolutedistancefromthreeprimeend will '
                      'automatically turn on the --use-base-accuracy option'
                      '[%default].')

    parser.add_option("-b",
                      "--bam-file",
                      "--bedfile",
                      "--bigwigfile",
                      dest="infiles",
                      metavar="BAM",
                      type="string",
                      action="append",
                      help="BAM/bed/bigwig files to use. Do not mix "
                      "different types [%default]")

    parser.add_option("-c",
                      "--control-bam-file",
                      dest="controlfiles",
                      metavar="BAM",
                      type="string",
                      action="append",
                      help="control/input to use. Should be of the same "
                      "type as the bam/bed/bigwig file"
                      " [%default]")

    parser.add_option("-g",
                      "--gtf-file",
                      dest="gtffile",
                      type="string",
                      metavar="GTF",
                      help="GTF file to use. "
                      "[%default]")

    parser.add_option("--normalize-transcript",
                      dest="transcript_normalization",
                      type="choice",
                      choices=("none", "max", "sum", "total-max", "total-sum"),
                      help="normalization to apply on each transcript "
                      "profile before adding to meta-gene profile. "
                      "[%default]")

    parser.add_option("--normalize-profile",
                      dest="profile_normalizations",
                      type="choice",
                      action="append",
                      choices=("all", "none", "area", "counts", "background"),
                      help="normalization to apply on meta-gene "
                      "profile normalization. "
                      "[%default]")

    parser.add_option(
        "-r",
        "--reporter",
        dest="reporter",
        type="choice",
        choices=("gene", "transcript"),
        help="report results for genes or transcripts."
        " When 'genes` is chosen, exons across all transcripts for"
        " a gene are merged. When 'transcript' is chosen, counts are"
        " computed for each transcript separately with each transcript"
        " contributing equally to the meta-gene profile."
        " [%default]")

    parser.add_option("-i",
                      "--shift-size",
                      dest="shifts",
                      type="int",
                      action="append",
                      help="shift reads in :term:`bam` formatted file "
                      "before computing densities (ChIP-Seq). "
                      "[%default]")

    parser.add_option("-a",
                      "--merge-pairs",
                      dest="merge_pairs",
                      action="store_true",
                      help="merge pairs in :term:`bam` formatted "
                      "file before computing "
                      "densities (ChIP-Seq). "
                      "[%default]")

    parser.add_option("-u",
                      "--use-base-accuracy",
                      dest="base_accuracy",
                      action="store_true",
                      help="compute densities with base accuracy. The default "
                      "is to only use the start and end of the aligned region "
                      "(RNA-Seq) "
                      "[%default]")

    parser.add_option("-e",
                      "--extend",
                      dest="extends",
                      type="int",
                      action="append",
                      help="extend reads in :term:`bam` formatted file "
                      "(ChIP-Seq). "
                      "[%default]")

    parser.add_option("--resolution-upstream",
                      dest="resolution_upstream",
                      type="int",
                      help="resolution of upstream region in bp "
                      "[%default]")

    parser.add_option("--resolution-downstream",
                      dest="resolution_downstream",
                      type="int",
                      help="resolution of downstream region in bp "
                      "[%default]")

    parser.add_option("--resolution-upstream-utr",
                      dest="resolution_upstream_utr",
                      type="int",
                      help="resolution of upstream UTR region in bp "
                      "[%default]")

    parser.add_option("--resolution-downstream-utr",
                      dest="resolution_downstream_utr",
                      type="int",
                      help="resolution of downstream UTR region in bp "
                      "[%default]")

    parser.add_option("--resolution-cds",
                      dest="resolution_cds",
                      type="int",
                      help="resolution of cds region in bp "
                      "[%default]")

    parser.add_option("--resolution-first-exon",
                      dest="resolution_first",
                      type="int",
                      help="resolution of first exon in gene, in bp"
                      "[%default]")

    parser.add_option("--resolution-last-exon",
                      dest="resolution_last",
                      type="int",
                      help="resolution of last exon in gene, in bp"
                      "[%default]")

    parser.add_option("--resolution-introns",
                      dest="resolution_introns",
                      type="int",
                      help="resolution of introns region in bp "
                      "[%default]")

    parser.add_option("--resolution-exons-absolute-distance-topolya",
                      dest="resolution_exons_absolute_distance_topolya",
                      type="int",
                      help="resolution of exons absolute distance "
                      "topolya in bp "
                      "[%default]")

    parser.add_option("--resolution-introns-absolute-distance-topolya",
                      dest="resolution_introns_absolute_distance_topolya",
                      type="int",
                      help="resolution of introns absolute distance "
                      "topolya in bp "
                      "[%default]")

    parser.add_option("--extension-exons-absolute-distance-topolya",
                      dest="extension_exons_absolute_distance_topolya",
                      type="int",
                      help="extension for exons from the absolute "
                      "distance from the topolya in bp "
                      "[%default]")

    parser.add_option(
        "--extension-introns-absolute-distance-topolya",
        dest="extension_introns_absolute_distance_topolya",
        type="int",
        help="extension for introns from the absolute distance from "
        "the topolya in bp [%default]")

    parser.add_option("--extension-upstream",
                      dest="extension_upstream",
                      type="int",
                      help="extension upstream from the first exon in bp"
                      "[%default]")

    parser.add_option("--extension-downstream",
                      dest="extension_downstream",
                      type="int",
                      help="extension downstream from the last exon in bp"
                      "[%default]")

    parser.add_option("--extension-inward",
                      dest="extension_inward",
                      type="int",
                      help="extension inward from a TSS start site in bp"
                      "[%default]")

    parser.add_option("--extension-outward",
                      dest="extension_outward",
                      type="int",
                      help="extension outward from a TSS start site in bp"
                      "[%default]")

    parser.add_option("--scale-flank-length",
                      dest="scale_flanks",
                      type="int",
                      help="scale flanks to (integer multiples of) gene length"
                      "[%default]")

    parser.add_option(
        "--control-factor",
        dest="control_factor",
        type="float",
        help="factor for normalizing control and foreground data. "
        "Computed from data if not set. "
        "[%default]")

    parser.add_option("--output-all-profiles",
                      dest="output_all_profiles",
                      action="store_true",
                      help="keep individual profiles for each "
                      "transcript and output. "
                      "[%default]")

    parser.add_option("--counts-tsv-file",
                      dest="input_filename_counts",
                      type="string",
                      help="filename with count data for each transcript. "
                      "Use this instead "
                      "of recomputing the profile. Useful for plotting the "
                      "meta-gene profile "
                      "from previously computed counts "
                      "[%default]")

    parser.add_option(
        "--background-region-bins",
        dest="background_region_bins",
        type="int",
        help="number of bins on either end of the profile "
        "to be considered for background meta-gene normalization "
        "[%default]")

    parser.add_option(
        "--output-res",
        dest="resolution_images",
        type="int",
        help="the output dpi for the figure plot - will default to "
        "[%default]")

    parser.add_option(
        "--image-format",
        dest="image_format",
        type="string",
        help="The output format for the figure plot - defaults to "
        "[%default]")

    parser.set_defaults(
        remove_rna=False,
        ignore_pairs=False,
        force_output=False,
        bin_size=10,
        extends=[],
        shifts=[],
        sort=[],
        reporter="transcript",
        resolution_cds=1000,
        resolution_introns=1000,
        # 3kb is a good balance of seeing long enough 3 prime bias and not omit
        # too many genes. Tim 31th Aug 2013
        resolution_exons_absolute_distance_topolya=3000,
        # introns is only for assess the noise level, thus do ont need a long
        # region, a long region has the side effect of omit more genes. Tim
        # 31th Aug 2013
        resolution_introns_absolute_distance_topolya=500,
        # extension can simply just be the same as resolution
        extension_exons_absolute_distance_topolya=3000,
        extension_introns_absolute_distance_topolya=500,
        resolution_upstream_utr=1000,
        resolution_downstream_utr=1000,
        resolution_upstream=1000,
        resolution_downstream=1000,
        resolution_first=1000,
        resolution_last=1000,
        # mean length of transcripts: about 2.5 kb
        extension_upstream=2500,
        extension_downstream=2500,
        extension_inward=3000,
        extension_outward=3000,
        plot=True,
        methods=[],
        infiles=[],
        controlfiles=[],
        gtffile=None,
        profile_normalizations=[],
        transcript_normalization=None,
        scale_flanks=0,
        merge_pairs=False,
        min_insert_size=0,
        max_insert_size=1000,
        base_accuracy=False,
        matrix_format="single",
        control_factor=None,
        output_all_profiles=False,
        background_region_bins=10,
        input_filename_counts=None,
        resolution_images=None,
        image_format="png",
    )

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.start(parser, argv=argv, add_output_options=True)

    # Keep for backwards compatability
    if len(args) == 2:
        infile, gtf = args
        options.infiles.append(infile)
        options.gtffile = gtf

    if not options.gtffile:
        raise ValueError("no GTF file specified")

    if options.gtffile == "-":
        options.gtffile = options.stdin
    else:
        options.gtffile = iotools.open_file(options.gtffile)

    if len(options.infiles) == 0:
        raise ValueError("no bam/wig/bed files specified")

    for methodsRequiresBaseAccuracy in [
            "geneprofilewithintrons",
            "geneprofileabsolutedistancefromthreeprimeend",
    ]:
        # If you implemented any methods that you do not want the
        # spliced out introns or exons appear to be covered by
        # non-existent reads, it is better you let those methods imply
        # --base-accurarcy by add them here.
        if methodsRequiresBaseAccuracy in options.methods:
            options.base_accuracy = True

    if options.reporter == "gene":
        gtf_iterator = GTF.flat_gene_iterator(GTF.iterator(options.gtffile))
    elif options.reporter == "transcript":
        gtf_iterator = GTF.transcript_iterator(GTF.iterator(options.gtffile))

    # Select rangecounter based on file type
    if len(options.infiles) > 0:
        if options.infiles[0].endswith(".bam"):
            bamfiles = [pysam.AlignmentFile(x, "rb") for x in options.infiles]

            if options.controlfiles:
                controlfiles = [
                    pysam.AlignmentFile(x, "rb") for x in options.controlfiles
                ]
            else:
                controlfiles = None

            format = "bam"
            if options.merge_pairs:
                range_counter = geneprofile.RangeCounterBAM(
                    bamfiles,
                    shifts=options.shifts,
                    extends=options.extends,
                    merge_pairs=options.merge_pairs,
                    min_insert_size=options.min_insert_size,
                    max_insert_size=options.max_insert_size,
                    controfiles=controlfiles,
                    control_factor=options.control_factor)

            elif options.shifts or options.extends:
                range_counter = geneprofile.RangeCounterBAM(
                    bamfiles,
                    shifts=options.shifts,
                    extends=options.extends,
                    controlfiles=controlfiles,
                    control_factor=options.control_factor)

            elif options.base_accuracy:
                range_counter = geneprofile.RangeCounterBAMBaseAccuracy(
                    bamfiles,
                    controlfiles=controlfiles,
                    control_factor=options.control_factor)
            else:
                range_counter = geneprofile.RangeCounterBAM(
                    bamfiles,
                    controlfiles=controlfiles,
                    control_factor=options.control_factor)

        elif options.infiles[0].endswith(".bed.gz"):
            bedfiles = [pysam.Tabixfile(x) for x in options.infiles]

            if options.controlfiles:
                controlfiles = [
                    pysam.Tabixfile(x) for x in options.controlfiles
                ]
            else:
                controlfiles = None

            range_counter = geneprofile.RangeCounterBed(
                bedfiles,
                controlfiles=controlfiles,
                control_factor=options.control_factor)

        elif options.infiles[0].endswith(".bw"):
            wigfiles = [BigWigFile(file=open(x)) for x in options.infiles]
            range_counter = geneprofile.RangeCounterBigWig(wigfiles)

        else:
            raise NotImplementedError("can't determine file type for %s" %
                                      str(options.infiles))

    counters = []
    for method in options.methods:
        if method == "utrprofile":
            counters.append(
                geneprofile.UTRCounter(
                    range_counter,
                    options.resolution_upstream,
                    options.resolution_upstream_utr,
                    options.resolution_cds,
                    options.resolution_downstream_utr,
                    options.resolution_downstream,
                    options.extension_upstream,
                    options.extension_downstream,
                ))

        elif method == "geneprofile":
            counters.append(
                geneprofile.GeneCounter(
                    range_counter, options.resolution_upstream,
                    options.resolution_cds, options.resolution_downstream,
                    options.extension_upstream, options.extension_downstream,
                    options.scale_flanks))

        elif method == "geneprofilewithintrons":
            counters.append(
                geneprofile.GeneCounterWithIntrons(
                    range_counter, options.resolution_upstream,
                    options.resolution_cds, options.resolution_introns,
                    options.resolution_downstream, options.extension_upstream,
                    options.extension_downstream, options.scale_flanks))

        elif method == "geneprofileabsolutedistancefromthreeprimeend":
            # options.extension_exons_absolute_distance_tostartsite,
            # options.extension_introns_absolute_distance_tostartsite,
            # Tim 31th Aug 2013: a possible feature for future,  if five prime
            # bias is of your interest.
            # (you need to create another class). It is not very difficult to
            # derive from this class, but is not implemented yet
            # This future feature is slightly different the TSS profile
            # already implemented, because in this future feature introns are
            # skipped,
            counters.append(
                geneprofile.GeneCounterAbsoluteDistanceFromThreePrimeEnd(
                    range_counter, options.resolution_upstream,
                    options.resolution_downstream,
                    options.resolution_exons_absolute_distance_topolya,
                    options.resolution_introns_absolute_distance_topolya,
                    options.extension_upstream, options.extension_downstream,
                    options.extension_exons_absolute_distance_topolya,
                    options.extension_introns_absolute_distance_topolya,
                    options.scale_flanks))

        elif method == "tssprofile":
            counters.append(
                geneprofile.TSSCounter(range_counter,
                                       options.extension_outward,
                                       options.extension_inward))

        elif method == "intervalprofile":
            counters.append(
                geneprofile.RegionCounter(range_counter,
                                          options.resolution_upstream,
                                          options.resolution_cds,
                                          options.resolution_downstream,
                                          options.extension_upstream,
                                          options.extension_downstream))

        elif method == "midpointprofile":
            counters.append(
                geneprofile.MidpointCounter(range_counter,
                                            options.resolution_upstream,
                                            options.resolution_downstream,
                                            options.extension_upstream,
                                            options.extension_downstream))

        # add new method to split 1st and last exons out
        # requires a representative transcript for reach gene
        # gtf should be sorted gene-position
        elif method == "separateexonprofile":
            counters.append(
                geneprofile.SeparateExonCounter(
                    range_counter, options.resolution_upstream,
                    options.resolution_first, options.resolution_last,
                    options.resolution_cds, options.resolution_downstream,
                    options.extension_upstream, options.extension_downstream))

        elif method == "separateexonprofilewithintrons":
            counters.append(
                geneprofile.SeparateExonWithIntronCounter(
                    range_counter, options.resolution_upstream,
                    options.resolution_first, options.resolution_last,
                    options.resolution_cds, options.resolution_introns,
                    options.resolution_downstream, options.extension_upstream,
                    options.extension_downstream))

    # set normalization
    for c in counters:
        c.setNormalization(options.transcript_normalization)
        if options.output_all_profiles:
            c.setOutputProfiles(
                iotools.open_file(
                    E.get_output_file(c.name) + ".profiles.tsv.gz", "w"))

    if options.input_filename_counts:
        # read counts from file
        E.info("reading counts from %s" % options.input_filename_counts)
        all_counts = pandas.read_csv(iotools.open_file(
            options.input_filename_counts),
                                     sep='\t',
                                     header=0,
                                     index_col=0)

        if len(counters) != 1:
            raise NotImplementedError(
                'counting from matrix only implemented for 1 counter.')
        # build counter based on reference counter
        counter = geneprofile.UnsegmentedCounter(counters[0])
        counters = [counter]
        geneprofile.countFromCounts(counters, all_counts)

    else:
        E.info("starting counting with %i counters" % len(counters))
        feature_names = geneprofile.countFromGTF(counters, gtf_iterator)

    # output matrices
    if not options.profile_normalizations:
        options.profile_normalizations.append("none")
    elif "all" in options.profile_normalizations:
        options.profile_normalizations = [
            "none", "area", "counts", "background"
        ]

    for method, counter in zip(options.methods, counters):
        profiles = []
        for norm in options.profile_normalizations:
            # build matrix, apply normalization
            profile = counter.getProfile(
                normalize=norm,
                background_region_bins=options.background_region_bins)
            profiles.append(profile)

        for x in range(1, len(profiles)):
            assert profiles[0].shape == profiles[x].shape

        # build a single matrix of all profiles for output
        matrix = numpy.concatenate(profiles)
        matrix.shape = len(profiles), len(profiles[0])
        matrix = matrix.transpose()

        with iotools.open_file(
                E.get_output_file(counter.name) + ".matrix.tsv.gz",
                "w") as outfile:
            outfile.write("bin\tregion\tregion_bin\t%s\n" %
                          "\t".join(options.profile_normalizations))
            fields = []
            bins = []
            for field, nbins in zip(counter.fields, counter.nbins):
                fields.extend([field] * nbins)
                bins.extend(list(range(nbins)))

            for row, cols in enumerate(zip(fields, bins, matrix)):
                outfile.write("%i\t%s\t" %
                              (row, "\t".join([str(x) for x in cols[:-1]])))
                outfile.write("%s\n" % ("\t".join([str(x) for x in cols[-1]])))

        with iotools.open_file(
                E.get_output_file(counter.name) + ".lengths.tsv.gz",
                "w") as outfile:
            counter.writeLengthStats(outfile)

        if options.output_all_profiles:
            counter.closeOutputProfiles()

    if options.plot:

        import matplotlib
        # avoid Tk or any X
        matplotlib.use("Agg")
        import matplotlib.pyplot as plt

        for method, counter in zip(options.methods, counters):

            if method in ("geneprofile", "geneprofilewithintrons",
                          "geneprofileabsolutedistancefromthreeprimeend",
                          "utrprofile", "intervalprofile",
                          "separateexonprofile",
                          "separateexonprofilewithintrons"):

                plt.figure()
                plt.subplots_adjust(wspace=0.05)
                max_scale = max([max(x) for x in counter.aggregate_counts])

                for x, counts in enumerate(counter.aggregate_counts):
                    plt.subplot(6, 1, x + 1)
                    plt.plot(list(range(len(counts))), counts)
                    plt.title(counter.fields[x])
                    plt.ylim(0, max_scale)

                figname = counter.name + ".full"

                fn = E.get_output_file(figname) + "." + options.image_format
                plt.savefig(os.path.expanduser(fn),
                            format=options.image_format,
                            dpi=options.resolution_images)

                plt.figure()

                points = []
                cuts = []
                for x, counts in enumerate(counter.aggregate_counts):
                    points.extend(counts)
                    cuts.append(len(counts))

                plt.plot(list(range(len(points))), points)

                xx, xxx = 0, []
                for x in cuts:
                    xxx.append(xx + x // 2)
                    xx += x
                    plt.axvline(xx, color="r", ls="--")

                plt.xticks(xxx, counter.fields)

                figname = counter.name + ".detail"

                fn = E.get_output_file(figname) + "." + options.image_format
                plt.savefig(os.path.expanduser(fn),
                            format=options.image_format,
                            dpi=options.resolution_images)

            elif method == "tssprofile":

                plt.figure()
                plt.subplot(1, 3, 1)
                plt.plot(
                    list(
                        range(-options.extension_outward,
                              options.extension_inward)),
                    counter.aggregate_counts[0])
                plt.title(counter.fields[0])
                plt.subplot(1, 3, 2)
                plt.plot(
                    list(
                        range(-options.extension_inward,
                              options.extension_outward)),
                    counter.aggregate_counts[1])
                plt.title(counter.fields[1])
                plt.subplot(1, 3, 3)
                plt.title("combined")
                plt.plot(
                    list(
                        range(-options.extension_outward,
                              options.extension_inward)),
                    counter.aggregate_counts[0])
                plt.plot(
                    list(
                        range(-options.extension_inward,
                              options.extension_outward)),
                    counter.aggregate_counts[1])
                plt.legend(counter.fields[:2])

                fn = E.get_output_file(
                    counter.name) + "." + options.image_format
                plt.savefig(os.path.expanduser(fn),
                            format=options.image_format,
                            dpi=options.resolution_images)

            elif method == "midpointprofile":

                plt.figure()
                plt.plot(numpy.arange(-options.resolution_upstream, 0),
                         counter.aggregate_counts[0])
                plt.plot(numpy.arange(0, options.resolution_downstream),
                         counter.aggregate_counts[1])

                fn = E.get_output_file(
                    counter.name) + "." + options.image_format
                plt.savefig(os.path.expanduser(fn),
                            format=options.image_format,
                            dpi=options.resolution_images)

    # write footer and output benchmark information.
    E.stop()