コード例 #1
0
def toSequence(chunk, fasta):
    """convert a list of gff attributes to a single sequence.

    This function ensures correct in-order concatenation on
    positive/negative strand. Overlapping regions are merged.
    """
    if len(chunk) == 0:
        return ""

    contig, strand = chunk[0].contig, chunk[0].strand

    for gff in chunk:
        assert gff.strand == strand, "features on different strands."
        assert gff.contig == contig, "features on different contigs."

    intervals = Intervals.combine([(x.start, x.end) for x in chunk])
    lcontig = fasta.getLength(contig)
    positive = Genomics.IsPositiveStrand(strand)

    if not positive:
        intervals = [(lcontig - end, lcontig - start)
                     for start, end in intervals]
        intervals.reverse()

    s = [
        fasta.getSequence(contig, strand, start, end)
        for start, end in intervals
    ]

    return "".join(s)
コード例 #2
0
ファイル: GTF.py プロジェクト: prasoonnema/cgat
def toSequence(chunk, fasta):
    """convert a list of gff attributes to a single sequence.

    This function ensures correct in-order concatenation on
    positive/negative strand. Overlapping regions are merged.
    """
    if len(chunk) == 0:
        return ""

    contig, strand = chunk[0].contig, chunk[0].strand

    for gff in chunk:
        assert gff.strand == strand, "features on different strands."
        assert gff.contig == contig, "features on different contigs."

    intervals = Intervals.combine([(x.start, x.end) for x in chunk])
    lcontig = fasta.getLength(contig)
    positive = Genomics.IsPositiveStrand(strand)

    if not positive:
        intervals = [(lcontig - end, lcontig - start) for start, end in intervals]
        intervals.reverse()

    s = [fasta.getSequence(contig, strand, start, end) for start, end in intervals]

    return "".join(s)
コード例 #3
0
ファイル: beds2beds.py プロジェクト: wangdi2014/cgat
def combineMergedIntervals(bedfiles):
    '''combine intervals in a collection of bed files.

    Overlapping intervals between tracks are merged.

    Algorithm:

    1. collect all intervals in all tracks into a single track
    2. merge overlapping intervals
    3. report all intervals that overlap with an interval in each track.

    '''

    # get all intervals
    data_per_contig = collections.defaultdict(list)

    for bedfile in bedfiles:
        for contig in bedfile.contigs:
            i = []
            for bed in bedfile.fetch(contig, parser=pysam.asBed()):
                i.append((bed.start, bed.end))
            data_per_contig[contig].extend(i)

    # merge intervals
    for contig in list(data_per_contig.keys()):
        data_per_contig[contig] = Intervals.combine(data_per_contig[contig])

    # filter intervals - take only those present in all bedfiles
    for contig, data in sorted(data_per_contig.items()):
        for start, end in data:
            if isContainedInAll(contig, start, end, bedfiles):
                yield contig, start, end
コード例 #4
0
ファイル: beds2beds.py プロジェクト: Charlie-George/cgat
def combineMergedIntervals(bedfiles):
    '''combine intervals in a collection of bed files.

    Overlapping intervals between tracks are merged.

    Algorithm:

    1. collect all intervals in all tracks into a single track
    2. merge overlapping intervals 
    3. report all intervals that overlap with an interval in each track.

    '''

    # get all intervals
    data_per_contig = collections.defaultdict(list)
    for bedfile in bedfiles:
        for contig in bedfile.contigs:
            i = []
            for bed in bedfile.fetch(contig, parser=pysam.asBed()):
                i.append((bed.start, bed.end))
            data_per_contig[contig].extend(i)

    # merge intervals
    for contig in data_per_contig.keys():
        data_per_contig[contig] = Intervals.combine(data_per_contig[contig])

    # filter intervals - take only those present in all bedfiles
    for contig, data in data_per_contig.iteritems():
        for start, end in data:
            if isContainedInAll(contig, start, end, bedfiles):
                yield contig, start, end
コード例 #5
0
ファイル: gtf2gff.py プロジェクト: yangjl/cgat
def annotateRegulons( iterator, fasta, tss, options ):
    """annotate regulons within iterator.

    Entries specied with ``--restrict-source`` are annotated.
    """

    gene_iterator = GTF.gene_iterator( iterator )

    ngenes, ntranscripts, nregulons = 0, 0, 0

    upstream, downstream = options.upstream, options.downstream

    for gene in gene_iterator:
        ngenes += 1
        is_negative_strand = Genomics.IsNegativeStrand( gene[0][0].strand )
        lcontig = fasta.getLength( gene[0][0].contig )
        regulons = []
        transcript_ids = []
        for transcript in gene:

            ntranscripts += 1
            mi, ma = min( [x.start for x in transcript ] ), max( [x.end for x in transcript ] )
            if tss:
                # add range to both sides of tss
                if is_negative_strand:
                    interval = ma - options.downstream, ma + options.upstream
                else:
                    interval = mi - options.upstream, mi + options.downstream
            else:
                # add range to both sides of tts
                if is_negative_strand:
                    interval = mi - options.downstream, mi + options.upstream
                else:
                    interval = ma - options.upstream, ma + options.downstream

            interval = ( min( lcontig, max( 0, interval[0] ) ),
                         min( lcontig, max( 0, interval[1] ) ) )
            
            regulons.append( interval )
            transcript_ids.append( transcript[0].transcript_id )

        if options.merge_promotors:
            # merge the regulons (and rename - as sort order might have changed)
            regulons = Intervals.combine( regulons )
            transcript_ids = ["%i" % (x+1) for x in range(len(regulons) )]
            
        gtf = GTF.Entry()
        gtf.fromGTF( gene[0][0], gene[0][0].gene_id, gene[0][0].gene_id )
        gtf.source = "regulon"

        x = 0
        for start, end in regulons:
            gtf.start, gtf.end = start, end
            gtf.transcript_id = transcript_ids[x]
            options.stdout.write( "%s\n" % str(gtf) )
            nregulons += 1
            x += 1

    E.info( "ngenes=%i, ntranscripts=%i, nregulons=%i" % (ngenes, ntranscripts, nregulons) )
コード例 #6
0
ファイル: gtf2gff.py プロジェクト: CGATOxford/cgat
def annotateRegulons(iterator, fasta, tss, options):
    """annotate regulons within iterator.

    Entries specied with ``--restrict-source`` are annotated.
    """

    gene_iterator = GTF.gene_iterator(iterator)

    ngenes, ntranscripts, nregulons = 0, 0, 0

    upstream, downstream = options.upstream, options.downstream

    for gene in gene_iterator:
        ngenes += 1
        is_negative_strand = Genomics.IsNegativeStrand(gene[0][0].strand)
        lcontig = fasta.getLength(gene[0][0].contig)
        regulons = []
        transcript_ids = []
        for transcript in gene:

            ntranscripts += 1
            mi, ma = min([x.start for x in transcript]), max([x.end for x in transcript])
            if tss:
                # add range to both sides of tss
                if is_negative_strand:
                    interval = ma - options.downstream, ma + options.upstream
                else:
                    interval = mi - options.upstream, mi + options.downstream
            else:
                # add range to both sides of tts
                if is_negative_strand:
                    interval = mi - options.downstream, mi + options.upstream
                else:
                    interval = ma - options.upstream, ma + options.downstream

            interval = (min(lcontig, max(0, interval[0])), min(lcontig, max(0, interval[1])))

            regulons.append(interval)
            transcript_ids.append(transcript[0].transcript_id)

        if options.merge_promotors:
            # merge the regulons (and rename - as sort order might have
            # changed)
            regulons = Intervals.combine(regulons)
            transcript_ids = ["%i" % (x + 1) for x in range(len(regulons))]

        gtf = GTF.Entry()
        gtf.fromGTF(gene[0][0], gene[0][0].gene_id, gene[0][0].gene_id)
        gtf.source = "regulon"

        x = 0
        for start, end in regulons:
            gtf.start, gtf.end = start, end
            gtf.transcript_id = transcript_ids[x]
            options.stdout.write("%s\n" % str(gtf))
            nregulons += 1
            x += 1

    E.info("ngenes=%i, ntranscripts=%i, nregulons=%i" % (ngenes, ntranscripts, nregulons))
コード例 #7
0
def iterator_min_feature_length(gff_iterator, min_length, feature="exon"):
    """select only those genes with a minimum length of a given feature."""
    for gffs in gff_iterator:
        intervals = [(x.start, x.end) for x in gffs if x.feature == feature]
        intervals = Intervals.combine(intervals)
        t = sum((x[1] - x[0] for x in intervals))
        if t >= min_length:
            yield gffs
コード例 #8
0
ファイル: GTF.py プロジェクト: prasoonnema/cgat
def iterator_min_feature_length(gff_iterator, min_length, feature="exon"):
    """select only those genes with a minimum length of a given feature."""
    for gffs in gff_iterator:
        intervals = [(x.start, x.end) for x in gffs if x.feature == feature]
        intervals = Intervals.combine(intervals)
        t = sum((x[1] - x[0] for x in intervals))
        if t >= min_length:
            yield gffs
コード例 #9
0
def get_windows(pvalues, window_size, threshold):

    # intervals are close closed
    windows = [(pos-window_size, pos+window_size+1)
               for pos in pvalues.index.values]

    merged_windows = Intervals.combine(windows)
    windows_min_p = [pvalues.ix[float(start):float(end-1)].min()
                    for start, end in merged_windows]
    return zip(merged_windows, windows_min_p)
コード例 #10
0
ファイル: gtf2gff.py プロジェクト: gsc0107/cgat
def annotateExons(iterator, fasta, options):
    """annotate exons within iterator."""

    gene_iterator = GTF.gene_iterator(iterator)

    ninput, noutput, noverlapping = 0, 0, 0

    for this in gene_iterator:
        ninput += 1
        intervals = collections.defaultdict(list)
        ntranscripts = len(this)

        is_negative_strand = Genomics.IsNegativeStrand(this[0][0].strand)

        for exons in this:
            # make sure these are sorted correctly
            exons.sort(key=lambda x: x.start)
            if is_negative_strand:
                exons.reverse()

            nexons = len(exons)
            for i, e in enumerate(exons):
                intervals[(e.start, e.end)].append((i + 1, nexons))

        gtf = GTF.Entry()
        gtf.fromGTF(this[0][0], this[0][0].gene_id, this[0][0].gene_id)
        gtf.addAttribute("ntranscripts", ntranscripts)

        gtfs = []
        for r, pos in intervals.items():

            g = GTF.Entry().copy(gtf)
            g.start, g.end = r
            g.addAttribute("nused", len(pos))
            g.addAttribute("pos", ",".join(["%i:%i" % x for x in pos]))
            gtfs.append(g)

        gtfs.sort(key=lambda x: x.start)

        for g in gtfs:
            options.stdout.write("%s\n" % str(g))

        # check for exon overlap
        intervals = [(g.start, g.end) for g in gtfs]
        nbefore = len(intervals)
        nafter = len(Intervals.combine(intervals))
        if nafter != nbefore:
            noverlapping += 1

        noutput += 1

    if options.loglevel >= 1:
        options.stdlog.write(
            "# ninput=%i, noutput=%i, noverlapping=%i\n" % (ninput, noutput, noverlapping))
コード例 #11
0
ファイル: gtf2gff.py プロジェクト: gsc0107/cgat
def annotateTTS(iterator, fasta, options):
    """annotate termination sites within iterator.

    Entries specified with ``--restrict-source are annotated``.
    """

    gene_iterator = GTF.gene_iterator(iterator)

    ngenes, ntranscripts, npromotors = 0, 0, 0

    for gene in gene_iterator:
        ngenes += 1
        is_negative_strand = Genomics.IsNegativeStrand(gene[0][0].strand)
        lcontig = fasta.getLength(gene[0][0].contig)
        tts = []
        transcript_ids = []
        for transcript in gene:

            ntranscripts += 1
            mi, ma = min([x.start for x in transcript]), max(
                [x.end for x in transcript])
            transcript_ids.append(transcript[0].transcript_id)
            # if tts is directly at start/end of contig, the tss will
            # be within an exon.  otherwise, it is outside an exon.
            if is_negative_strand:
                tts.append(
                    (max(0, mi - options.promotor), max(options.promotor, mi)))
            else:
                tts.append(
                    (min(ma, lcontig - options.promotor),
                     min(lcontig, ma + options.promotor)))

        if options.merge_promotors:
            # merge the promotors (and rename - as sort order might have
            # changed)
            tts = Intervals.combine(tts)
            transcript_ids = ["%i" % (x + 1) for x in range(len(tts))]

        gtf = GTF.Entry()
        gtf.fromGTF(gene[0][0], gene[0][0].gene_id, gene[0][0].gene_id)
        gtf.source = "tts"

        x = 0
        for start, end in tts:
            gtf.start, gtf.end = start, end
            gtf.transcript_id = transcript_ids[x]
            options.stdout.write("%s\n" % str(gtf))
            npromotors += 1
            x += 1

    if options.loglevel >= 1:
        options.stdlog.write(
            "# ngenes=%i, ntranscripts=%i, ntss=%i\n" %
            (ngenes, ntranscripts, npromotors))
コード例 #12
0
ファイル: gtf2gff.py プロジェクト: Charlie-George/cgat
def annotateExons(iterator, fasta, options):
    """annotate exons within iterator."""

    gene_iterator = GTF.gene_iterator(iterator)

    ninput, noutput, noverlapping = 0, 0, 0

    for this in gene_iterator:
        ninput += 1
        intervals = collections.defaultdict(list)
        ntranscripts = len(this)

        is_negative_strand = Genomics.IsNegativeStrand(this[0][0].strand)

        for exons in this:
            # make sure these are sorted correctly
            exons.sort(key=lambda x: x.start)
            if is_negative_strand:
                exons.reverse()

            nexons = len(exons)
            for i, e in enumerate(exons):
                intervals[(e.start, e.end)].append((i + 1, nexons))

        gtf = GTF.Entry()
        gtf.fromGTF(this[0][0], this[0][0].gene_id, this[0][0].gene_id)
        gtf.addAttribute("ntranscripts", ntranscripts)

        gtfs = []
        for r, pos in intervals.iteritems():

            g = GTF.Entry().copy(gtf)
            g.start, g.end = r
            g.addAttribute("nused", len(pos))
            g.addAttribute("pos", ",".join(["%i:%i" % x for x in pos]))
            gtfs.append(g)

        gtfs.sort(key=lambda x: x.start)

        for g in gtfs:
            options.stdout.write("%s\n" % str(g))

        # check for exon overlap
        intervals = [(g.start, g.end) for g in gtfs]
        nbefore = len(intervals)
        nafter = len(Intervals.combine(intervals))
        if nafter != nbefore:
            noverlapping += 1

        noutput += 1

    if options.loglevel >= 1:
        options.stdlog.write(
            "# ninput=%i, noutput=%i, noverlapping=%i\n" % (ninput, noutput, noverlapping))
コード例 #13
0
ファイル: gtf2gff.py プロジェクト: Charlie-George/cgat
def annotateTTS(iterator, fasta, options):
    """annotate termination sites within iterator.

    Entries specified with ``--restrict-source are annotated``.
    """

    gene_iterator = GTF.gene_iterator(iterator)

    ngenes, ntranscripts, npromotors = 0, 0, 0

    for gene in gene_iterator:
        ngenes += 1
        is_negative_strand = Genomics.IsNegativeStrand(gene[0][0].strand)
        lcontig = fasta.getLength(gene[0][0].contig)
        tts = []
        transcript_ids = []
        for transcript in gene:

            ntranscripts += 1
            mi, ma = min([x.start for x in transcript]), max(
                [x.end for x in transcript])
            transcript_ids.append(transcript[0].transcript_id)
            # if tts is directly at start/end of contig, the tss will
            # be within an exon.  otherwise, it is outside an exon.
            if is_negative_strand:
                tts.append(
                    (max(0, mi - options.promotor), max(options.promotor, mi)))
            else:
                tts.append(
                    (min(ma, lcontig - options.promotor),
                     min(lcontig, ma + options.promotor)))

        if options.merge_promotors:
            # merge the promotors (and rename - as sort order might have
            # changed)
            tts = Intervals.combine(tts)
            transcript_ids = ["%i" % (x + 1) for x in range(len(tts))]

        gtf = GTF.Entry()
        gtf.fromGTF(gene[0][0], gene[0][0].gene_id, gene[0][0].gene_id)
        gtf.source = "tts"

        x = 0
        for start, end in tts:
            gtf.start, gtf.end = start, end
            gtf.transcript_id = transcript_ids[x]
            options.stdout.write("%s\n" % str(gtf))
            npromotors += 1
            x += 1

    if options.loglevel >= 1:
        options.stdlog.write(
            "# ngenes=%i, ntranscripts=%i, ntss=%i\n" %
            (ngenes, ntranscripts, npromotors))
コード例 #14
0
def get_windows(pvalues, window_size, threshold):

    # intervals are close closed
    windows = [(pos - window_size, pos + window_size + 1)
               for pos in pvalues.index.values]

    merged_windows = Intervals.combine(windows)
    windows_min_p = [
        pvalues.ix[float(start):float(end - 1)].min()
        for start, end in merged_windows
    ]
    return zip(merged_windows, windows_min_p)
コード例 #15
0
ファイル: bed2table.py プロジェクト: siping/cgat
    def count( self, bed ):
        '''update internal counts.'''

        results = []
        for track in self.tracks:
            try:
                overlaps = [ (x[0],x[1]) for x in self.index[track][bed.contig].find( bed.start, bed.end ) ]
            except KeyError:
                overlaps = []

            results.append( (len(overlaps), 
                             Intervals.calculateOverlap( [(bed.start, bed.end),],
                                                         Intervals.combine( overlaps ) ) ) )

        self.data = results
コード例 #16
0
    def count( self, bed ):
        '''update internal counts.'''

        results = []
        for track in self.tracks:
            try:
                overlaps = [ (x[0],x[1]) for x in self.index[track][bed.contig].find( bed.start, bed.end ) ]
            except KeyError:
                overlaps = []

            results.append( (len(overlaps), 
                             Intervals.calculateOverlap( [(bed.start, bed.end),],
                                                         Intervals.combine( overlaps ) ) ) )

        self.data = results
コード例 #17
0
ファイル: GTF.py プロジェクト: CGATOxford/cgat
def toIntronIntervals(chunk):
    """convert a set of gtf elements within a transcript to intron coordinates.

    Will use first transcript_id found.

    Note that coordinates will still be forward strand coordinates
    """
    if len(chunk) == 0:
        return []
    contig, strand, transcript_id = (chunk[0].contig, chunk[0].strand, chunk[0].transcript_id)
    for gff in chunk:
        assert gff.strand == strand, "features on different strands."
        assert gff.contig == contig, "features on different contigs."

    intervals = Intervals.combine([(x.start, x.end) for x in chunk if x.feature == "exon"])
    return Intervals.complement(intervals)
コード例 #18
0
def findRetainedIntrons(infile, outfile):

    outf = IOTools.openFile(outfile, "w")

    for gene in GTF.gene_iterator(GTF.iterator(IOTools.openFile(infile))):

        gene_out = []
        introns_out = []

        # now find if any of the transcripts are retained intron
        # versions of any of the others
        for first, second in itertools.product(gene, gene):

            first = sorted(
                [entry for entry in first if entry.feature == "exon"],
                key=lambda x: x.start)
            second = sorted(
                [entry for entry in second if entry.feature == "exon"],
                key=lambda x: x.start)

            first_introns = set(GTF.toIntronIntervals(first))
            second_introns = set(GTF.toIntronIntervals(second))

            if len(first_introns-second_introns) > 0 and \
               len(second_introns-first_introns) == 0:
                novel_introns = list(first_introns - second_introns)

                def _filterIntron(intron):
                    return intron[0] > second[0].start and \
                        intron[1] < second[-1].end

                novel_introns = filter(_filterIntron, novel_introns)

                if len(novel_introns) > 0:
                    gene_out.extend(first)

                for intron in novel_introns:
                    introns_out.append(intron)

        introns_out = Intervals.combine(introns_out)
        template = gene[0][0]
        template.feature = "exon"
        for gff in introns_out:
            entry = GTF.Entry().copy(template)
            entry.start = gff[0]
            entry.end = gff[1]
            outf.write("%s\n" % str(entry))
コード例 #19
0
ファイル: GTF.py プロジェクト: prasoonnema/cgat
def asRanges(gffs, feature=None):
    """return ranges within a set of gffs.

    Overlapping intervals are merged.

    The returned intervals are sorted.
    """

    if isinstance(feature, basestring):
        gg = filter(lambda x: x.feature == feature, gffs)
    elif feature:
        gg = filter(lambda x: x.feature in feature, gffs)
    else:
        gg = gffs[:]

    r = [(g.start, g.end) for g in gg]
    return Intervals.combine(r)
コード例 #20
0
ファイル: GTF.py プロジェクト: CGATOxford/cgat
def asRanges(gffs, feature=None):
    """return ranges within a set of gffs.

    Overlapping intervals are merged.

    The returned intervals are sorted.
    """

    if isinstance(feature, str):
        gg = [x for x in gffs if x.feature == feature]
    elif feature:
        gg = [x for x in gffs if x.feature in feature]
    else:
        gg = gffs[:]

    r = [(g.start, g.end) for g in gg]
    return Intervals.combine(r)
コード例 #21
0
def asRanges(gffs, feature=None):
    """return ranges within a set of gffs.

    Overlapping intervals are merged.

    The returned intervals are sorted.
    """

    if isinstance(feature, str):
        gg = [x for x in gffs if x.feature == feature]
    elif feature:
        gg = [x for x in gffs if x.feature in feature]
    else:
        gg = gffs[:]

    r = [(g.start, g.end) for g in gg]
    return Intervals.combine(r)
コード例 #22
0
ファイル: GTF.py プロジェクト: santayana/cgat
def asRanges(gffs, feature=None):
    """return ranges within a set of gffs.

    Overlapping intervals are merged.

    The returned intervals are sorted.
    """

    if isinstance(feature, basestring):
        gg = filter(lambda x: x.feature == feature, gffs)
    elif feature:
        gg = filter(lambda x: x.feature in feature, gffs)
    else:
        gg = gffs[:]

    r = [(g.start, g.end) for g in gg]
    return Intervals.combine(r)
コード例 #23
0
def toIntronIntervals(chunk):
    '''convert a set of gtf elements within a transcript to intron coordinates.

    Will use first transcript_id found.

    Note that coordinates will still be forward strand coordinates
    '''
    if len(chunk) == 0:
        return []
    contig, strand, transcript_id = (chunk[0].contig, chunk[0].strand,
                                     chunk[0].transcript_id)
    for gff in chunk:
        assert gff.strand == strand, "features on different strands."
        assert gff.contig == contig, "features on different contigs."

    intervals = Intervals.combine([(x.start, x.end) for x in chunk
                                   if x.feature == "exon"])
    return Intervals.complement(intervals)
コード例 #24
0
ファイル: GTF.py プロジェクト: Charlie-George/cgat
def toIntronIntervals(chunk):
    '''convert a set of gtf elements within a transcript to intron coordinates.

    Will raise an error if more than one transcript is submitted.

    Note that coordinates will still be forward strand coordinates
    '''
    if len(chunk) == 0:
        return []
    contig, strand, transcript_id = chunk[
        0].contig, chunk[0].strand, chunk[0].transcript_id
    for gff in chunk:
        assert gff.strand == strand, "features on different strands."
        assert gff.contig == contig, "features on different contigs."
        assert gff.transcript_id == transcript_id, "more than one transcript submitted"

    intervals = Intervals.combine([(x.start, x.end)
                                   for x in chunk if x.feature == "exon"])
    return Intervals.complement(intervals)
コード例 #25
0
ファイル: gff2stats.py プロジェクト: jmadzo/cgat
    def __str__(self):

        single_exon_transcripts = 0
        exons_per_transcript = []
        intron_sizes = []
        transcript_lengths = []
        exon_sizes = []

        for x in self.counts_exons_per_transcript.values():

            x.sort()
            x = Intervals.combine(x)
            transcript_lengths.append(x[-1][1] - x[0][0])

            exons_per_transcript.append(len(x))

            for start, end in x:
                exon_sizes.append(end - start)

            if len(x) == 1:
                single_exon_transcripts += 1
                continue

            last_end = x[0][1]
            for start, end in x[1:]:
                intron_sizes.append(start - last_end)
                last_end = end

        return "\t".join(
            map(
                str,
                (
                    len(self.counts_gene_ids),
                    len(self.counts_transcript_ids),
                    single_exon_transcripts,
                    Stats.Summary(exons_per_transcript),
                    Stats.Summary(exon_sizes),
                    Stats.Summary(intron_sizes),
                    Stats.Summary(transcript_lengths),
                ),
            )
        )
コード例 #26
0
ファイル: gtf2gtf.py プロジェクト: Q-KIM/cgat
def find_retained_introns(gene):
    '''Given a bundle of transcripts, find intervals matching retained
    introns. A retained intron is defined as an interval from an exon/intron
    boundary to the next where both boundaries are in the same exon of another
    transcript'''

    intron_intervals = [GTF.toIntronIntervals(transcript)
                        for transcript in gene]
    intron_intervals = list(set(
        itertools.chain.from_iterable(intron_intervals)))
    intron_intervals.sort()

    for transcript in gene:
        exons = iter(sorted(GTF.asRanges(transcript)))
        introns = iter(intron_intervals)
        retained_introns = []
        try:
            intron = introns.next()
            exon = exons.next()
            while True:

                if exon[1] < intron[0]:

                    exon = exons.next()
                    continue

                if intron[0] >= exon[0] and intron[1] <= exon[1]:
                    E.debug("exon %s of transcript %s contains intron %s" %
                            (exon, transcript[0].transcript_id, intron))
                    retained_introns.append(intron)
                intron = introns.next()
        except StopIteration:
            pass

        retained_introns = Intervals.combine(retained_introns)

        for intron in retained_introns:
            entry = GTF.Entry()
            entry = entry.copy(transcript[0])
            entry.start = intron[0]
            entry.end = intron[1]
            yield entry
コード例 #27
0
    def __str__(self):

        single_exon_transcripts = 0
        exons_per_transcript = []
        intron_sizes = []
        transcript_lengths = []
        exon_sizes = []

        for x in self.counts_exons_per_transcript.values():

            x.sort()
            x = Intervals.combine(x)
            transcript_lengths.append(x[-1][1] - x[0][0])

            exons_per_transcript.append(len(x))

            for start, end in x:
                exon_sizes.append(end - start)

            if len(x) == 1:
                single_exon_transcripts += 1
                continue

            last_end = x[0][1]
            for start, end in x[1:]:
                intron_sizes.append(start - last_end)
                last_end = end

        return "\t".join(
            map(str, (
                len(self.counts_gene_ids),
                len(self.counts_transcript_ids),
                single_exon_transcripts,
                Stats.Summary(exons_per_transcript),
                Stats.Summary(exon_sizes),
                Stats.Summary(intron_sizes),
                Stats.Summary(transcript_lengths),
            )))
コード例 #28
0
ファイル: annotator_distance.py プロジェクト: lesheng/cgat
def readWorkspace(infile,
                  workspace_builder="raw",
                  label="none",
                  map_id2annotation={}):
    """read workspace from infile.

    A workspace is a collection of intervals with two labels associated
    to each interval, one for the 5' and one for the 3' end.

    Available workspace builders are:

    gff
       take a gff file. 

    gtf-intergenic
       build workspace from intergenic segments in a gtf file. 

    gtf-intronic
       build workspace from intronic segments in a gtf file

    gtf-genic
       the workspace is built from genes (first to last exon).

    Available labels are:

    none
       no labels are given to the ends of workspaces

    direction
       labels are given based on the 5'/3' end of the
       bounding exon

    annotation
       labels are given based on a gene2annotation map.

    returns a list of segments for each contig in a dictionary
    """

    if label == "none":
        label_f = lambda x, y: (("X",), ("X",))
        info_f = lambda x: None
    elif label == "direction":
        label_f = lambda x, y: ((("5", "3")[x],), (("3", "5")[y],))
        info_f = lambda x: x.strand == "+"
    elif label == "annotation":
        label_f = lambda x, y: (map_id2annotation[x], map_id2annotation[y])
        info_f = lambda x: x.gene_id

    if workspace_builder == "gff":
        workspace = GTF.readAsIntervals(GFF.iterator(infile))

    elif workspace_builder == "gtf-intergenic":

        workspace = collections.defaultdict(list)
        # get all genes
        for e in GTF.merged_gene_iterator(GTF.iterator(infile)):
            workspace[e.contig].append((e.start, e.end, info_f(e)))

        # convert to intergenic regions.
        # overlapping genes are merged and the labels
        # of the right-most entry is retained
        for contig in workspace.keys():
            segs = workspace[contig]
            segs.sort()
            last = segs[0]
            new_segs = []
            for this in segs[1:]:
                if last[1] >= this[0]:
                    if this[1] > last[1]:
                        last = (last[0], this[1], this[2])
                    continue
                assert last[1] < this[0], "this=%s, last=%s" % (this, last)

                new_segs.append((last[1], this[0],
                                 label_f(last[2], this[2])))
                last = this
            workspace[contig] = new_segs

    elif workspace_builder == "gtf-intronic":

        workspace = collections.defaultdict(list)

        # the current procedure will count nested genes
        # twice
        for ee in GTF.flat_gene_iterator(GTF.iterator(infile)):

            exons = Intervals.combine([(e.start, e.end) for e in ee])
            introns = Intervals.complement(exons)

            r = ee[0]
            for start, end in introns:
                workspace[r.contig].append((start,
                                            end,
                                            label_f(info_f(r), info_f(r))
                                            ))
    elif workspace_builder == "gtf-genic":

        workspace = collections.defaultdict(list)

        # the current procedure will count nested genes
        # twice
        for ee in GTF.flat_gene_iterator(GTF.iterator(infile)):

            exons = Intervals.combine([(e.start, e.end) for e in ee])
            start, end = exons[0][0], exons[-1][1]
            r = ee[0]
            workspace[r.contig].append((start,
                                        end,
                                        label_f(info_f(r), info_f(r))
                                        ))

    else:
        raise ValueError("unknown workspace_builder %s" % workspace_builder)

    return workspace
コード例 #29
0
ファイル: gff2fasta.py プロジェクト: gsc0107/cgat
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("--is-gtf",
                      dest="is_gtf",
                      action="store_true",
                      help="input is gtf instead of gff.")

    parser.add_option("-g",
                      "--genome-file",
                      dest="genome_file",
                      type="string",
                      help="filename with genome [default=%default].")

    parser.add_option("-m",
                      "--merge-adjacent",
                      dest="merge",
                      action="store_true",
                      help="merge adjacent intervals with the same attributes."
                      " [default=%default]")

    parser.add_option("-e",
                      "--feature",
                      dest="feature",
                      type="string",
                      help="filter by a feature, for example 'exon', 'CDS'."
                      " If set to the empty string, all entries are output "
                      "[%default].")

    parser.add_option("-f",
                      "--maskregions-bed-file",
                      dest="filename_masks",
                      type="string",
                      metavar="gff",
                      help="mask sequences with regions given in gff file "
                      "[%default].")

    parser.add_option("--remove-masked-regions",
                      dest="remove_masked_regions",
                      action="store_true",
                      help="remove regions instead of masking [%default].")

    parser.add_option("--min-interval-length",
                      dest="min_length",
                      type="int",
                      help="set minimum length for sequences output "
                      "[%default]")

    parser.add_option("--max-length",
                      dest="max_length",
                      type="int",
                      help="set maximum length for sequences output "
                      "[%default]")

    parser.add_option("--extend-at",
                      dest="extend_at",
                      type="choice",
                      choices=("none", "3", "5", "both", "3only", "5only"),
                      help="extend at no end, 3', 5' or both ends. If "
                      "3only or 5only are set, only the added sequence "
                      "is returned [default=%default]")

    parser.add_option("--extend-by",
                      dest="extend_by",
                      type="int",
                      help="extend by # bases [default=%default]")

    parser.add_option("--extend-with",
                      dest="extend_with",
                      type="string",
                      help="extend using base [default=%default]")

    parser.add_option("--masker",
                      dest="masker",
                      type="choice",
                      choices=("dust", "dustmasker", "softmask", "none"),
                      help="apply masker [%default].")

    parser.add_option("--fold-at",
                      dest="fold_at",
                      type="int",
                      help="fold sequence every n bases[%default].")

    parser.add_option(
        "--fasta-name-attribute",
        dest="naming_attribute",
        type="string",
        help="use attribute to name fasta entry. Currently only compatable"
        " with gff format [%default].")

    parser.set_defaults(is_gtf=False,
                        genome_file=None,
                        merge=False,
                        feature=None,
                        filename_masks=None,
                        remove_masked_regions=False,
                        min_length=0,
                        max_length=0,
                        extend_at=None,
                        extend_by=100,
                        extend_with=None,
                        masker=None,
                        fold_at=None,
                        naming_attribute=False)

    (options, args) = E.Start(parser)

    if options.genome_file:
        fasta = IndexedFasta.IndexedFasta(options.genome_file)
        contigs = fasta.getContigSizes()

    if options.is_gtf:
        iterator = GTF.transcript_iterator(GTF.iterator(options.stdin))
    else:
        gffs = GTF.iterator(options.stdin)
        if options.merge:
            iterator = GTF.joined_iterator(gffs)
        else:
            iterator = GTF.chunk_iterator(gffs)

    masks = None
    if options.filename_masks:
        masks = {}
        with IOTools.openFile(options.filename_masks, "r") as infile:
            e = GTF.readAsIntervals(GTF.iterator(infile))

        # convert intervals to intersectors
        for contig in list(e.keys()):
            intersector = bx.intervals.intersection.Intersecter()
            for start, end in e[contig]:
                intersector.add_interval(bx.intervals.Interval(start, end))
            masks[contig] = intersector

    ninput, noutput, nmasked, nskipped_masked = 0, 0, 0, 0
    nskipped_length = 0
    nskipped_noexons = 0

    feature = options.feature

    # iterator is a list containing groups (lists) of features.
    # Each group of features have in common the same transcript ID, in case of
    # GTF files.
    for ichunk in iterator:

        ninput += 1

        if feature:
            chunk = [x for x in ichunk if x.feature == feature]
        else:
            chunk = ichunk

        if len(chunk) == 0:
            nskipped_noexons += 1
            E.info("no features in entry from "
                   "%s:%i..%i - %s" % (ichunk[0].contig, ichunk[0].start,
                                       ichunk[0].end, str(ichunk[0])))
            continue

        contig, strand = chunk[0].contig, chunk[0].strand
        if options.is_gtf:
            name = chunk[0].transcript_id
        else:
            if options.naming_attribute:
                attr_dict = {
                    x.split("=")[0]: x.split("=")[1]
                    for x in chunk[0].attributes.split(";")
                }
                name = attr_dict[options.naming_attribute]
            else:
                name = str(chunk[0].attributes)

        lcontig = contigs[contig]
        positive = Genomics.IsPositiveStrand(strand)
        intervals = [(x.start, x.end) for x in chunk]
        intervals.sort()

        if masks:
            if contig in masks:
                masked_regions = []
                for start, end in intervals:
                    masked_regions += [(x.start, x.end)
                                       for x in masks[contig].find(start, end)]

                masked_regions = Intervals.combine(masked_regions)
                if len(masked_regions):
                    nmasked += 1

                if options.remove_masked_regions:
                    intervals = Intervals.truncate(intervals, masked_regions)
                else:
                    raise NotImplementedError("unimplemented")

                if len(intervals) == 0:
                    nskipped_masked += 1
                    if options.loglevel >= 1:
                        options.stdlog.write(
                            "# skipped because fully masked: "
                            "%s: regions=%s masks=%s\n" %
                            (name, str([(x.start, x.end)
                                        for x in chunk]), masked_regions))
                    continue

        out = intervals

        if options.extend_at and not options.extend_with:
            if options.extend_at == "5only":
                intervals = [(max(0, intervals[0][0] - options.extend_by),
                              intervals[0][0])]
            elif options.extend_at == "3only":
                intervals = [(intervals[-1][1],
                              min(lcontig,
                                  intervals[-1][1] + options.extend_by))]
            else:
                if options.extend_at in ("5", "both"):
                    intervals[0] = (max(0,
                                        intervals[0][0] - options.extend_by),
                                    intervals[0][1])
                if options.extend_at in ("3", "both"):
                    intervals[-1] = (intervals[-1][0],
                                     min(lcontig,
                                         intervals[-1][1] + options.extend_by))

        if not positive:
            intervals = [(lcontig - x[1], lcontig - x[0])
                         for x in intervals[::-1]]
            out.reverse()

        s = [
            fasta.getSequence(contig, strand, start, end)
            for start, end in intervals
        ]
        # IMS: allow for masking of sequences
        s = Masker.maskSequences(s, options.masker)
        l = sum([len(x) for x in s])
        if (l < options.min_length
                or (options.max_length and l > options.max_length)):
            nskipped_length += 1
            if options.loglevel >= 1:
                options.stdlog.write("# skipped because length out of bounds "
                                     "%s: regions=%s len=%i\n" %
                                     (name, str(intervals), l))
                continue

        if options.extend_at and options.extend_with:
            extension = "".join((options.extend_with, ) * options.extend_by)

            if options.extend_at in ("5", "both"):
                s[1] = extension + s[1]
            if options.extend_at in ("3", "both"):
                s[-1] = s[-1] + extension

        if options.fold_at:
            n = options.fold_at
            s = "".join(s)
            seq = "\n".join([s[i:i + n] for i in range(0, len(s), n)])
        else:
            seq = "\n".join(s)

        options.stdout.write(
            ">%s %s:%s:%s\n%s\n" %
            (name, contig, strand, ";".join(["%i-%i" % x for x in out]), seq))

        noutput += 1

    E.info("ninput=%i, noutput=%i, nmasked=%i, nskipped_noexons=%i, "
           "nskipped_masked=%i, nskipped_length=%i" %
           (ninput, noutput, nmasked, nskipped_noexons, nskipped_masked,
            nskipped_length))

    E.Stop()
コード例 #30
0
ファイル: bed2bed.py プロジェクト: Q-KIM/cgat
def merge(iterator,
          max_distance=0,
          by_name=False,
          min_intervals=1,
          remove_inconsistent=False,
          resolve_blocks=False,
          stranded=False):
    """iterator for merging adjacent bed entries.

    *max_distance* > 0 permits merging of intervals that are
    not directly adjacent.

    If *by_name = True*, only entries with the same name are merged.

    If *remove_inconsistent*, overlapping intervals where the names
    are inconsistent will be removed.

    The score gives the number of intervals that have been merged.
    """

    if remove_inconsistent and by_name:
        assert ValueError(
            "using both remove_inconsistent and by_name makes no sense")

    def iterate_chunks(iterator):
        max_end = defaultdict(int)
        to_join = defaultdict(list)
        last_name = defaultdict(str)

        last = iterator.next()

        if not stranded:
            strand = "."
        else:
            strand = last.strand

        max_end[strand] = last.end
        to_join[strand] = [last]

        for bed in iterator:

            if not stranded:
                strand = "."
            else:
                strand = bed.strand

            d = bed.start - max_end[strand]

            if bed.contig == last.contig:
                assert bed.start >= last.start, \
                    "input file should be sorted by contig and position: d=%i:\n%s\n%s\n" \
                    % (d, last, bed)

            if bed.contig != last.contig:

                for s in to_join:
                    if to_join[s]:
                        yield to_join[s]
                    to_join[s] = []
                    max_end[s] = 0

            elif (d > max_distance or
                  (by_name and last_name[strand] != bed.name)):

                if to_join[strand]:
                    yield to_join[strand]

                to_join[strand] = []

            last = bed
            last_name[strand] = last.name
            max_end[strand] = max(bed.end, max_end[strand])
            to_join[strand].append(bed)

        for strand in to_join:
            if to_join[strand]:
                yield to_join[strand]
        raise StopIteration

    c = E.Counter()

    for to_join in iterate_chunks(iterator):

        c.input += 1

        if remove_inconsistent:
            names = set([x.name for x in to_join])
            if len(names) > 1:
                c.skipped_inconcistent_intervals += 1
                continue

        if resolve_blocks:
            
            # keep track of number of intervals in each entry
            for bed in to_join:
                bed["score"] = 1
  
            merged = True
            while merged:
                
                joined = []
                not_joined = []
                merged = False
                
                while len(to_join) > 0:
                    bed1, to_join = to_join[0], to_join[1:]
                    intervals1 = bed1.toIntervals()
                    for bed2 in to_join:
                        intervals2 = bed2.toIntervals()
                        if Intervals.calculateOverlap(intervals1, intervals2) > 0:
                            intervals = Intervals.combine(intervals1 +
                                                          intervals2)
                            bed1.fromIntervals(intervals)
                            bed1["score"] += bed2["score"]
                            merged = True
                        else:
                            not_joined.append(bed2)

                    joined.append(bed1)
                    to_join = not_joined
                    not_joined = []

                to_join = joined
                joined = []
                
            to_join = sorted(to_join, key=lambda x: int(x.start))
            
            # keep only those with the created from the merge of the minimum
            # number of intervals
            
            for bed in to_join:

                if bed["score"] < min_intervals:
                    c.skipped_min_intervals += 1
                    continue

                yield bed
                c.output += 1
        else:
                        
            if len(to_join) < min_intervals:
                c.skipped_min_intervals += 1
                continue

            a = to_join[0]
            a.end = max([entry.end for entry in to_join])
            a.score = len(to_join)
            yield a
            c.output += 1

    E.info(str(c))
コード例 #31
0
def merge(iterator,
          max_distance=0,
          by_name=False,
          min_intervals=1,
          remove_inconsistent=False,
          resolve_blocks=False,
          stranded=False):
    """iterator for merging adjacent bed entries.

    *max_distance* > 0 permits merging of intervals that are
    not directly adjacent.

    If *by_name = True*, only entries with the same name are merged.

    If *remove_inconsistent*, overlapping intervals where the names
    are inconsistent will be removed.

    The score gives the number of intervals that have been merged.
    """

    if remove_inconsistent and by_name:
        assert ValueError(
            "using both remove_inconsistent and by_name makes no sense")

    def iterate_chunks(iterator):
        max_end = defaultdict(int)
        to_join = defaultdict(list)
        last_name = defaultdict(str)

        last = next(iterator)

        if not stranded:
            strand = "."
        else:
            strand = last.strand

        max_end[strand] = last.end
        to_join[strand] = [last]

        for bed in iterator:

            if not stranded:
                strand = "."
            else:
                strand = bed.strand

            d = bed.start - max_end[strand]

            if bed.contig == last.contig:
                assert bed.start >= last.start, \
                    "input file should be sorted by contig and position: d=%i:\n%s\n%s\n" \
                    % (d, last, bed)

            if bed.contig != last.contig:

                for s in to_join:
                    if to_join[s]:
                        yield to_join[s]
                    to_join[s] = []
                    max_end[s] = 0

            elif (d > max_distance or (by_name and last_name[strand]
                                       and last_name[strand] != bed.name)):

                if to_join[strand]:
                    yield to_join[strand]

                to_join[strand] = list()

            last = bed
            last_name[strand] = last.name
            max_end[strand] = max(bed.end, max_end[strand])
            to_join[strand].append(bed)

        for strand in sorted(to_join):
            if to_join[strand]:
                yield to_join[strand]
        raise StopIteration

    c = E.Counter()

    for to_join in iterate_chunks(iterator):

        c.input += 1

        if remove_inconsistent:
            names = set([x.name for x in to_join])
            if len(names) > 1:
                c.skipped_inconsistent_intervals += 1
                continue

        if resolve_blocks:
            # keep track of number of intervals in each entry
            for bed in to_join:
                bed["score"] = 1
            merged = True
            while merged:
                joined = []
                not_joined = []
                merged = False

                while len(to_join) > 0:
                    bed1, to_join = to_join[0], to_join[1:]
                    intervals1 = bed1.toIntervals()
                    for bed2 in to_join:
                        intervals2 = bed2.toIntervals()
                        if Intervals.calculateOverlap(intervals1,
                                                      intervals2) > 0:
                            intervals = Intervals.combine(intervals1 +
                                                          intervals2)
                            bed1.fromIntervals(intervals)
                            bed1["score"] += bed2["score"]
                            merged = True
                        else:
                            not_joined.append(bed2)

                    joined.append(bed1)
                    to_join = not_joined
                    not_joined = []

                to_join = joined
                joined = []

            to_join = sorted(to_join, key=lambda x: int(x.start))

            # keep only those with the created from the merge of the minimum
            # number of intervals

            for bed in to_join:

                if bed["score"] < min_intervals:
                    c.skipped_min_intervals += 1
                    continue

                yield bed
                c.output += 1
        else:

            if len(to_join) < min_intervals:
                c.skipped_min_intervals += 1
                continue

            a = to_join[0]
            a.end = max([entry.end for entry in to_join])
            a.score = len(to_join)
            yield a
            c.output += 1

    E.info(str(c))
コード例 #32
0
ファイル: gtf2gtf.py プロジェクト: jmadzo/cgat
def main(argv=None):

    if not argv:
        argv = sys.argv

    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-m", "--merge-exons",
                      dest="merge_exons",
                      action="store_true",
                      help="merge overlapping exons of all transcripts "
                      "within a gene. "
                      "The merged exons will be output. "
                      "Input needs to sorted by gene [default=%default].")

    parser.add_option("-t", "--merge-transcripts",
                      dest="merge_transcripts",
                      action="store_true",
                      help="merge all transcripts within a gene. "
                      "The entry will span the whole gene "
                      "(exons and introns). "
                      "The transcript does not include the UTR unless "
                      "--with-utr is set. [default=%default].")

    parser.add_option("--merge-genes", dest="merge_genes", action="store_true",
                      help="merge overlapping genes if their exons overlap. "
                      "A gene with a single transcript containing all exons "
                      "of the overlapping transcripts will be output. "
                      "This operation ignores strand information "
                      "The input needs te sorted by transcript "
                      "[default=%default].")

    parser.add_option("--merge-exons-distance",
                      dest="merge_exons_distance",
                      type="int",
                      help="distance in nucleotides between "
                      "exons to be merged [default=%default].")

    parser.add_option("-j", "--join-exons",
                      dest="join_exons",
                      action="store_true",
                      help="join all exons per transcript. "
                      "A new transcript will be "
                      "output that spans a whole transcript. "
                      "Input needs to be sorted by transcript "
                      "[default=%default].")

    parser.add_option("--unset-genes", dest="unset_genes", type="string",
                      help="unset gene identifiers, keeping "
                      "transcripts intact. "
                      "New gene identifiers are set to the "
                      "pattern given. For example, "
                      "'--unset-genes=%06i' [default=%default].")

    parser.add_option("--sort",
                      dest="sort",
                      type="choice",
                      choices=("gene",
                               "gene+transcript",
                               "transcript",
                               "position",
                               "contig+gene",
                               "position+gene",
                               "gene+position"),
                      help="sort input data [default=%default].")

    parser.add_option("-u", "--with-utr",
                      dest="with_utr",
                      action="store_true",
                      help="include utr in merged transcripts "
                      "[default=%default].")

    parser.add_option("--intersect-transcripts",
                      dest="intersect_transcripts",
                      action="store_true",
                      help="intersect all transcripts within a gene. "
                      "The entry will only span those bases "
                      "that are covered by all transcrips."
                      "The transcript does not include the UTR unless "
                      "--with-utr is set. This method "
                      "will remove all other features (stop_codon, etc.) "
                      "The input needs to be sorted by gene. "
                      "[default=%default].")

    parser.add_option("-i", "--merge-introns",
                      dest="merge_introns",
                      action="store_true",
                      help="merge and output all introns within a "
                      "gene. The output will contain "
                      "all intronic regions within a gene. Single exon genes "
                      "are skipped. "
                      "The input needs to be sorted by gene. "
                      "[default=%default].")

    parser.add_option("-g", "--set-transcript-to-gene",
                      "--set-transcript2gene",
                      dest="set_transcript2gene",
                      action="store_true",
                      help="set the transcript_id to the "
                      "gene_id [default=%default].")

    parser.add_option("--set-protein-to-transcript",
                      dest="set_protein2transcript",
                      action="store_true",
                      help="set the protein_id to the "
                      "transcript_id [default=%default].")

    parser.add_option("--add-protein-id",
                      dest="add_protein_id",
                      type="string",
                      help="add a protein_id for each transcript_id. "
                      "The argument is a filename containing a mapping "
                      "between "
                      "transcript_id to protein_id [default=%default].")

    parser.add_option("-G", "--set-gene-to-transcript",
                      "--set-gene2transcript",
                      dest="set_gene2transcript",
                      action="store_true",
                      help="set the gene_id to the "
                      "transcript_id [default=%default].")

    parser.add_option("-d",
                      "--set-score2distance",
                      dest="set_score2distance",
                      action="store_true",
                      help="set the score field for each feature to the "
                      "distance to "
                      "transcription start site [default=%default].")

    parser.add_option("--exons2introns",
                      dest="exons2introns",
                      action="store_true",
                      help="for each gene build an 'intronic' transcript "
                      "containing the union of all intronic regions "
                      "of all transcripts in a gene."
                      "The features are labeled as 'intron'."
                      "The input needs to be sorted by gene. "
                      "[default=%default].")

    parser.add_option("-f", "--filter", dest="filter",
                      type="choice",
                      choices=("gene", "transcript", "longest-gene",
                               "longest-transcript",
                               "representative-transcript"),
                      help="apply a filter to the input file. Available "
                      "filters are: "
                      "'gene': filter by gene_id, "
                      "'transcript': filter by transcript_id, "
                      "'longest-gene': output the longest gene for "
                      "overlapping genes ,"
                      "'longest-transcript': output the longest "
                      "transcript per gene,"
                      "'representative-transcript': output the "
                      "representative transcript per gene. "
                      "The representative transcript is the transcript "
                      "that shares most exons with "
                      "the other transcripts in a gene. "
                      "The input needs to be sorted by gene. "
                      "[default=%default].")

    parser.add_option("-r", "--rename",
                      dest="rename",
                      type="choice",
                      choices=("gene", "transcript"),
                      help="rename genes or transcripts with a map "
                      "given by the option `--apply`. "
                      "Those that can not be renamed are removed "
                      "[default=%default].")

    parser.add_option("--renumber-genes", dest="renumber_genes", type="string",
                      help="renumber genes according to the given pattern. "
                      "[default=%default].")

    parser.add_option("--renumber-transcripts",
                      dest="renumber_transcripts",
                      type="string",
                      help="renumber transcripts according to the "
                      "given pattern. "
                      "[default=%default].")

    parser.add_option("-a", "--apply", dest="filename_filter", type="string",
                      metavar="tsv",
                      help="filename of ids to map/filter [default=%default].")

    parser.add_option("--invert-filter",
                      dest="invert_filter",
                      action="store_true",
                      help="when using --filter, invert selection "
                      "(like grep -v). "
                      "[default=%default].")

    parser.add_option("--sample-size", dest="sample_size", type="int",
                      help="extract a random sample of size # if the option "
                      "'--filter' is set[default=%default].")

    parser.add_option("--intron-min-length",
                      dest="intron_min_length", type="int",
                      help="minimum length for introns (for --exons2introns) "
                      "[default=%default].")

    parser.add_option("--min-exons-length",
                      dest="min_exons_length",
                      type="int",
                      help="minimum length for gene (sum of exons) "
                      "(--sample-size) [default=%default].")

    parser.add_option("--intron-border",
                      dest="intron_border",
                      type="int",
                      help="number of residues to exclude at intron at either end "
                      "(--exons2introns) [default=%default].")

    parser.add_option("--transcripts2genes",
                      dest="transcripts2genes",
                      action="store_true",
                      help="cluster overlapping transcripts into genes.")

    parser.add_option("--reset-strand",
                      dest="reset_strand",
                      action="store_true",
                      help="remove strandedness of features (set to '.') when "
                      "using --transcripts2genes"
                      "[default=%default].")

    parser.add_option("--remove-overlapping", dest="remove_overlapping",
                      type="string",
                      metavar="gff",
                      help="remove all transcripts that overlap intervals "
                      "in a gff-formatted file."
                      "The comparison ignores strand "
                      "[default=%default].")

    parser.add_option("--permit-duplicates", dest="strict",
                      action="store_false",
                      help="permit duplicate genes. "
                      "[default=%default]")

    parser.add_option("--remove-duplicates", dest="remove_duplicates",
                      type="choice",
                      choices=("gene", "transcript", "ucsc", "coordinates"),
                      help="remove duplicates by gene/transcript. "
                      "If ``ucsc`` is chosen, transcripts ending on _dup# are "
                      "removed. This is necessary to remove duplicate entries "
                      "that are next to each other in the sort order "
                      "[%default]")

    parser.add_option("--rename-duplicates", dest="rename_duplicates",
                      action="store_true",
                      help="rename duplicate gene_ids and transcript_ids by "
                      "addition of a numerical suffix")

    parser.set_defaults(
        sort=None,
        merge_exons=False,
        join_exons=False,
        merge_exons_distance=0,
        merge_transcripts=False,
        set_score2distance=False,
        set_gene2transcript=False,
        set_transcript2gene=False,
        set_protein2transcript=False,
        add_protein_id=None,
        filename_filter=None,
        filter=None,
        exons2introns=None,
        merge_genes=False,
        intron_border=None,
        intron_min_length=None,
        sample_size=0,
        min_exons_length=0,
        transripts2genes=False,
        reset_strand=False,
        with_utr=False,
        invert_filter=False,
        remove_duplicates=None,
        remove_overlapping=None,
        renumber_genes=None,
        unset_genes=None,
        renumber_transcripts=None,
        strict=True,
        intersect_transcripts=False,
        rename_duplicates=False,
    )

    (options, args) = E.Start(parser, argv=argv)

    ninput, noutput, nfeatures, ndiscarded = 0, 0, 0, 0

    if options.set_transcript2gene:

        for gff in GTF.iterator(options.stdin):

            ninput += 1

            gff.setAttribute("transcript_id", gff.gene_id)
            options.stdout.write("%s\n" % str(gff))

            noutput += 1
            nfeatures += 1

    elif options.remove_duplicates:

        counts = collections.defaultdict(int)

        if options.remove_duplicates == "ucsc":
            store = []
            remove = set()
            f = lambda x: x[0].transcript_id

            gffs = GTF.transcript_iterator(
                GTF.iterator(options.stdin), strict=False)
            outf = lambda x: "\n".join([str(y) for y in x])

            for entry in gffs:
                ninput += 1
                store.append(entry)
                id = f(entry)
                if "_dup" in id:
                    remove.add(re.sub("_dup\d+", "", id))
                    remove.add(id)

            for entry in store:
                id = f(entry)
                if id not in remove:
                    options.stdout.write(outf(entry) + "\n")
                    noutput += 1
                else:
                    ndiscarded += 1
                    E.info("discarded duplicates for %s" % (id))
        else:

            if options.remove_duplicates == "gene":
                gffs = GTF.gene_iterator(
                    GTF.iterator(options.stdin), strict=False)
                f = lambda x: x[0][0].gene_id
                outf = lambda x: "\n".join(
                    ["\n".join([str(y) for y in xx]) for xx in x])
            elif options.remove_duplicates == "transcript":
                gffs = GTF.transcript_iterator(
                    GTF.iterator(options.stdin), strict=False)
                f = lambda x: x[0].transcript_id
                outf = lambda x: "\n".join([str(y) for y in x])
            elif options.remove_duplicates == "coordinates":
                gffs = GTF.chunk_iterator(GTF.iterator(options.stdin))
                f = lambda x: x[0].contig + "_" + \
                    str(x[0].start) + "-" + str(x[0].end)
                outf = lambda x: "\n".join([str(y) for y in x])

            store = []

            for entry in gffs:
                ninput += 1
                store.append(entry)
                id = f(entry)
                counts[id] += 1

            # Assumes GTF file sorted by contig then start
            last_id = ""
            if options.remove_duplicates == "coordinates":
                for entry in store:
                    id = f(entry)
                    if id == last_id:
                        ndiscarded += 1
                        E.info("discarded duplicates for %s: %i" %
                               (id, counts[id]))
                    else:
                        options.stdout.write(outf(entry) + "\n")
                        noutput += 1
                    last_id = id

            else:
                for entry in store:
                    id = f(entry)
                    if counts[id] == 1:
                        options.stdout.write(outf(entry) + "\n")
                        noutput += 1
                    else:
                        ndiscarded += 1
                        E.info("discarded duplicates for %s: %i" %
                               (id, counts[id]))

    elif options.sort:

        for gff in GTF.iterator_sorted(GTF.iterator(options.stdin),
                                       sort_order=options.sort):
            ninput += 1
            options.stdout.write("%s\n" % str(gff))
            noutput += 1
            nfeatures += 1

    elif options.set_gene2transcript:

        for gff in GTF.iterator(options.stdin):

            ninput += 1

            gff.setAttribute("gene_id", gff.transcript_id)
            options.stdout.write("%s\n" % str(gff))

            noutput += 1
            nfeatures += 1

    elif options.set_protein2transcript:

        for gff in GTF.iterator(options.stdin):
            ninput += 1
            gff.setAttribute("protein_id", gff.transcript_id)
            options.stdout.write("%s\n" % str(gff))
            noutput += 1
            nfeatures += 1

    elif options.add_protein_id:

        transcript2protein = IOTools.readMap(open(options.add_protein_id, "r"))

        missing = set()
        for gff in GTF.iterator(options.stdin):
            ninput += 1
            if gff.transcript_id not in transcript2protein:
                if gff.transcript_id not in missing:
                    E.debug(
                        ("removing transcript '%s' due to "
                         "missing protein id") % gff.transcript_id)
                    missing.add(gff.transcript_id)
                ndiscarded += 1
                continue

            gff.setAttribute(
                "protein_id", transcript2protein[gff.transcript_id])
            options.stdout.write("%s\n" % str(gff))
            noutput += 1
            nfeatures += 1

        E.info("transcripts removed due to missing protein ids: %i" %
               len(missing))

    elif options.join_exons:

        for exons in GTF.transcript_iterator(GTF.iterator(options.stdin)):
            ninput += 1
            strand = Genomics.convertStrand(exons[0].strand)
            contig = exons[0].contig
            transid = exons[0].transcript_id
            geneid = exons[0].gene_id
            biotype = exons[0].source
            all_start, all_end = min([x.start for x in exons]), max(
                [x.end for x in exons])
            y = GTF.Entry()
            y.contig = contig
            y.source = biotype
            y.feature = "transcript"
            y.start = all_start
            y.end = all_end
            y.strand = strand
            y.transcript_id = transid
            y.gene_id = geneid
            options.stdout.write("%s\n" % str(y))

    elif options.merge_genes:
        # merges overlapping genes
        #
        gffs = GTF.iterator_sorted_chunks(
            GTF.flat_gene_iterator(GTF.iterator(options.stdin)),
            sort_by="contig-strand-start")

        def iterate_chunks(gff_chunks):

            last = gff_chunks.next()
            to_join = [last]

            for gffs in gff_chunks:
                d = gffs[0].start - last[-1].end

                if gffs[0].contig == last[0].contig and \
                   gffs[0].strand == last[0].strand:
                    assert gffs[0].start >= last[0].start, \
                        ("input file should be sorted by contig, strand "
                         "and position: d=%i:\nlast=\n%s\nthis=\n%s\n") % \
                        (d,
                         "\n".join([str(x) for x in last]),
                         "\n".join([str(x) for x in gffs]))

                if gffs[0].contig != last[0].contig or \
                        gffs[0].strand != last[0].strand or \
                        d > 0:
                    yield to_join
                    to_join = []

                last = gffs
                to_join.append(gffs)

            yield to_join
            raise StopIteration

        for chunks in iterate_chunks(gffs):
            ninput += 1
            if len(chunks) > 1:
                gene_id = "merged_%s" % chunks[0][0].gene_id
                transcript_id = "merged_%s" % chunks[0][0].transcript_id
                info = ",".join([x[0].gene_id for x in chunks])
            else:
                gene_id = chunks[0][0].gene_id
                transcript_id = chunks[0][0].transcript_id
                info = None

            intervals = []
            for c in chunks:
                intervals += [(x.start, x.end) for x in c]

            intervals = Intervals.combine(intervals)
            # take single strand
            strand = chunks[0][0].strand

            for start, end in intervals:
                y = GTF.Entry()
                y.fromGTF(chunks[0][0], gene_id, transcript_id)
                y.start = start
                y.end = end
                y.strand = strand

                if info:
                    y.addAttribute("merged", info)
                options.stdout.write("%s\n" % str(y))
                nfeatures += 1

            noutput += 1

    elif options.renumber_genes:

        map_old2new = {}
        for gtf in GTF.iterator(options.stdin):
            ninput += 1
            if gtf.gene_id not in map_old2new:
                map_old2new[gtf.gene_id] = options.renumber_genes % (
                    len(map_old2new) + 1)
            gtf.setAttribute("gene_id", map_old2new[gtf.gene_id])
            options.stdout.write("%s\n" % str(gtf))
            noutput += 1

    elif options.unset_genes:

        map_old2new = {}
        for gtf in GTF.iterator(options.stdin):
            ninput += 1
            key = gtf.transcript_id
            if key not in map_old2new:
                map_old2new[key] = options.unset_genes % (len(map_old2new) + 1)
            gtf.setAttribute("gene_id", map_old2new[key])
            options.stdout.write("%s\n" % str(gtf))
            noutput += 1

    elif options.renumber_transcripts:

        map_old2new = {}
        for gtf in GTF.iterator(options.stdin):
            ninput += 1
            key = (gtf.gene_id, gtf.transcript_id)
            if key not in map_old2new:
                map_old2new[key] = options.renumber_transcripts % (
                    len(map_old2new) + 1)
            gtf.setAttribute("transcript_id", map_old2new[key])
            options.stdout.write("%s\n" % str(gtf))
            noutput += 1

    elif options.transcripts2genes:

        transcripts = set()
        genes = set()
        reset_strand = options.reset_strand
        for gtfs in GTF.iterator_transcripts2genes(
                GTF.iterator(options.stdin)):

            ninput += 1
            for gtf in gtfs:
                if reset_strand:
                    gtf.strand = "."
                options.stdout.write("%s\n" % str(gtf))
                transcripts.add(gtf.transcript_id)
                genes.add(gtf.gene_id)
                nfeatures += 1
            noutput += 1

        E.info("transcripts2genes: transcripts=%i, genes=%i" %
               (len(transcripts), len(genes)))

    elif options.rename:

        map_old2new = IOTools.readMap(open(options.filename_filter, "r"))

        if options.rename == "transcript":
            is_gene_id = False
        elif options.rename == "gene":
            is_gene_id = True

        for gff in GTF.iterator(options.stdin):
            ninput += 1

            if is_gene_id:
                if gff.gene_id in map_old2new:
                    gff.setAttribute("gene_id", map_old2new[gff.gene_id])
                else:
                    E.debug("removing missing gene_id %s" % gff.gene_id)
                    ndiscarded += 1
                    continue

            else:
                if gff.transcript_id in map_old2new:
                    gff.setAttribute(
                        "transcript_id", map_old2new[gff.transcript_id])
                else:
                    E.debug("removing missing transcript_id %s" %
                            gff.transcript_id)
                    ndiscarded += 1
                    continue

            noutput += 1
            options.stdout.write("%s\n" % str(gff))

    elif options.filter:

        keep_genes = set()
        if options.filter == "longest-gene":
            iterator = GTF.flat_gene_iterator(GTF.iterator(options.stdin))
            coords = []
            gffs = []
            for gff in iterator:
                gff.sort(key=lambda x: x.start)
                coords.append((gff[0].contig,
                               min([x.start for x in gff]),
                               max([x.end for x in gff]),
                               gff[0].gene_id))
                gffs.append(gff)
            coords.sort()

            last_contig = None
            max_end = 0
            longest_gene_id = None
            longest_length = None

            for contig, start, end, gene_id in coords:
                ninput += 1
                if contig != last_contig or start >= max_end:
                    if longest_gene_id:
                        keep_genes.add(longest_gene_id)
                    longest_gene_id = gene_id
                    longest_length = end - start
                    max_end = end
                else:
                    if end - start > longest_length:
                        longest_length, longest_gene_id = end - start, gene_id
                last_contig = contig
                max_end = max(max_end, end)

            keep_genes.add(longest_gene_id)
            invert = options.invert_filter
            for gff in gffs:
                keep = gff[0].gene_id in keep_genes

                if (keep and not invert) or (not keep and invert):
                    noutput += 1
                    for g in gff:
                        nfeatures += 1
                        options.stdout.write("%s\n" % g)
                else:
                    ndiscarded += 1
        elif options.filter in ("longest-transcript", "representative-transcript"):

            iterator = GTF.gene_iterator(GTF.iterator(options.stdin))

            def selectLongestTranscript(gene):
                r = []
                for transcript in gene:
                    transcript.sort(key=lambda x: x.start)
                    length = transcript[-1].end - transcript[0].start
                    r.append((length, transcript))
                r.sort()
                return r[-1][1]

            def selectRepresentativeTranscript(gene):
                '''select a representative transcript.

                The representative transcript represent the largest number
                of exons over all transcripts.
                '''
                all_exons = []
                for transcript in gene:
                    all_exons.extend([(x.start, x.end)
                                      for x in transcript if x.feature == "exon"])
                exon_counts = {}
                for key, exons in itertools.groupby(all_exons):
                    exon_counts[key] = len(list(exons))
                transcript_counts = []
                for transcript in gene:
                    count = sum([exon_counts[(x.start, x.end)]
                                 for x in transcript if x.feature == "exon"])
                    transcript_counts.append((count, transcript))
                transcript_counts.sort()
                return transcript_counts[-1][1]

            if options.filter == "longest-transcript":
                _select = selectLongestTranscript
            elif options.filter == "representative-transcript":
                _select = selectRepresentativeTranscript

            for gene in iterator:
                ninput += 1
                transcript = _select(gene)
                noutput += 1
                for g in transcript:
                    nfeatures += 1
                    options.stdout.write("%s\n" % g)

        elif options.filter in ("gene", "transcript"):

            if options.filename_filter:

                ids, nerrors = IOTools.ReadList(
                    open(options.filename_filter, "r"))
                E.info("read %i ids" % len(ids))

                ids = set(ids)
                by_gene = options.filter == "gene"
                by_transcript = options.filter == "transcript"
                invert = options.invert_filter

                reset_strand = options.reset_strand
                for gff in GTF.iterator(options.stdin):

                    ninput += 1

                    keep = False
                    if by_gene:
                        keep = gff.gene_id in ids
                    if by_transcript:
                        keep = gff.transcript_id in ids
                    if (invert and keep) or (not invert and not keep):
                        continue

                    if reset_strand:
                        gff.strand = "."

                    options.stdout.write("%s\n" % str(gff))
                    nfeatures += 1
                    noutput += 1

            elif options.sample_size:

                if options.filter == "gene":
                    iterator = GTF.flat_gene_iterator(
                        GTF.iterator(options.stdin))
                elif options.filter == "transcript":
                    iterator = GTF.transcript_iterator(
                        GTF.iterator(options.stdin))
                if options.min_exons_length:
                    iterator = GTF.iterator_min_feature_length(
                        iterator,
                        min_length=options.min_exons_length,
                        feature="exon")

                data = [x for x in iterator]
                ninput = len(data)
                if len(data) > options.sample_size:
                    data = random.sample(data, options.sample_size)

                for d in data:
                    noutput += 1
                    for dd in d:
                        nfeatures += 1
                        options.stdout.write(str(dd) + "\n")

            else:
                assert False, "please supply either a filename "
                "with ids to filter with (--apply) or a sample-size."

    elif options.exons2introns:

        for gffs in GTF.flat_gene_iterator(GTF.iterator(options.stdin)):

            ninput += 1

            cds_ranges = GTF.asRanges(gffs, "CDS")
            exon_ranges = GTF.asRanges(gffs, "exon")
            input_ranges = Intervals.combine(cds_ranges + exon_ranges)

            if len(input_ranges) > 1:
                last = input_ranges[0][1]
                output_ranges = []
                for start, end in input_ranges[1:]:
                    output_ranges.append((last, start))
                    last = end

                if options.intron_border:
                    b = options.intron_border
                    output_ranges = [(x[0] + b, x[1] - b)
                                     for x in output_ranges]

                if options.intron_min_length:
                    l = options.intron_min_length
                    output_ranges = [
                        x for x in output_ranges if x[1] - x[0] > l]

                for start, end in output_ranges:

                    entry = GTF.Entry()
                    entry.copy(gffs[0])
                    entry.clearAttributes()
                    entry.transcript_id = "merged"
                    entry.feature = "intron"
                    entry.start = start
                    entry.end = end
                    options.stdout.write("%s\n" % str(entry))
                    nfeatures += 1
                noutput += 1
            else:
                ndiscarded += 1

    elif options.set_score2distance:

        for gffs in GTF.transcript_iterator(GTF.iterator(options.stdin)):
            ninput += 1
            strand = Genomics.convertStrand(gffs[0].strand)
            all_start, all_end = min([x.start for x in gffs]), max(
                [x.end for x in gffs])

            if strand != ".":
                t = 0
                if strand == "-":
                    gffs.reverse()
                for gff in gffs:
                    gff.score = t
                    t += gff.end - gff.start

                if strand == "-":
                    gffs.reverse()
            for gff in gffs:
                options.stdout.write("%s\n" % str(gff))
                nfeatures += 1
            noutput += 1

    elif options.remove_overlapping:

        index = GTF.readAndIndex(
            GTF.iterator(IOTools.openFile(options.remove_overlapping, "r")))

        for gffs in GTF.transcript_iterator(GTF.iterator(options.stdin)):
            ninput += 1
            found = False
            for e in gffs:
                if index.contains(e.contig, e.start, e.end):
                    found = True
                    break

            if found:
                ndiscarded += 1
            else:
                noutput += 1
                for e in gffs:
                    nfeatures += 1
                    options.stdout.write("%s\n" % str(e))

    elif options.intersect_transcripts:

        for gffs in GTF.gene_iterator(GTF.iterator(options.stdin),
                                      strict=options.strict):

            ninput += 1
            r = []
            for g in gffs:
                if options.with_utr:
                    ranges = GTF.asRanges(g, "exon")
                else:
                    ranges = GTF.asRanges(g, "CDS")
                r.append(ranges)

            result = r[0]
            for x in r[1:]:
                result = Intervals.intersect(result, x)

            entry = GTF.Entry()
            entry.copy(gffs[0][0])
            entry.clearAttributes()
            entry.transcript_id = "merged"
            entry.feature = "exon"
            for start, end in result:
                entry.start = start
                entry.end = end
                options.stdout.write("%s\n" % str(entry))
                nfeatures += 1

            noutput += 1

    elif options.rename_duplicates:

        gene_ids = list()
        transcript_ids = list()
        gtfs = list()

        for gtf in GTF.iterator(options.stdin):
            gtfs.append(gtf)
            if gtf.feature == "CDS":
                gene_ids.append(gtf.gene_id)
                transcript_ids.append(gtf.transcript_id)

        dup_gene = [item for item in set(gene_ids) if gene_ids.count(item) > 1]
        dup_transcript = [item for item in set(transcript_ids)
                          if transcript_ids.count(item) > 1]

        E.info("Number of duplicated gene_ids: %i" % len(dup_gene))
        E.info("Number of duplicated transcript_ids: %i" % len(dup_transcript))

        gene_dict = dict(zip(dup_gene, ([0] * len(dup_gene))))
        transcript_dict = dict(zip(dup_transcript,
                                   ([0] * len(dup_transcript))))

        for gtf in gtfs:
            if gtf.feature == "CDS":
                if gtf.gene_id in dup_gene:
                    gene_dict[gtf.gene_id] = gene_dict[gtf.gene_id] + 1
                    gtf.setAttribute('gene_id',
                                     gtf.gene_id + "." +
                                     str(gene_dict[gtf.gene_id]))

                if gtf.transcript_id in dup_transcript:
                    transcript_dict[gtf.transcript_id] = \
                        transcript_dict[gtf.transcript_id] + 1
                    gtf.setAttribute('transcript_id',
                                     gtf.transcript_id + "." +
                                     str(transcript_dict[gtf.transcript_id]))

            options.stdout.write("%s\n" % gtf)

    else:
        for gffs in GTF.flat_gene_iterator(
                GTF.iterator(options.stdin),
                strict=options.strict):

            ninput += 1

            cds_ranges = GTF.asRanges(gffs, "CDS")
            exon_ranges = GTF.asRanges(gffs, "exon")

            # sanity checks
            strands = set([x.strand for x in gffs])
            contigs = set([x.contig for x in gffs])
            if len(strands) > 1:
                raise ValueError("can not merge gene '%s' on multiple strands: %s" % (
                    gffs[0].gene_id, str(strands)))

            if len(contigs) > 1:
                raise ValueError("can not merge gene '%s' on multiple contigs: %s" % (
                    gffs[0].gene_id, str(contigs)))

            strand = Genomics.convertStrand(gffs[0].strand)

            if cds_ranges and options.with_utr:
                cds_start, cds_end = cds_ranges[0][0], cds_ranges[-1][1]
                midpoint = (cds_end - cds_start) / 2 + cds_start

                utr_ranges = []
                for start, end in Intervals.truncate(exon_ranges, cds_ranges):
                    if end - start > 3:
                        if strand == ".":
                            feature = "UTR"
                        elif strand == "+":
                            if start < midpoint:
                                feature = "UTR5"
                            else:
                                feature = "UTR3"
                        elif strand == "-":
                            if start < midpoint:
                                feature = "UTR3"
                            else:
                                feature = "UTR5"
                        utr_ranges.append((feature, start, end))
                output_feature = "CDS"
                output_ranges = cds_ranges
            else:
                output_feature = "exon"
                output_ranges = exon_ranges
                utr_ranges = []

            result = []

            if options.merge_exons:
                # need to combine per feature - skip
                # utr_ranges = Intervals.combineAtDistance(
                # utr_ranges,
                # options.merge_exons_distance)

                output_ranges = Intervals.combineAtDistance(
                    output_ranges, options.merge_exons_distance)

                for feature, start, end in utr_ranges:
                    entry = GTF.Entry()
                    entry.copy(gffs[0])
                    entry.clearAttributes()
                    entry.feature = feature
                    entry.transcript_id = "merged"
                    entry.start = start
                    entry.end = end
                    result.append(entry)

                for start, end in output_ranges:

                    entry = GTF.Entry()
                    entry.copy(gffs[0])
                    entry.clearAttributes()
                    entry.transcript_id = "merged"
                    entry.feature = output_feature
                    entry.start = start
                    entry.end = end
                    result.append(entry)

            elif options.merge_transcripts:

                entry = GTF.Entry()
                entry.copy(gffs[0])
                entry.clearAttributes()
                entry.transcript_id = entry.gene_id
                entry.start = output_ranges[0][0]
                entry.end = output_ranges[-1][1]
                result.append(entry)

            elif options.merge_introns:

                if len(output_ranges) >= 2:
                    entry = GTF.Entry()
                    entry.copy(gffs[0])
                    entry.clearAttributes()
                    entry.transcript_id = entry.gene_id
                    entry.start = output_ranges[0][1]
                    entry.end = output_ranges[-1][0]
                    result.append(entry)
                else:
                    ndiscarded += 1
                    continue

            result.sort(key=lambda x: x.start)

            for x in result:
                options.stdout.write("%s\n" % str(x))
                nfeatures += 1
            noutput += 1

    E.info("ninput=%i, noutput=%i, nfeatures=%i, ndiscarded=%i" %
           (ninput, noutput, nfeatures, ndiscarded))
    E.Stop()
コード例 #33
0
ファイル: gff2psl.py プロジェクト: santayana/cgat
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id: gff2psl.py 2781 2009-09-10 11:33:14Z andreas $", usage=globals()["__doc__"])

    parser.add_option("--is-gtf", dest="is_gtf", action="store_true",
                      help="input is gtf.")

    parser.add_option("--no-header", dest="with_header", action="store_false",
                      help="do not output BLAT header [default=%default].")

    parser.add_option("-g", "--genome-file", dest="genome_file", type="string",
                      help="filename with genome.")

    parser.add_option("--input-filename-queries", dest="input_filename_queries", type="string",
                      help="fasta filename with queries [default=%default].")

    parser.add_option("--allow-duplicates", dest="allow_duplicates", action="store_true",
                      help="""permit duplicate entries. Adjacent exons of a transcript will still be merged [default=%default]."""  )

    parser.set_defaults(is_gtf=False,
                        genome_file=None,
                        with_header=True,
                        allow_duplicates=False,
                        test=None)

    (options, args) = E.Start(parser, add_pipe_options=True)

    if options.genome_file:
        genome_fasta = IndexedFasta.IndexedFasta(options.genome_file)
    else:
        genome_fasta = None

    if options.input_filename_queries:
        queries_fasta = IndexedFasta.IndexedFasta(
            options.input_filename_queries)
    else:
        queries_fasta = None

    ninput, noutput, nskipped = 0, 0, 0

    if options.is_gtf:
        iterator = GTF.transcript_iterator(GTF.iterator_filtered(GTF.iterator(sys.stdin),
                                                                 feature="exon"),
                                           strict=not options.allow_duplicates)
    else:
        iterator = GTF.joined_iterator(GTF.iterator(sys.stdin))

    if options.with_header:
        options.stdout.write(Blat.Match().getHeader() + "\n")

    for gffs in iterator:

        if options.test and ninput >= options.test:
            break

        ninput += 1

        result = alignlib_lite.py_makeAlignmentBlocks()

        xstart = 0

        intervals = Intervals.combine([(gff.start, gff.end) for gff in gffs])

        for start, end in intervals:
            xend = xstart + end - start

            result.addDiagonal(xstart, xend,
                               start - xstart)
            xstart = xend

        entry = Blat.Match()
        entry.mQueryId = gff.transcript_id
        entry.mSbjctId = gff.contig
        entry.strand = gff.strand

        if genome_fasta:
            if entry.mSbjctId in genome_fasta:
                entry.mSbjctLength = genome_fasta.getLength(entry.mSbjctId)
            else:
                entry.mSbjctLength = result.getColTo()

        if queries_fasta:
            if entry.mQueryId in queries_fasta:
                entry.mQueryLength = queries_fasta.getLength(entry.mQueryId)
        else:
            entry.mQueryLength = result.getRowTo()

        entry.fromMap(result)

        options.stdout.write(str(entry) + "\n")
        noutput += 1

    E.info("ninput=%i, noutput=%i, nskipped=%i" % (ninput, noutput, nskipped))

    E.Stop()
コード例 #34
0
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv == None: argv = sys.argv

    parser = E.OptionParser(
        version=
        "%prog version: $Id: gff2fasta.py 2861 2010-02-23 17:36:32Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("--is-gtf",
                      dest="is_gtf",
                      action="store_true",
                      help="input is gtf instead of gff.")

    parser.add_option("-g",
                      "--genome-file",
                      dest="genome_file",
                      type="string",
                      help="filename with genome [default=%default].")

    parser.add_option(
        "-m",
        "--merge",
        dest="merge",
        action="store_true",
        help="merge adjacent intervals with the same attributes. "
        "[default=%default]")

    parser.add_option(
        "-e",
        "--feature",
        dest="feature",
        type="string",
        help="filter by a feature, for example 'exon', 'CDS'. If "
        "set to the empty string, all entries are output [%default].")

    parser.add_option(
        "-f",
        "--filename-masks",
        dest="filename_masks",
        type="string",
        metavar="gff",
        help="mask sequences with regions given in gff file [%default].")

    parser.add_option("--remove-masked-regions",
                      dest="remove_masked_regions",
                      action="store_true",
                      help="remove regions instead of masking [%default].")

    parser.add_option(
        "--min-length",
        dest="min_length",
        type="int",
        help="set minimum length for sequences output [%default]")

    parser.add_option(
        "--max-length",
        dest="max_length",
        type="int",
        help="set maximum length for sequences output [%default]")

    parser.add_option("--extend-at",
                      dest="extend_at",
                      type="choice",
                      choices=("none", "3", "5", "both", "3only", "5only"),
                      help="extend at no end, 3', 5' or both ends. If "
                      "3only or 5only are set, only the added sequence "
                      "is returned [default=%default]")

    parser.add_option("--extend-by",
                      dest="extend_by",
                      type="int",
                      help="extend by # bases [default=%default]")

    parser.add_option("--masker",
                      dest="masker",
                      type="choice",
                      choices=("dust", "dustmasker", "softmask", "none"),
                      help="apply masker [%default].")

    parser.set_defaults(is_gtf=False,
                        genome_file=None,
                        merge=False,
                        feature=None,
                        filename_masks=None,
                        remove_masked_regions=False,
                        min_length=0,
                        max_length=0,
                        extend_at=None,
                        extend_by=100,
                        masker=None)

    (options, args) = E.Start(parser)

    if options.genome_file:
        fasta = IndexedFasta.IndexedFasta(options.genome_file)
        contigs = fasta.getContigSizes()

    if options.is_gtf:
        iterator = GTF.transcript_iterator(GTF.iterator(sys.stdin))
    else:
        gffs = GTF.iterator(sys.stdin)
        if options.merge:
            iterator = GTF.joined_iterator(gffs)
        else:
            iterator = GTF.chunk_iterator(gffs)

    masks = None
    if options.filename_masks:
        masks = {}
        with open(options.filename_masks, "r") as infile:
            e = GTF.readAsIntervals(GFF.iterator(infile))

        # convert intervals to intersectors
        for contig in e.keys():
            intersector = bx.intervals.intersection.Intersecter()
            for start, end in e[contig]:
                intersector.add_interval(bx.intervals.Interval(start, end))
            masks[contig] = intersector

    ninput, noutput, nmasked, nskipped_masked = 0, 0, 0, 0
    nskipped_length = 0
    nskipped_noexons = 0

    feature = options.feature

    #    for item in iterator:
    #	print len(item) # 3, 2
    #	for i in item:
    #	   print len(i) # 9, 9, 9, 9, 9
    #	   print i.contig
    #	   print i.strand
    #	   print i.transcript_id

    # iterator is a list containing groups (lists) of features.
    # Each group of features have in common the same transcript ID, in case of GTF files.
    for ichunk in iterator:

        ninput += 1

        if feature:
            chunk = filter(lambda x: x.feature == feature, ichunk)
        else:
            chunk = ichunk

        if len(chunk) == 0:
            nskipped_noexons += 1
            E.info("no features in entry from %s:%i..%i - %s" %
                   (ichunk[0].contig, ichunk[0].start, ichunk[0].end,
                    str(ichunk[0])))
            continue

        contig, strand = chunk[0].contig, chunk[0].strand
        if options.is_gtf:
            name = chunk[0].transcript_id
        else:
            name = str(chunk[0].attributes)

        lcontig = contigs[contig]
        positive = Genomics.IsPositiveStrand(strand)
        intervals = [(x.start, x.end) for x in chunk]
        intervals.sort()

        if masks:
            if contig in masks:
                masked_regions = []
                for start, end in intervals:
                    masked_regions += [(x.start, x.end)
                                       for x in masks[contig].find(start, end)]

                masked_regions = Intervals.combine(masked_regions)
                if len(masked_regions): nmasked += 1

                if options.remove_masked_regions:
                    intervals = Intervals.truncate(intervals, masked_regions)
                else:
                    raise "unimplemented"

                if len(intervals) == 0:
                    nskipped_masked += 1
                    if options.loglevel >= 1:
                        options.stdlog.write( "# skipped because fully masked: %s: regions=%s masks=%s\n" %\
                                                  (name, str([ (x.start, x.end) for x in chunk ]), masked_regions) )
                    continue

        out = intervals

        if options.extend_at:
            if options.extend_at == "5only":
                intervals = [(max(0, intervals[0][0] - options.extend_by),
                              intervals[0][0])]
            elif options.extend_at == "3only":
                intervals = [(intervals[-1][1],
                              min(lcontig,
                                  intervals[-1][1] + options.extend_by))]
            else:
                if options.extend_at in ("5", "both"):
                    intervals[0] = (max(0,
                                        intervals[0][0] - options.extend_by),
                                    intervals[0][1])
                if options.extend_at in ("3", "both"):
                    intervals[-1] = (intervals[-1][0],
                                     min(lcontig,
                                         intervals[-1][1] + options.extend_by))

        if not positive:
            intervals = [(lcontig - x[1], lcontig - x[0])
                         for x in intervals[::-1]]
            out.reverse()

        s = [
            fasta.getSequence(contig, strand, start, end)
            for start, end in intervals
        ]
        #IMS: allow for masking of sequences
        s = Masker.maskSequences(s, options.masker)
        l = sum([len(x) for x in s])
        if l < options.min_length or (options.max_length
                                      and l > options.max_length):
            nskipped_length += 1
            if options.loglevel >= 1:
                options.stdlog.write( "# skipped because length out of bounds %s: regions=%s len=%i\n" %\
                                          (name, str(intervals), l) )
            continue

        options.stdout.write(
            ">%s %s:%s:%s\n%s\n" %
            (name, contig, strand, ";".join(["%i-%i" % x
                                             for x in out]), "\n".join(s)))

        noutput += 1

    E.info( "ninput=%i, noutput=%i, nmasked=%i, nskipped_noexons=%i, nskipped_masked=%i, nskipped_length=%i" %\
                (ninput, noutput, nmasked, nskipped_noexons, nskipped_masked, nskipped_length ) )

    E.Stop()
コード例 #35
0
def main(argv=None):

    if not argv:
        argv = sys.argv

    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-m",
                      "--merge-exons",
                      dest="merge_exons",
                      action="store_true",
                      help="merge overlapping exons of all transcripts "
                      "within a gene. "
                      "The merged exons will be output. "
                      "Input needs to sorted by gene [default=%default].")

    parser.add_option("-t",
                      "--merge-transcripts",
                      dest="merge_transcripts",
                      action="store_true",
                      help="merge all transcripts within a gene. "
                      "The entry will span the whole gene "
                      "(exons and introns). "
                      "The transcript does not include the UTR unless "
                      "--with-utr is set. [default=%default].")

    parser.add_option("--merge-genes",
                      dest="merge_genes",
                      action="store_true",
                      help="merge overlapping genes if their exons overlap. "
                      "A gene with a single transcript containing all exons "
                      "of the overlapping transcripts will be output. "
                      "This operation ignores strand information "
                      "The input needs te sorted by transcript "
                      "[default=%default].")

    parser.add_option("--merge-exons-distance",
                      dest="merge_exons_distance",
                      type="int",
                      help="distance in nucleotides between "
                      "exons to be merged [default=%default].")

    parser.add_option("-j",
                      "--join-exons",
                      dest="join_exons",
                      action="store_true",
                      help="join all exons per transcript. "
                      "A new transcript will be "
                      "output that spans a whole transcript. "
                      "Input needs to be sorted by transcript "
                      "[default=%default].")

    parser.add_option("--unset-genes",
                      dest="unset_genes",
                      type="string",
                      help="unset gene identifiers, keeping "
                      "transcripts intact. "
                      "New gene identifiers are set to the "
                      "pattern given. For example, "
                      "'--unset-genes=%06i' [default=%default].")

    parser.add_option("--sort",
                      dest="sort",
                      type="choice",
                      choices=("gene", "gene+transcript", "transcript",
                               "position", "contig+gene", "position+gene",
                               "gene+position"),
                      help="sort input data [default=%default].")

    parser.add_option("-u",
                      "--with-utr",
                      dest="with_utr",
                      action="store_true",
                      help="include utr in merged transcripts "
                      "[default=%default].")

    parser.add_option("--intersect-transcripts",
                      dest="intersect_transcripts",
                      action="store_true",
                      help="intersect all transcripts within a gene. "
                      "The entry will only span those bases "
                      "that are covered by all transcrips."
                      "The transcript does not include the UTR unless "
                      "--with-utr is set. This method "
                      "will remove all other features (stop_codon, etc.) "
                      "The input needs to be sorted by gene. "
                      "[default=%default].")

    parser.add_option("-i",
                      "--merge-introns",
                      dest="merge_introns",
                      action="store_true",
                      help="merge and output all introns within a "
                      "gene. The output will contain "
                      "all intronic regions within a gene. Single exon genes "
                      "are skipped. "
                      "The input needs to be sorted by gene. "
                      "[default=%default].")

    parser.add_option("-g",
                      "--set-transcript-to-gene",
                      "--set-transcript2gene",
                      dest="set_transcript2gene",
                      action="store_true",
                      help="set the transcript_id to the "
                      "gene_id [default=%default].")

    parser.add_option("--set-protein-to-transcript",
                      dest="set_protein2transcript",
                      action="store_true",
                      help="set the protein_id to the "
                      "transcript_id [default=%default].")

    parser.add_option("--add-protein-id",
                      dest="add_protein_id",
                      type="string",
                      help="add a protein_id for each transcript_id. "
                      "The argument is a filename containing a mapping "
                      "between "
                      "transcript_id to protein_id [default=%default].")

    parser.add_option("-G",
                      "--set-gene-to-transcript",
                      "--set-gene2transcript",
                      dest="set_gene2transcript",
                      action="store_true",
                      help="set the gene_id to the "
                      "transcript_id [default=%default].")

    parser.add_option("-d",
                      "--set-score2distance",
                      dest="set_score2distance",
                      action="store_true",
                      help="set the score field for each feature to the "
                      "distance to "
                      "transcription start site [default=%default].")

    parser.add_option("--exons2introns",
                      dest="exons2introns",
                      action="store_true",
                      help="for each gene build an 'intronic' transcript "
                      "containing the union of all intronic regions "
                      "of all transcripts in a gene."
                      "The features are labeled as 'intron'."
                      "The input needs to be sorted by gene. "
                      "[default=%default].")

    parser.add_option("-f",
                      "--filter",
                      dest="filter",
                      type="choice",
                      choices=("gene", "transcript", "longest-gene",
                               "longest-transcript",
                               "representative-transcript"),
                      help="apply a filter to the input file. Available "
                      "filters are: "
                      "'gene': filter by gene_id, "
                      "'transcript': filter by transcript_id, "
                      "'longest-gene': output the longest gene for "
                      "overlapping genes ,"
                      "'longest-transcript': output the longest "
                      "transcript per gene,"
                      "'representative-transcript': output the "
                      "representative transcript per gene. "
                      "The representative transcript is the transcript "
                      "that shares most exons with "
                      "the other transcripts in a gene. "
                      "The input needs to be sorted by gene. "
                      "[default=%default].")

    parser.add_option("-r",
                      "--rename",
                      dest="rename",
                      type="choice",
                      choices=("gene", "transcript"),
                      help="rename genes or transcripts with a map "
                      "given by the option `--apply`. "
                      "Those that can not be renamed are removed "
                      "[default=%default].")

    parser.add_option("--renumber-genes",
                      dest="renumber_genes",
                      type="string",
                      help="renumber genes according to the given pattern. "
                      "[default=%default].")

    parser.add_option("--renumber-transcripts",
                      dest="renumber_transcripts",
                      type="string",
                      help="renumber transcripts according to the "
                      "given pattern. "
                      "[default=%default].")

    parser.add_option("-a",
                      "--apply",
                      dest="filename_filter",
                      type="string",
                      metavar="tsv",
                      help="filename of ids to map/filter [default=%default].")

    parser.add_option("--invert-filter",
                      dest="invert_filter",
                      action="store_true",
                      help="when using --filter, invert selection "
                      "(like grep -v). "
                      "[default=%default].")

    parser.add_option("--sample-size",
                      dest="sample_size",
                      type="int",
                      help="extract a random sample of size # if the option "
                      "'--filter' is set[default=%default].")

    parser.add_option("--intron-min-length",
                      dest="intron_min_length",
                      type="int",
                      help="minimum length for introns (for --exons2introns) "
                      "[default=%default].")

    parser.add_option("--min-exons-length",
                      dest="min_exons_length",
                      type="int",
                      help="minimum length for gene (sum of exons) "
                      "(--sample-size) [default=%default].")

    parser.add_option(
        "--intron-border",
        dest="intron_border",
        type="int",
        help="number of residues to exclude at intron at either end "
        "(--exons2introns) [default=%default].")

    parser.add_option("--transcripts2genes",
                      dest="transcripts2genes",
                      action="store_true",
                      help="cluster overlapping transcripts into genes.")

    parser.add_option("--reset-strand",
                      dest="reset_strand",
                      action="store_true",
                      help="remove strandedness of features (set to '.') when "
                      "using --transcripts2genes"
                      "[default=%default].")

    parser.add_option("--remove-overlapping",
                      dest="remove_overlapping",
                      type="string",
                      metavar="gff",
                      help="remove all transcripts that overlap intervals "
                      "in a gff-formatted file."
                      "The comparison ignores strand "
                      "[default=%default].")

    parser.add_option("--permit-duplicates",
                      dest="strict",
                      action="store_false",
                      help="permit duplicate genes. "
                      "[default=%default]")

    parser.add_option("--remove-duplicates",
                      dest="remove_duplicates",
                      type="choice",
                      choices=("gene", "transcript", "ucsc", "coordinates"),
                      help="remove duplicates by gene/transcript. "
                      "If ``ucsc`` is chosen, transcripts ending on _dup# are "
                      "removed. This is necessary to remove duplicate entries "
                      "that are next to each other in the sort order "
                      "[%default]")

    parser.add_option("--rename-duplicates",
                      dest="rename_duplicates",
                      action="store_true",
                      help="rename duplicate gene_ids and transcript_ids by "
                      "addition of a numerical suffix")

    parser.set_defaults(
        sort=None,
        merge_exons=False,
        join_exons=False,
        merge_exons_distance=0,
        merge_transcripts=False,
        set_score2distance=False,
        set_gene2transcript=False,
        set_transcript2gene=False,
        set_protein2transcript=False,
        add_protein_id=None,
        filename_filter=None,
        filter=None,
        exons2introns=None,
        merge_genes=False,
        intron_border=None,
        intron_min_length=None,
        sample_size=0,
        min_exons_length=0,
        transripts2genes=False,
        reset_strand=False,
        with_utr=False,
        invert_filter=False,
        remove_duplicates=None,
        remove_overlapping=None,
        renumber_genes=None,
        unset_genes=None,
        renumber_transcripts=None,
        strict=True,
        intersect_transcripts=False,
        rename_duplicates=False,
    )

    (options, args) = E.Start(parser, argv=argv)

    ninput, noutput, nfeatures, ndiscarded = 0, 0, 0, 0

    if options.set_transcript2gene:

        for gff in GTF.iterator(options.stdin):

            ninput += 1

            gff.setAttribute("transcript_id", gff.gene_id)
            options.stdout.write("%s\n" % str(gff))

            noutput += 1
            nfeatures += 1

    elif options.remove_duplicates:

        counts = collections.defaultdict(int)

        if options.remove_duplicates == "ucsc":
            store = []
            remove = set()
            f = lambda x: x[0].transcript_id

            gffs = GTF.transcript_iterator(GTF.iterator(options.stdin),
                                           strict=False)
            outf = lambda x: "\n".join([str(y) for y in x])

            for entry in gffs:
                ninput += 1
                store.append(entry)
                id = f(entry)
                if "_dup" in id:
                    remove.add(re.sub("_dup\d+", "", id))
                    remove.add(id)

            for entry in store:
                id = f(entry)
                if id not in remove:
                    options.stdout.write(outf(entry) + "\n")
                    noutput += 1
                else:
                    ndiscarded += 1
                    E.info("discarded duplicates for %s" % (id))
        else:

            if options.remove_duplicates == "gene":
                gffs = GTF.gene_iterator(GTF.iterator(options.stdin),
                                         strict=False)
                f = lambda x: x[0][0].gene_id
                outf = lambda x: "\n".join(
                    ["\n".join([str(y) for y in xx]) for xx in x])
            elif options.remove_duplicates == "transcript":
                gffs = GTF.transcript_iterator(GTF.iterator(options.stdin),
                                               strict=False)
                f = lambda x: x[0].transcript_id
                outf = lambda x: "\n".join([str(y) for y in x])
            elif options.remove_duplicates == "coordinates":
                gffs = GTF.chunk_iterator(GTF.iterator(options.stdin))
                f = lambda x: x[0].contig + "_" + \
                    str(x[0].start) + "-" + str(x[0].end)
                outf = lambda x: "\n".join([str(y) for y in x])

            store = []

            for entry in gffs:
                ninput += 1
                store.append(entry)
                id = f(entry)
                counts[id] += 1

            # Assumes GTF file sorted by contig then start
            last_id = ""
            if options.remove_duplicates == "coordinates":
                for entry in store:
                    id = f(entry)
                    if id == last_id:
                        ndiscarded += 1
                        E.info("discarded duplicates for %s: %i" %
                               (id, counts[id]))
                    else:
                        options.stdout.write(outf(entry) + "\n")
                        noutput += 1
                    last_id = id

            else:
                for entry in store:
                    id = f(entry)
                    if counts[id] == 1:
                        options.stdout.write(outf(entry) + "\n")
                        noutput += 1
                    else:
                        ndiscarded += 1
                        E.info("discarded duplicates for %s: %i" %
                               (id, counts[id]))

    elif options.sort:

        for gff in GTF.iterator_sorted(GTF.iterator(options.stdin),
                                       sort_order=options.sort):
            ninput += 1
            options.stdout.write("%s\n" % str(gff))
            noutput += 1
            nfeatures += 1

    elif options.set_gene2transcript:

        for gff in GTF.iterator(options.stdin):

            ninput += 1

            gff.setAttribute("gene_id", gff.transcript_id)
            options.stdout.write("%s\n" % str(gff))

            noutput += 1
            nfeatures += 1

    elif options.set_protein2transcript:

        for gff in GTF.iterator(options.stdin):
            ninput += 1
            gff.setAttribute("protein_id", gff.transcript_id)
            options.stdout.write("%s\n" % str(gff))
            noutput += 1
            nfeatures += 1

    elif options.add_protein_id:

        transcript2protein = IOTools.readMap(open(options.add_protein_id, "r"))

        missing = set()
        for gff in GTF.iterator(options.stdin):
            ninput += 1
            if gff.transcript_id not in transcript2protein:
                if gff.transcript_id not in missing:
                    E.debug(("removing transcript '%s' due to "
                             "missing protein id") % gff.transcript_id)
                    missing.add(gff.transcript_id)
                ndiscarded += 1
                continue

            gff.setAttribute("protein_id",
                             transcript2protein[gff.transcript_id])
            options.stdout.write("%s\n" % str(gff))
            noutput += 1
            nfeatures += 1

        E.info("transcripts removed due to missing protein ids: %i" %
               len(missing))

    elif options.join_exons:

        for exons in GTF.transcript_iterator(GTF.iterator(options.stdin)):
            ninput += 1
            strand = Genomics.convertStrand(exons[0].strand)
            contig = exons[0].contig
            transid = exons[0].transcript_id
            geneid = exons[0].gene_id
            biotype = exons[0].source
            all_start, all_end = min([x.start for x in exons
                                      ]), max([x.end for x in exons])
            y = GTF.Entry()
            y.contig = contig
            y.source = biotype
            y.feature = "transcript"
            y.start = all_start
            y.end = all_end
            y.strand = strand
            y.transcript_id = transid
            y.gene_id = geneid
            options.stdout.write("%s\n" % str(y))

    elif options.merge_genes:
        # merges overlapping genes
        #
        gffs = GTF.iterator_sorted_chunks(GTF.flat_gene_iterator(
            GTF.iterator(options.stdin)),
                                          sort_by="contig-strand-start")

        def iterate_chunks(gff_chunks):

            last = gff_chunks.next()
            to_join = [last]

            for gffs in gff_chunks:
                d = gffs[0].start - last[-1].end

                if gffs[0].contig == last[0].contig and \
                   gffs[0].strand == last[0].strand:
                    assert gffs[0].start >= last[0].start, \
                        ("input file should be sorted by contig, strand "
                         "and position: d=%i:\nlast=\n%s\nthis=\n%s\n") % \
                        (d,
                         "\n".join([str(x) for x in last]),
                         "\n".join([str(x) for x in gffs]))

                if gffs[0].contig != last[0].contig or \
                        gffs[0].strand != last[0].strand or \
                        d > 0:
                    yield to_join
                    to_join = []

                last = gffs
                to_join.append(gffs)

            yield to_join
            raise StopIteration

        for chunks in iterate_chunks(gffs):
            ninput += 1
            if len(chunks) > 1:
                gene_id = "merged_%s" % chunks[0][0].gene_id
                transcript_id = "merged_%s" % chunks[0][0].transcript_id
                info = ",".join([x[0].gene_id for x in chunks])
            else:
                gene_id = chunks[0][0].gene_id
                transcript_id = chunks[0][0].transcript_id
                info = None

            intervals = []
            for c in chunks:
                intervals += [(x.start, x.end) for x in c]

            intervals = Intervals.combine(intervals)
            # take single strand
            strand = chunks[0][0].strand

            for start, end in intervals:
                y = GTF.Entry()
                y.fromGTF(chunks[0][0], gene_id, transcript_id)
                y.start = start
                y.end = end
                y.strand = strand

                if info:
                    y.addAttribute("merged", info)
                options.stdout.write("%s\n" % str(y))
                nfeatures += 1

            noutput += 1

    elif options.renumber_genes:

        map_old2new = {}
        for gtf in GTF.iterator(options.stdin):
            ninput += 1
            if gtf.gene_id not in map_old2new:
                map_old2new[gtf.gene_id] = options.renumber_genes % (
                    len(map_old2new) + 1)
            gtf.setAttribute("gene_id", map_old2new[gtf.gene_id])
            options.stdout.write("%s\n" % str(gtf))
            noutput += 1

    elif options.unset_genes:

        map_old2new = {}
        for gtf in GTF.iterator(options.stdin):
            ninput += 1
            key = gtf.transcript_id
            if key not in map_old2new:
                map_old2new[key] = options.unset_genes % (len(map_old2new) + 1)
            gtf.setAttribute("gene_id", map_old2new[key])
            options.stdout.write("%s\n" % str(gtf))
            noutput += 1

    elif options.renumber_transcripts:

        map_old2new = {}
        for gtf in GTF.iterator(options.stdin):
            ninput += 1
            key = (gtf.gene_id, gtf.transcript_id)
            if key not in map_old2new:
                map_old2new[key] = options.renumber_transcripts % (
                    len(map_old2new) + 1)
            gtf.setAttribute("transcript_id", map_old2new[key])
            options.stdout.write("%s\n" % str(gtf))
            noutput += 1

    elif options.transcripts2genes:

        transcripts = set()
        genes = set()
        reset_strand = options.reset_strand
        for gtfs in GTF.iterator_transcripts2genes(GTF.iterator(
                options.stdin)):

            ninput += 1
            for gtf in gtfs:
                if reset_strand:
                    gtf.strand = "."
                options.stdout.write("%s\n" % str(gtf))
                transcripts.add(gtf.transcript_id)
                genes.add(gtf.gene_id)
                nfeatures += 1
            noutput += 1

        E.info("transcripts2genes: transcripts=%i, genes=%i" %
               (len(transcripts), len(genes)))

    elif options.rename:

        map_old2new = IOTools.readMap(open(options.filename_filter, "r"))

        if options.rename == "transcript":
            is_gene_id = False
        elif options.rename == "gene":
            is_gene_id = True

        for gff in GTF.iterator(options.stdin):
            ninput += 1

            if is_gene_id:
                if gff.gene_id in map_old2new:
                    gff.setAttribute("gene_id", map_old2new[gff.gene_id])
                else:
                    E.debug("removing missing gene_id %s" % gff.gene_id)
                    ndiscarded += 1
                    continue

            else:
                if gff.transcript_id in map_old2new:
                    gff.setAttribute("transcript_id",
                                     map_old2new[gff.transcript_id])
                else:
                    E.debug("removing missing transcript_id %s" %
                            gff.transcript_id)
                    ndiscarded += 1
                    continue

            noutput += 1
            options.stdout.write("%s\n" % str(gff))

    elif options.filter:

        keep_genes = set()
        if options.filter == "longest-gene":
            iterator = GTF.flat_gene_iterator(GTF.iterator(options.stdin))
            coords = []
            gffs = []
            for gff in iterator:
                gff.sort(key=lambda x: x.start)
                coords.append((gff[0].contig, min([x.start for x in gff]),
                               max([x.end for x in gff]), gff[0].gene_id))
                gffs.append(gff)
            coords.sort()

            last_contig = None
            max_end = 0
            longest_gene_id = None
            longest_length = None

            for contig, start, end, gene_id in coords:
                ninput += 1
                if contig != last_contig or start >= max_end:
                    if longest_gene_id:
                        keep_genes.add(longest_gene_id)
                    longest_gene_id = gene_id
                    longest_length = end - start
                    max_end = end
                else:
                    if end - start > longest_length:
                        longest_length, longest_gene_id = end - start, gene_id
                last_contig = contig
                max_end = max(max_end, end)

            keep_genes.add(longest_gene_id)
            invert = options.invert_filter
            for gff in gffs:
                keep = gff[0].gene_id in keep_genes

                if (keep and not invert) or (not keep and invert):
                    noutput += 1
                    for g in gff:
                        nfeatures += 1
                        options.stdout.write("%s\n" % g)
                else:
                    ndiscarded += 1
        elif options.filter in ("longest-transcript",
                                "representative-transcript"):

            iterator = GTF.gene_iterator(GTF.iterator(options.stdin))

            def selectLongestTranscript(gene):
                r = []
                for transcript in gene:
                    transcript.sort(key=lambda x: x.start)
                    length = transcript[-1].end - transcript[0].start
                    r.append((length, transcript))
                r.sort()
                return r[-1][1]

            def selectRepresentativeTranscript(gene):
                '''select a representative transcript.

                The representative transcript represent the largest number
                of exons over all transcripts.
                '''
                all_exons = []
                for transcript in gene:
                    all_exons.extend([(x.start, x.end) for x in transcript
                                      if x.feature == "exon"])
                exon_counts = {}
                for key, exons in itertools.groupby(all_exons):
                    exon_counts[key] = len(list(exons))
                transcript_counts = []
                for transcript in gene:
                    count = sum([
                        exon_counts[(x.start, x.end)] for x in transcript
                        if x.feature == "exon"
                    ])
                    transcript_counts.append((count, transcript))
                transcript_counts.sort()
                return transcript_counts[-1][1]

            if options.filter == "longest-transcript":
                _select = selectLongestTranscript
            elif options.filter == "representative-transcript":
                _select = selectRepresentativeTranscript

            for gene in iterator:
                ninput += 1
                # sort in order to make reproducible which
                # gene is chosen.
                transcript = _select(sorted(gene))
                noutput += 1
                for g in transcript:
                    nfeatures += 1
                    options.stdout.write("%s\n" % g)

        elif options.filter in ("gene", "transcript"):

            if options.filename_filter:

                ids, nerrors = IOTools.ReadList(
                    open(options.filename_filter, "r"))
                E.info("read %i ids" % len(ids))

                ids = set(ids)
                by_gene = options.filter == "gene"
                by_transcript = options.filter == "transcript"
                invert = options.invert_filter

                reset_strand = options.reset_strand
                for gff in GTF.iterator(options.stdin):

                    ninput += 1

                    keep = False
                    if by_gene:
                        keep = gff.gene_id in ids
                    if by_transcript:
                        keep = gff.transcript_id in ids
                    if (invert and keep) or (not invert and not keep):
                        continue

                    if reset_strand:
                        gff.strand = "."

                    options.stdout.write("%s\n" % str(gff))
                    nfeatures += 1
                    noutput += 1

            elif options.sample_size:

                if options.filter == "gene":
                    iterator = GTF.flat_gene_iterator(
                        GTF.iterator(options.stdin))
                elif options.filter == "transcript":
                    iterator = GTF.transcript_iterator(
                        GTF.iterator(options.stdin))
                if options.min_exons_length:
                    iterator = GTF.iterator_min_feature_length(
                        iterator,
                        min_length=options.min_exons_length,
                        feature="exon")

                data = [x for x in iterator]
                ninput = len(data)
                if len(data) > options.sample_size:
                    data = random.sample(data, options.sample_size)

                for d in data:
                    noutput += 1
                    for dd in d:
                        nfeatures += 1
                        options.stdout.write(str(dd) + "\n")

            else:
                assert False, "please supply either a filename "
                "with ids to filter with (--apply) or a sample-size."

    elif options.exons2introns:

        for gffs in GTF.flat_gene_iterator(GTF.iterator(options.stdin)):

            ninput += 1

            cds_ranges = GTF.asRanges(gffs, "CDS")
            exon_ranges = GTF.asRanges(gffs, "exon")
            input_ranges = Intervals.combine(cds_ranges + exon_ranges)

            if len(input_ranges) > 1:
                last = input_ranges[0][1]
                output_ranges = []
                for start, end in input_ranges[1:]:
                    output_ranges.append((last, start))
                    last = end

                if options.intron_border:
                    b = options.intron_border
                    output_ranges = [(x[0] + b, x[1] - b)
                                     for x in output_ranges]

                if options.intron_min_length:
                    l = options.intron_min_length
                    output_ranges = [
                        x for x in output_ranges if x[1] - x[0] > l
                    ]

                for start, end in output_ranges:

                    entry = GTF.Entry()
                    entry.copy(gffs[0])
                    entry.clearAttributes()
                    entry.transcript_id = "merged"
                    entry.feature = "intron"
                    entry.start = start
                    entry.end = end
                    options.stdout.write("%s\n" % str(entry))
                    nfeatures += 1
                noutput += 1
            else:
                ndiscarded += 1

    elif options.set_score2distance:

        for gffs in GTF.transcript_iterator(GTF.iterator(options.stdin)):
            ninput += 1
            strand = Genomics.convertStrand(gffs[0].strand)
            all_start, all_end = min([x.start for x in gffs
                                      ]), max([x.end for x in gffs])

            if strand != ".":
                t = 0
                if strand == "-":
                    gffs.reverse()
                for gff in gffs:
                    gff.score = t
                    t += gff.end - gff.start

                if strand == "-":
                    gffs.reverse()
            for gff in gffs:
                options.stdout.write("%s\n" % str(gff))
                nfeatures += 1
            noutput += 1

    elif options.remove_overlapping:

        index = GTF.readAndIndex(
            GTF.iterator(IOTools.openFile(options.remove_overlapping, "r")))

        for gffs in GTF.transcript_iterator(GTF.iterator(options.stdin)):
            ninput += 1
            found = False
            for e in gffs:
                if index.contains(e.contig, e.start, e.end):
                    found = True
                    break

            if found:
                ndiscarded += 1
            else:
                noutput += 1
                for e in gffs:
                    nfeatures += 1
                    options.stdout.write("%s\n" % str(e))

    elif options.intersect_transcripts:

        for gffs in GTF.gene_iterator(GTF.iterator(options.stdin),
                                      strict=options.strict):

            ninput += 1
            r = []
            for g in gffs:
                if options.with_utr:
                    ranges = GTF.asRanges(g, "exon")
                else:
                    ranges = GTF.asRanges(g, "CDS")
                r.append(ranges)

            result = r[0]
            for x in r[1:]:
                result = Intervals.intersect(result, x)

            entry = GTF.Entry()
            entry.copy(gffs[0][0])
            entry.clearAttributes()
            entry.transcript_id = "merged"
            entry.feature = "exon"
            for start, end in result:
                entry.start = start
                entry.end = end
                options.stdout.write("%s\n" % str(entry))
                nfeatures += 1

            noutput += 1

    elif options.rename_duplicates:

        gene_ids = list()
        transcript_ids = list()
        gtfs = list()

        for gtf in GTF.iterator(options.stdin):
            gtfs.append(gtf)
            if gtf.feature == "CDS":
                gene_ids.append(gtf.gene_id)
                transcript_ids.append(gtf.transcript_id)

        dup_gene = [item for item in set(gene_ids) if gene_ids.count(item) > 1]
        dup_transcript = [
            item for item in set(transcript_ids)
            if transcript_ids.count(item) > 1
        ]

        E.info("Number of duplicated gene_ids: %i" % len(dup_gene))
        E.info("Number of duplicated transcript_ids: %i" % len(dup_transcript))

        gene_dict = dict(zip(dup_gene, ([0] * len(dup_gene))))
        transcript_dict = dict(zip(dup_transcript,
                                   ([0] * len(dup_transcript))))

        for gtf in gtfs:
            if gtf.feature == "CDS":
                if gtf.gene_id in dup_gene:
                    gene_dict[gtf.gene_id] = gene_dict[gtf.gene_id] + 1
                    gtf.setAttribute(
                        'gene_id',
                        gtf.gene_id + "." + str(gene_dict[gtf.gene_id]))

                if gtf.transcript_id in dup_transcript:
                    transcript_dict[gtf.transcript_id] = \
                        transcript_dict[gtf.transcript_id] + 1
                    gtf.setAttribute(
                        'transcript_id', gtf.transcript_id + "." +
                        str(transcript_dict[gtf.transcript_id]))

            options.stdout.write("%s\n" % gtf)

    else:
        for gffs in GTF.flat_gene_iterator(GTF.iterator(options.stdin),
                                           strict=options.strict):

            ninput += 1

            cds_ranges = GTF.asRanges(gffs, "CDS")
            exon_ranges = GTF.asRanges(gffs, "exon")

            # sanity checks
            strands = set([x.strand for x in gffs])
            contigs = set([x.contig for x in gffs])
            if len(strands) > 1:
                raise ValueError(
                    "can not merge gene '%s' on multiple strands: %s" %
                    (gffs[0].gene_id, str(strands)))

            if len(contigs) > 1:
                raise ValueError(
                    "can not merge gene '%s' on multiple contigs: %s" %
                    (gffs[0].gene_id, str(contigs)))

            strand = Genomics.convertStrand(gffs[0].strand)

            if cds_ranges and options.with_utr:
                cds_start, cds_end = cds_ranges[0][0], cds_ranges[-1][1]
                midpoint = (cds_end - cds_start) / 2 + cds_start

                utr_ranges = []
                for start, end in Intervals.truncate(exon_ranges, cds_ranges):
                    if end - start > 3:
                        if strand == ".":
                            feature = "UTR"
                        elif strand == "+":
                            if start < midpoint:
                                feature = "UTR5"
                            else:
                                feature = "UTR3"
                        elif strand == "-":
                            if start < midpoint:
                                feature = "UTR3"
                            else:
                                feature = "UTR5"
                        utr_ranges.append((feature, start, end))
                output_feature = "CDS"
                output_ranges = cds_ranges
            else:
                output_feature = "exon"
                output_ranges = exon_ranges
                utr_ranges = []

            result = []

            if options.merge_exons:
                # need to combine per feature - skip
                # utr_ranges = Intervals.combineAtDistance(
                # utr_ranges,
                # options.merge_exons_distance)

                output_ranges = Intervals.combineAtDistance(
                    output_ranges, options.merge_exons_distance)

                for feature, start, end in utr_ranges:
                    entry = GTF.Entry()
                    entry.copy(gffs[0])
                    entry.clearAttributes()
                    entry.feature = feature
                    entry.transcript_id = "merged"
                    entry.start = start
                    entry.end = end
                    result.append(entry)

                for start, end in output_ranges:

                    entry = GTF.Entry()
                    entry.copy(gffs[0])
                    entry.clearAttributes()
                    entry.transcript_id = "merged"
                    entry.feature = output_feature
                    entry.start = start
                    entry.end = end
                    result.append(entry)

            elif options.merge_transcripts:

                entry = GTF.Entry()
                entry.copy(gffs[0])
                entry.clearAttributes()
                entry.transcript_id = entry.gene_id
                entry.start = output_ranges[0][0]
                entry.end = output_ranges[-1][1]
                result.append(entry)

            elif options.merge_introns:

                if len(output_ranges) >= 2:
                    entry = GTF.Entry()
                    entry.copy(gffs[0])
                    entry.clearAttributes()
                    entry.transcript_id = entry.gene_id
                    entry.start = output_ranges[0][1]
                    entry.end = output_ranges[-1][0]
                    result.append(entry)
                else:
                    ndiscarded += 1
                    continue

            result.sort(key=lambda x: x.start)

            for x in result:
                options.stdout.write("%s\n" % str(x))
                nfeatures += 1
            noutput += 1

    E.info("ninput=%i, noutput=%i, nfeatures=%i, ndiscarded=%i" %
           (ninput, noutput, nfeatures, ndiscarded))
    E.Stop()
コード例 #36
0
ファイル: gtf2gff.py プロジェクト: gsc0107/cgat
def annotateGenes(iterator, fasta, options):
    """annotate gene structures

    This method outputs intervals for first/middle/last exon/intron,
    UTRs and flanking regions.

    This method annotates per transcript. In order to achieve a unique tiling,
    use only a single transcript per gene and remove any overlap between
    genes.

    """

    gene_iterator = GTF.gene_iterator(iterator)

    ngenes, ntranscripts, nskipped = 0, 0, 0

    results = []
    increment = options.increment

    introns_detail = "introns" in options.detail
    exons_detail = "exons" in options.detail

    for gene in gene_iterator:
        ngenes += 1
        is_negative_strand = Genomics.IsNegativeStrand(gene[0][0].strand)
        try:
            lcontig = fasta.getLength(gene[0][0].contig)
        except KeyError:
            nskipped += 1
            continue

        results = []

        for transcript in gene:

            def _add(interval, anno):
                gtf = GTF.Entry()
                gtf.contig = transcript[0].contig
                gtf.gene_id = transcript[0].gene_id
                gtf.transcript_id = transcript[0].transcript_id
                gtf.strand = transcript[0].strand
                gtf.feature = anno
                gtf.start, gtf.end = interval
                results.append(gtf)

            ntranscripts += 1

            exons = [(x.start, x.end)
                     for x in transcript if x.feature == "exon"]
            if len(exons) == 0:
                nskipped += 1

            exons.sort()
            introns = []
            end = exons[0][1]
            for exon in exons[1:]:
                introns.append((end, exon[0]))
                end = exon[1]

            # add flank
            start, end = exons[0][0], exons[-1][1]
            upstream, downstream = [], []
            for x in range(0, options.flank, increment):
                upstream.append((start - increment, start))
                start -= increment
                downstream.append((end, end + increment))
                end += increment

            # remove out-of-bounds coordinates
            upstream = [x for x in upstream if x[0] >= 0]
            downstream = [x for x in downstream if x[1] <= lcontig]

            if is_negative_strand:
                exons.reverse()
                introns.reverse()
                upstream, downstream = downstream, upstream

            # add exons
            if exons_detail:
                _add(exons[0], "first_exon")
                if len(exons) > 1:
                    _add(exons[-1], "last_exon")
                for e in exons[1:-1]:
                    _add(e, "middle_exon")
            else:
                for e in exons:
                    _add(e, "exon")

            # add introns
            if introns_detail:
                if len(introns) > 0:
                    _add(introns[0], "first_intron")
                if len(introns) > 1:
                    _add(introns[-1], "last_intron")
                for i in introns[1:-1]:
                    _add(i, "middle_intron")
            else:
                for i in introns:
                    _add(i, "intron")

            for x, u in enumerate(upstream):
                _add(u, "upstream_%i" % (increment * (x + 1)))

            for x, u in enumerate(downstream):
                _add(u, "downstream_%i" % (increment * (x + 1)))

            results.sort(key=lambda x: x.feature)

        cache = []
        for key, vals in itertools.groupby(results, key=lambda x: x.feature):
            v = list(vals)
            intervals = [(x.start, x.end) for x in v]
            intervals = Intervals.combine(intervals)

            for start, end in intervals:
                r = GTF.Entry()
                r.copy(v[0])
                r.start, r.end = start, end
                cache.append(r)

        cache.sort(key=lambda x: x.start)
        for r in cache:
            options.stdout.write("%s\n" % str(r))

    E.info("ngenes=%i, ntranscripts=%i, nskipped=%i\n" %
           (ngenes, ntranscripts, nskipped))
コード例 #37
0
ファイル: gtf2gff.py プロジェクト: Charlie-George/cgat
def annotateGenes(iterator, fasta, options):
    """annotate gene structures

    This method outputs intervals for first/middle/last exon/intron,
    UTRs and flanking regions.

    This method annotates per transcript. In order to achieve a unique tiling,
    use only a single transcript per gene and remove any overlap between
    genes.

    """

    gene_iterator = GTF.gene_iterator(iterator)

    ngenes, ntranscripts, nskipped = 0, 0, 0

    results = []
    increment = options.increment

    introns_detail = "introns" in options.detail
    exons_detail = "exons" in options.detail

    for gene in gene_iterator:
        ngenes += 1
        is_negative_strand = Genomics.IsNegativeStrand(gene[0][0].strand)
        try:
            lcontig = fasta.getLength(gene[0][0].contig)
        except KeyError:
            nskipped += 1
            continue

        results = []

        for transcript in gene:

            def _add(interval, anno):
                gtf = GTF.Entry()
                gtf.contig = transcript[0].contig
                gtf.gene_id = transcript[0].gene_id
                gtf.transcript_id = transcript[0].transcript_id
                gtf.strand = transcript[0].strand
                gtf.feature = anno
                gtf.start, gtf.end = interval
                results.append(gtf)

            ntranscripts += 1

            exons = [(x.start, x.end)
                     for x in transcript if x.feature == "exon"]
            if len(exons) == 0:
                nskipped += 1

            exons.sort()
            introns = []
            end = exons[0][1]
            for exon in exons[1:]:
                introns.append((end, exon[0]))
                end = exon[1]

            # add flank
            start, end = exons[0][0], exons[-1][1]
            upstream, downstream = [], []
            for x in xrange(0, options.flank, increment):
                upstream.append((start - increment, start))
                start -= increment
                downstream.append((end, end + increment))
                end += increment

            # remove out-of-bounds coordinates
            upstream = [x for x in upstream if x[0] >= 0]
            downstream = [x for x in downstream if x[1] <= lcontig]

            if is_negative_strand:
                exons.reverse()
                introns.reverse()
                upstream, downstream = downstream, upstream

            # add exons
            if exons_detail:
                _add(exons[0], "first_exon")
                if len(exons) > 1:
                    _add(exons[-1], "last_exon")
                for e in exons[1:-1]:
                    _add(e, "middle_exon")
            else:
                for e in exons:
                    _add(e, "exon")

            # add introns
            if introns_detail:
                if len(introns) > 0:
                    _add(introns[0], "first_intron")
                if len(introns) > 1:
                    _add(introns[-1], "last_intron")
                for i in introns[1:-1]:
                    _add(i, "middle_intron")
            else:
                for i in introns:
                    _add(i, "intron")

            for x, u in enumerate(upstream):
                _add(u, "upstream_%i" % (increment * (x + 1)))

            for x, u in enumerate(downstream):
                _add(u, "downstream_%i" % (increment * (x + 1)))

            results.sort(key=lambda x: x.feature)

        cache = []
        for key, vals in itertools.groupby(results, key=lambda x: x.feature):
            v = list(vals)
            intervals = [(x.start, x.end) for x in v]
            intervals = Intervals.combine(intervals)

            for start, end in intervals:
                r = GTF.Entry()
                r.copy(v[0])
                r.start, r.end = start, end
                cache.append(r)

        cache.sort(key=lambda x: x.start)
        for r in cache:
            options.stdout.write("%s\n" % str(r))

    E.info("ngenes=%i, ntranscripts=%i, nskipped=%i\n" %
           (ngenes, ntranscripts, nskipped))
コード例 #38
0
def readWorkspace(infile,
                  workspace_builder="raw",
                  label="none",
                  map_id2annotation={}):
    """read workspace from infile.

    A workspace is a collection of intervals with two labels associated
    to each interval, one for the 5' and one for the 3' end.

    Available workspace builders are:

    gff
       take a gff file.

    gtf-intergenic
       build workspace from intergenic segments in a gtf file.

    gtf-intronic
       build workspace from intronic segments in a gtf file

    gtf-genic
       the workspace is built from genes (first to last exon).

    Available labels are:

    none
       no labels are given to the ends of workspaces

    direction
       labels are given based on the 5'/3' end of the
       bounding exon

    annotation
       labels are given based on a gene2annotation map.

    returns a list of segments for each contig in a dictionary
    """

    if label == "none":
        label_f = lambda x, y: (("X", ), ("X", ))
        info_f = lambda x: None
    elif label == "direction":
        label_f = lambda x, y: ((("5", "3")[x], ), (("3", "5")[y], ))
        info_f = lambda x: x.strand == "+"
    elif label == "annotation":
        label_f = lambda x, y: (map_id2annotation[x], map_id2annotation[y])
        info_f = lambda x: x.gene_id

    if workspace_builder == "gff":
        workspace = GTF.readAsIntervals(GFF.iterator(infile))

    elif workspace_builder == "gtf-intergenic":

        workspace = collections.defaultdict(list)
        # get all genes
        for e in GTF.merged_gene_iterator(GTF.iterator(infile)):
            workspace[e.contig].append((e.start, e.end, info_f(e)))

        # convert to intergenic regions.
        # overlapping genes are merged and the labels
        # of the right-most entry is retained
        for contig in list(workspace.keys()):
            segs = workspace[contig]
            segs.sort()
            last = segs[0]
            new_segs = []
            for this in segs[1:]:
                if last[1] >= this[0]:
                    if this[1] > last[1]:
                        last = (last[0], this[1], this[2])
                    continue
                assert last[1] < this[0], "this=%s, last=%s" % (this, last)

                new_segs.append((last[1], this[0], label_f(last[2], this[2])))
                last = this
            workspace[contig] = new_segs

    elif workspace_builder == "gtf-intronic":

        workspace = collections.defaultdict(list)

        # the current procedure will count nested genes
        # twice
        for ee in GTF.flat_gene_iterator(GTF.iterator(infile)):

            exons = Intervals.combine([(e.start, e.end) for e in ee])
            introns = Intervals.complement(exons)

            r = ee[0]
            for start, end in introns:
                workspace[r.contig].append(
                    (start, end, label_f(info_f(r), info_f(r))))
    elif workspace_builder == "gtf-genic":

        workspace = collections.defaultdict(list)

        # the current procedure will count nested genes
        # twice
        for ee in GTF.flat_gene_iterator(GTF.iterator(infile)):

            exons = Intervals.combine([(e.start, e.end) for e in ee])
            start, end = exons[0][0], exons[-1][1]
            r = ee[0]
            workspace[r.contig].append(
                (start, end, label_f(info_f(r), info_f(r))))

    else:
        raise ValueError("unknown workspace_builder %s" % workspace_builder)

    return workspace
コード例 #39
0
ファイル: gff2fasta.py プロジェクト: SCV/cgat
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id$",
        usage=globals()["__doc__"])

    parser.add_option("--is-gtf", dest="is_gtf", action="store_true",
                      help="input is gtf instead of gff.")

    parser.add_option("-g", "--genome-file", dest="genome_file", type="string",
                      help="filename with genome [default=%default].")

    parser.add_option(
        "-m", "--merge-adjacent", dest="merge", action="store_true",
        help="merge adjacent intervals with the same attributes."
        " [default=%default]")

    parser.add_option(
        "-e", "--feature", dest="feature", type="string",
        help="filter by a feature, for example 'exon', 'CDS'."
        " If set to the empty string, all entries are output "
        "[%default].")

    parser.add_option(
        "-f", "--maskregions-bed-file", dest="filename_masks",
        type="string", metavar="gff",
        help="mask sequences with regions given in gff file "
        "[%default].")

    parser.add_option(
        "--remove-masked-regions", dest="remove_masked_regions",
        action="store_true",
        help="remove regions instead of masking [%default].")

    parser.add_option(
        "--min-interval-length", dest="min_length", type="int",
        help="set minimum length for sequences output "
        "[%default]")

    parser.add_option(
        "--max-length", dest="max_length", type="int",
        help="set maximum length for sequences output "
        "[%default]")

    parser.add_option(
        "--extend-at", dest="extend_at", type="choice",
        choices=("none", "3", "5", "both", "3only", "5only"),
        help="extend at no end, 3', 5' or both ends. If "
        "3only or 5only are set, only the added sequence "
        "is returned [default=%default]")

    parser.add_option(
        "--extend-by", dest="extend_by", type="int",
        help="extend by # bases [default=%default]")

    parser.add_option(
        "--extend-with", dest="extend_with", type="string",
        help="extend using base [default=%default]")

    parser.add_option(
        "--masker", dest="masker", type="choice",
        choices=("dust", "dustmasker", "softmask", "none"),
        help="apply masker [%default].")

    parser.add_option(
        "--fold-at", dest="fold_at", type="int",
        help="fold sequence every n bases[%default].")

    parser.add_option(
        "--fasta-name-attribute", dest="naming_attribute", type="string",
        help="use attribute to name fasta entry. Currently only compatable"
        " with gff format [%default].")

    parser.set_defaults(
        is_gtf=False,
        genome_file=None,
        merge=False,
        feature=None,
        filename_masks=None,
        remove_masked_regions=False,
        min_length=0,
        max_length=0,
        extend_at=None,
        extend_by=100,
        extend_with=None,
        masker=None,
        fold_at=None,
        naming_attribute=False
    )

    (options, args) = E.Start(parser)

    if options.genome_file:
        fasta = IndexedFasta.IndexedFasta(options.genome_file)
        contigs = fasta.getContigSizes()

    if options.is_gtf:
        iterator = GTF.transcript_iterator(GTF.iterator(options.stdin))
    else:
        gffs = GTF.iterator(options.stdin)
        if options.merge:
            iterator = GTF.joined_iterator(gffs)
        else:
            iterator = GTF.chunk_iterator(gffs)

    masks = None
    if options.filename_masks:
        masks = {}
        with open(options.filename_masks, "r") as infile:
            e = GTF.readAsIntervals(GTF.iterator(infile))

        # convert intervals to intersectors
        for contig in e.keys():
            intersector = bx.intervals.intersection.Intersecter()
            for start, end in e[contig]:
                intersector.add_interval(bx.intervals.Interval(start, end))
            masks[contig] = intersector

    ninput, noutput, nmasked, nskipped_masked = 0, 0, 0, 0
    nskipped_length = 0
    nskipped_noexons = 0

    feature = options.feature

#    for item in iterator:
# print len(item) # 3, 2
#	for i in item:
# print len(i) # 9, 9, 9, 9, 9
#	   print i.contig
#	   print i.strand
#	   print i.transcript_id

    # iterator is a list containing groups (lists) of features.
    # Each group of features have in common the same transcript ID, in case of
    # GTF files.
    for ichunk in iterator:

        ninput += 1

        if feature:
            chunk = filter(lambda x: x.feature == feature, ichunk)
        else:
            chunk = ichunk

        if len(chunk) == 0:
            nskipped_noexons += 1
            E.info("no features in entry from "
                   "%s:%i..%i - %s" % (ichunk[0].contig,
                                       ichunk[0].start,
                                       ichunk[0].end,
                                       str(ichunk[0])))
            continue

        contig, strand = chunk[0].contig, chunk[0].strand
        if options.is_gtf:
            name = chunk[0].transcript_id
        else:
            if options.naming_attribute:
                attr_dict = {x.split("=")[0]: x.split("=")[1]
                             for x in chunk[0].attributes.split(";")}
                name = attr_dict[options.naming_attribute]
            else:
                name = str(chunk[0].attributes)

        lcontig = contigs[contig]
        positive = Genomics.IsPositiveStrand(strand)
        intervals = [(x.start, x.end) for x in chunk]
        intervals.sort()

        if masks:
            if contig in masks:
                masked_regions = []
                for start, end in intervals:
                    masked_regions += [(x.start, x.end)
                                       for x in masks[contig].find(start, end)]

                masked_regions = Intervals.combine(masked_regions)
                if len(masked_regions):
                    nmasked += 1

                if options.remove_masked_regions:
                    intervals = Intervals.truncate(intervals, masked_regions)
                else:
                    raise "unimplemented"

                if len(intervals) == 0:
                    nskipped_masked += 1
                    if options.loglevel >= 1:
                        options.stdlog.write("# skipped because fully masked: "
                                             "%s: regions=%s masks=%s\n" %
                                             (name,
                                              str([(x.start,
                                                    x.end) for x in chunk]),
                                              masked_regions))
                    continue

        out = intervals

        if options.extend_at and not options.extend_with:
            if options.extend_at == "5only":
                intervals = [(max(0, intervals[0][0] - options.extend_by),
                              intervals[0][0])]
            elif options.extend_at == "3only":
                intervals = [(intervals[-1][1],
                              min(lcontig,
                                  intervals[-1][1] + options.extend_by))]
            else:
                if options.extend_at in ("5", "both"):
                    intervals[0] = (max(0,
                                        intervals[0][0] - options.extend_by),
                                    intervals[0][1])
                if options.extend_at in ("3", "both"):
                    intervals[-1] = (intervals[-1][0],
                                     min(lcontig,
                                         intervals[-1][1] + options.extend_by))

        if not positive:
            intervals = [(lcontig - x[1], lcontig - x[0])
                         for x in intervals[::-1]]
            out.reverse()

        s = [fasta.getSequence(contig, strand, start, end)
             for start, end in intervals]
        # IMS: allow for masking of sequences
        s = Masker.maskSequences(s, options.masker)
        l = sum([len(x) for x in s])
        if (l < options.min_length or
                (options.max_length and l > options.max_length)):
            nskipped_length += 1
            if options.loglevel >= 1:
                options.stdlog.write("# skipped because length out of bounds "
                                     "%s: regions=%s len=%i\n" %
                                     (name, str(intervals), l))
                continue

        if options.extend_at and options.extend_with:
            extension = "".join((options.extend_with,) * options.extend_by)

            if options.extend_at in ("5", "both"):
                s[1] = extension + s[1]
            if options.extend_at in ("3", "both"):
                s[-1] = s[-1] + extension

        if options.fold_at:
            n = options.fold_at
            s = "".join(s)
            seq = "\n".join([s[i:i+n] for i in range(0, len(s), n)])
        else:
            seq = "\n".join(s)

        options.stdout.write(">%s %s:%s:%s\n%s\n" % (name,
                                                     contig,
                                                     strand,
                                                     ";".join(
                                                         ["%i-%i" %
                                                          x for x in out]),
                                                     seq))

        noutput += 1

    E.info("ninput=%i, noutput=%i, nmasked=%i, nskipped_noexons=%i, "
           "nskipped_masked=%i, nskipped_length=%i" %
           (ninput, noutput, nmasked, nskipped_noexons,
            nskipped_masked, nskipped_length))

    E.Stop()
コード例 #40
0
ファイル: gtf2gtf.py プロジェクト: mmaarriiee/cgat
def main(argv=None):

    if not argv:
        argv = sys.argv

    parser = E.OptionParser(version="%prog version: $Id$", usage=globals()["__doc__"])

    parser.add_option(
        "--merge-exons-distance",
        dest="merge_exons_distance",
        type="int",
        help="distance in nucleotides between " "exons to be merged [%default].",
    )

    parser.add_option(
        "--pattern-identifier",
        dest="pattern",
        type="string",
        help="pattern to use for renaming genes/transcripts. "
        "The pattern should contain a %i, for example "
        "--pattern-identifier=ENSG%010i [%default].",
    )

    parser.add_option(
        "--sort-order",
        dest="sort_order",
        type="choice",
        choices=("gene", "gene+transcript", "transcript", "position", "contig+gene", "position+gene", "gene+position"),
        help="sort input data [%default].",
    )

    parser.add_option(
        "-u",
        "--with-utr",
        dest="with_utr",
        action="store_true",
        help="include utr in merged transcripts " "[%default].",
    )

    parser.add_option(
        "--filter-method",
        dest="filter_method",
        type="choice",
        choices=(
            "gene",
            "transcript",
            "longest-gene",
            "longest-transcript",
            "representative-transcript",
            "proteincoding",
            "lincrna",
        ),
        help="Filter method to apply. Available filters are: "
        "'gene': filter by gene_id given in ``--map-tsv-file``, "
        "'transcript': filter by transcript_id given in ``--map-tsv-file``, "
        "'longest-gene': output the longest gene for overlapping genes ,"
        "'longest-transcript': output the longest transcript per gene,"
        "'representative-transcript': output the representative transcript "
        "per gene. The representative transcript is the transcript "
        "that shares most exons with other transcripts in a gene. "
        "The input needs to be sorted by gene. "
        "'proteincoding': only output protein coding features. "
        "'lincrna': only output lincRNA features. "
        "[%default].",
    )

    parser.add_option(
        "-a",
        "--map-tsv-file",
        dest="filename_filter",
        type="string",
        metavar="tsv",
        help="filename of ids to map/filter [%default].",
    )

    parser.add_option(
        "--gff-file",
        dest="filename_gff",
        type="string",
        metavar="GFF",
        help="second filename of features (see --remove-overlapping) " "[%default]",
    )

    parser.add_option(
        "--invert-filter",
        dest="invert_filter",
        action="store_true",
        help="when using --filter, invert selection " "(like grep -v). " "[%default].",
    )

    parser.add_option(
        "--sample-size",
        dest="sample_size",
        type="int",
        help="extract a random sample of size # if the option "
        "'--method=filter --filter-method' is set "
        "[%default].",
    )

    parser.add_option(
        "--intron-min-length",
        dest="intron_min_length",
        type="int",
        help="minimum length for introns (for --exons-file2introns) " "[%default].",
    )

    parser.add_option(
        "--min-exons-length",
        dest="min_exons_length",
        type="int",
        help="minimum length for gene (sum of exons) " "(--sam-fileple-size) [%default].",
    )

    parser.add_option(
        "--intron-border",
        dest="intron_border",
        type="int",
        help="number of residues to exclude at intron at either end " "(--exons-file2introns) [%default].",
    )

    parser.add_option(
        "--ignore-strand",
        dest="ignore_strand",
        action="store_true",
        help="remove strandedness of features (set to '.') when "
        "using ``transcripts2genes`` or ``filter``"
        "[%default].",
    )

    parser.add_option(
        "--permit-duplicates", dest="strict", action="store_false", help="permit duplicate genes. " "[%default]"
    )

    parser.add_option(
        "--duplicate-feature",
        dest="duplicate_feature",
        type="choice",
        choices=("gene", "transcript", "both", "ucsc", "coordinates"),
        help="remove duplicates by gene/transcript. "
        "If ``ucsc`` is chosen, transcripts ending on _dup# are "
        "removed. This is necessary to remove duplicate entries "
        "that are next to each other in the sort order "
        "[%default]",
    )

    parser.add_option(
        "-m",
        "--method",
        dest="method",
        type="choice",
        action="append",
        choices=(
            "add-protein-id",
            "exons2introns",
            "filter",
            "find-retained-introns",
            "genes-to-unique-chunks",
            "intersect-transcripts",
            "join-exons",
            "merge-exons",
            "merge-transcripts",
            "merge-genes",
            "merge-introns",
            "remove-overlapping",
            "remove-duplicates",
            "rename-genes",
            "rename-transcripts",
            "rename-duplicates",
            "renumber-genes",
            "renumber-transcripts",
            "set-transcript-to-gene",
            "set-gene-to-transcript",
            "set-protein-to-transcript",
            "set-score-to-distance",
            "set-gene_biotype-to-source",
            "sort",
            "transcript2genes",
            "unset-genes",
        ),
        help="Method to apply [%default]." "Please only select one.",
    )

    parser.set_defaults(
        sort_order="gene",
        filter_method="gene",
        pattern="%i",
        merge_exons_distance=0,
        filename_filter=None,
        intron_border=None,
        intron_min_length=None,
        sample_size=0,
        min_exons_length=0,
        ignore_strand=False,
        with_utr=False,
        invert_filter=False,
        duplicate_feature=None,
        strict=True,
        method=None,
    )

    (options, args) = E.Start(parser, argv=argv)

    ninput, noutput, nfeatures, ndiscarded = 0, 0, 0, 0

    if options.method is None:
        raise ValueError("please specify a --method")

    if len(options.method) > 1:
        raise ValueError("multiple --method arguements specified")
    else:
        options.method = options.method[0]

    if options.method == "set-transcript-to-gene":

        for gff in GTF.iterator(options.stdin):

            ninput += 1

            gff.setAttribute("transcript_id", gff.gene_id)
            options.stdout.write("%s\n" % str(gff))

            noutput += 1
            nfeatures += 1

    elif options.method == "set-gene_biotype-to-source":

        for gff in GTF.iterator(options.stdin):

            ninput += 1

            if "gene_biotype" not in gff:
                gff.setAttribute("gene_biotype", gff.source)

            options.stdout.write("%s\n" % str(gff))

            noutput += 1
            nfeatures += 1

    elif options.method == "remove-duplicates":

        counts = collections.defaultdict(int)

        if options.duplicate_feature == "ucsc":
            store = []
            remove = set()
            f = lambda x: x[0].transcript_id

            gffs = GTF.transcript_iterator(GTF.iterator(options.stdin), strict=False)
            outf = lambda x: "\n".join([str(y) for y in x])

            for entry in gffs:
                ninput += 1
                store.append(entry)
                id = f(entry)
                if "_dup" in id:
                    remove.add(re.sub("_dup\d+", "", id))
                    remove.add(id)

            for entry in store:
                id = f(entry)
                if id not in remove:
                    options.stdout.write(outf(entry) + "\n")
                    noutput += 1
                else:
                    ndiscarded += 1
                    E.info("discarded duplicates for %s" % (id))
        else:

            if options.duplicate_feature == "gene":
                gffs = GTF.gene_iterator(GTF.iterator(options.stdin), strict=False)
                f = lambda x: x[0][0].gene_id
                outf = lambda x: "\n".join(["\n".join([str(y) for y in xx]) for xx in x])
            elif options.duplicate_feature == "transcript":
                gffs = GTF.transcript_iterator(GTF.iterator(options.stdin), strict=False)
                f = lambda x: x[0].transcript_id
                outf = lambda x: "\n".join([str(y) for y in x])
            elif options.duplicate_feature == "coordinates":
                gffs = GTF.chunk_iterator(GTF.iterator(options.stdin))
                f = lambda x: x[0].contig + "_" + str(x[0].start) + "-" + str(x[0].end)
                outf = lambda x: "\n".join([str(y) for y in x])

            store = []

            for entry in gffs:
                ninput += 1
                store.append(entry)
                id = f(entry)
                counts[id] += 1

            # Assumes GTF file sorted by contig then start
            last_id = ""
            if options.duplicate_feature == "coordinates":
                for entry in store:
                    id = f(entry)
                    if id == last_id:
                        ndiscarded += 1
                        E.info("discarded duplicates for %s: %i" % (id, counts[id]))
                    else:
                        options.stdout.write(outf(entry) + "\n")
                        noutput += 1
                    last_id = id

            else:
                for entry in store:
                    id = f(entry)
                    if counts[id] == 1:
                        options.stdout.write(outf(entry) + "\n")
                        noutput += 1
                    else:
                        ndiscarded += 1
                        E.info("discarded duplicates for %s: %i" % (id, counts[id]))

    elif "sort" == options.method:

        for gff in GTF.iterator_sorted(GTF.iterator(options.stdin), sort_order=options.sort_order):
            ninput += 1
            options.stdout.write("%s\n" % str(gff))
            noutput += 1
            nfeatures += 1

    elif "set-gene-to-transcript" == options.method:

        for gff in GTF.iterator(options.stdin):

            ninput += 1

            gff.setAttribute("gene_id", gff.transcript_id)
            options.stdout.write("%s\n" % str(gff))

            noutput += 1
            nfeatures += 1

    elif "set-protein-to-transcript" == options.method:

        for gff in GTF.iterator(options.stdin):
            ninput += 1
            gff.setAttribute("protein_id", gff.transcript_id)
            options.stdout.write("%s\n" % str(gff))
            noutput += 1
            nfeatures += 1

    elif "add-protein-id" == options.method:

        transcript2protein = IOTools.readMap(IOTools.openFile(options.filename_filter, "r"))

        missing = set()
        for gff in GTF.iterator(options.stdin):
            ninput += 1
            if gff.transcript_id not in transcript2protein:
                if gff.transcript_id not in missing:
                    E.debug(("removing transcript '%s' due to " "missing protein id") % gff.transcript_id)
                    missing.add(gff.transcript_id)
                ndiscarded += 1
                continue

            gff.setAttribute("protein_id", transcript2protein[gff.transcript_id])
            options.stdout.write("%s\n" % str(gff))
            noutput += 1
            nfeatures += 1

        E.info("transcripts removed due to missing protein ids: %i" % len(missing))

    elif "join-exons" == options.method:

        for exons in GTF.transcript_iterator(GTF.iterator(options.stdin)):
            ninput += 1
            strand = Genomics.convertStrand(exons[0].strand)
            contig = exons[0].contig
            transid = exons[0].transcript_id
            geneid = exons[0].gene_id
            biotype = exons[0].source
            all_start, all_end = min([x.start for x in exons]), max([x.end for x in exons])
            y = GTF.Entry()
            y.contig = contig
            y.source = biotype
            y.feature = "transcript"
            y.start = all_start
            y.end = all_end
            y.strand = strand
            y.transcript_id = transid
            y.gene_id = geneid
            options.stdout.write("%s\n" % str(y))

    elif "merge-genes" == options.method:
        # merges overlapping genes
        #
        gffs = GTF.iterator_sorted_chunks(
            GTF.flat_gene_iterator(GTF.iterator(options.stdin)), sort_by="contig-strand-start"
        )

        def iterate_chunks(gff_chunks):

            last = gff_chunks.next()
            to_join = [last]

            for gffs in gff_chunks:
                d = gffs[0].start - last[-1].end

                if gffs[0].contig == last[0].contig and gffs[0].strand == last[0].strand:
                    assert gffs[0].start >= last[0].start, (
                        "input file should be sorted by contig, strand " "and position: d=%i:\nlast=\n%s\nthis=\n%s\n"
                    ) % (d, "\n".join([str(x) for x in last]), "\n".join([str(x) for x in gffs]))

                if gffs[0].contig != last[0].contig or gffs[0].strand != last[0].strand or d > 0:
                    yield to_join
                    to_join = []

                last = gffs
                to_join.append(gffs)

            yield to_join
            raise StopIteration

        for chunks in iterate_chunks(gffs):
            ninput += 1
            if len(chunks) > 1:
                gene_id = "merged_%s" % chunks[0][0].gene_id
                transcript_id = "merged_%s" % chunks[0][0].transcript_id
                info = ",".join([x[0].gene_id for x in chunks])
            else:
                gene_id = chunks[0][0].gene_id
                transcript_id = chunks[0][0].transcript_id
                info = None

            intervals = []
            for c in chunks:
                intervals += [(x.start, x.end) for x in c]

            intervals = Intervals.combine(intervals)
            # take single strand
            strand = chunks[0][0].strand

            for start, end in intervals:
                y = GTF.Entry()
                y.fromGTF(chunks[0][0], gene_id, transcript_id)
                y.start = start
                y.end = end
                y.strand = strand

                if info:
                    y.addAttribute("merged", info)
                options.stdout.write("%s\n" % str(y))
                nfeatures += 1

            noutput += 1

    elif options.method == "renumber-genes":

        map_old2new = {}
        for gtf in GTF.iterator(options.stdin):
            ninput += 1
            if gtf.gene_id not in map_old2new:
                map_old2new[gtf.gene_id] = options.pattern % (len(map_old2new) + 1)
            gtf.setAttribute("gene_id", map_old2new[gtf.gene_id])
            options.stdout.write("%s\n" % str(gtf))
            noutput += 1

    elif options.method == "unset-genes":

        map_old2new = {}
        for gtf in GTF.iterator(options.stdin):
            ninput += 1
            key = gtf.transcript_id
            if key not in map_old2new:
                map_old2new[key] = options.pattern % (len(map_old2new) + 1)
            gtf.setAttribute("gene_id", map_old2new[key])
            options.stdout.write("%s\n" % str(gtf))
            noutput += 1

    elif options.method == "renumber-transcripts":

        map_old2new = {}
        for gtf in GTF.iterator(options.stdin):
            ninput += 1
            key = (gtf.gene_id, gtf.transcript_id)
            if key not in map_old2new:
                map_old2new[key] = options.pattern % (len(map_old2new) + 1)
            gtf.setAttribute("transcript_id", map_old2new[key])
            options.stdout.write("%s\n" % str(gtf))
            noutput += 1

    elif options.method == "transcripts2genes":

        transcripts = set()
        genes = set()
        ignore_strand = options.ignore_strand
        for gtfs in GTF.iterator_transcripts2genes(GTF.iterator(options.stdin)):

            ninput += 1
            for gtf in gtfs:
                if ignore_strand:
                    gtf.strand = "."
                options.stdout.write("%s\n" % str(gtf))
                transcripts.add(gtf.transcript_id)
                genes.add(gtf.gene_id)
                nfeatures += 1
            noutput += 1

        E.info("transcripts2genes: transcripts=%i, genes=%i" % (len(transcripts), len(genes)))

    elif options.method in ("rename-genes", "rename-transcripts"):

        map_old2new = IOTools.readMap(IOTools.openFile(options.filename_filter, "r"))

        if options.method == "rename-transcripts":
            is_gene_id = False
        elif options.method == "rename-genes":
            is_gene_id = True

        for gff in GTF.iterator(options.stdin):
            ninput += 1

            if is_gene_id:
                if gff.gene_id in map_old2new:
                    gff.setAttribute("gene_id", map_old2new[gff.gene_id])
                else:
                    E.debug("removing missing gene_id %s" % gff.gene_id)
                    ndiscarded += 1
                    continue

            else:
                if gff.transcript_id in map_old2new:
                    gff.setAttribute("transcript_id", map_old2new[gff.transcript_id])
                else:
                    E.debug("removing missing transcript_id %s" % gff.transcript_id)
                    ndiscarded += 1
                    continue

            noutput += 1
            options.stdout.write("%s\n" % str(gff))

    elif options.method == "filter":

        keep_genes = set()
        if options.filter_method == "longest-gene":
            iterator = GTF.flat_gene_iterator(GTF.iterator(options.stdin))
            coords = []
            gffs = []
            for gff in iterator:
                gff.sort(key=lambda x: x.start)
                coords.append((gff[0].contig, min([x.start for x in gff]), max([x.end for x in gff]), gff[0].gene_id))
                gffs.append(gff)
            coords.sort()

            last_contig = None
            max_end = 0
            longest_gene_id = None
            longest_length = None

            for contig, start, end, gene_id in coords:
                ninput += 1
                if contig != last_contig or start >= max_end:
                    if longest_gene_id:
                        keep_genes.add(longest_gene_id)
                    longest_gene_id = gene_id
                    longest_length = end - start
                    max_end = end
                else:
                    if end - start > longest_length:
                        longest_length, longest_gene_id = end - start, gene_id
                last_contig = contig
                max_end = max(max_end, end)

            keep_genes.add(longest_gene_id)
            invert = options.invert_filter
            for gff in gffs:
                keep = gff[0].gene_id in keep_genes

                if (keep and not invert) or (not keep and invert):
                    noutput += 1
                    for g in gff:
                        nfeatures += 1
                        options.stdout.write("%s\n" % g)
                else:
                    ndiscarded += 1
        elif options.filter_method in ("longest-transcript", "representative-transcript"):

            iterator = GTF.gene_iterator(GTF.iterator(options.stdin))

            def selectLongestTranscript(gene):
                r = []
                for transcript in gene:
                    transcript.sort(key=lambda x: x.start)
                    length = transcript[-1].end - transcript[0].start
                    r.append((length, transcript))
                r.sort()
                return r[-1][1]

            def selectRepresentativeTranscript(gene):
                """select a representative transcript.

                The representative transcript represent the largest number
                of exons over all transcripts.
                """
                all_exons = []
                for transcript in gene:
                    all_exons.extend([(x.start, x.end) for x in transcript if x.feature == "exon"])
                exon_counts = {}
                for key, exons in itertools.groupby(all_exons):
                    exon_counts[key] = len(list(exons))
                transcript_counts = []
                for transcript in gene:
                    count = sum([exon_counts[(x.start, x.end)] for x in transcript if x.feature == "exon"])
                    # add transcript id to sort to provide a stable
                    # segmentation.
                    transcript_counts.append((count, transcript[0].transcript_id, transcript))
                transcript_counts.sort()
                return transcript_counts[-1][-1]

            if options.filter_method == "longest-transcript":
                _select = selectLongestTranscript
            elif options.filter_method == "representative-transcript":
                _select = selectRepresentativeTranscript

            for gene in iterator:
                ninput += 1
                # sort in order to make reproducible which
                # gene is chosen.
                transcript = _select(sorted(gene))
                noutput += 1
                for g in transcript:
                    nfeatures += 1
                    options.stdout.write("%s\n" % g)

        elif options.filter_method in ("gene", "transcript"):

            if options.filename_filter:

                ids, nerrors = IOTools.ReadList(IOTools.openFile(options.filename_filter, "r"))
                E.info("read %i ids" % len(ids))

                ids = set(ids)
                by_gene = options.filter_method == "gene"
                by_transcript = options.filter_method == "transcript"
                invert = options.invert_filter

                ignore_strand = options.ignore_strand
                for gff in GTF.iterator(options.stdin):

                    ninput += 1

                    keep = False
                    if by_gene:
                        keep = gff.gene_id in ids
                    if by_transcript:
                        keep = gff.transcript_id in ids
                    if (invert and keep) or (not invert and not keep):
                        continue

                    if ignore_strand:
                        gff.strand = "."

                    options.stdout.write("%s\n" % str(gff))
                    nfeatures += 1
                    noutput += 1

            elif options.sample_size:

                if options.filter_method == "gene":
                    iterator = GTF.flat_gene_iterator(GTF.iterator(options.stdin))
                elif options.filter_method == "transcript":
                    iterator = GTF.transcript_iterator(GTF.iterator(options.stdin))
                if options.min_exons_length:
                    iterator = GTF.iterator_min_feature_length(
                        iterator, min_length=options.min_exons_length, feature="exon"
                    )

                data = [x for x in iterator]
                ninput = len(data)
                if len(data) > options.sample_size:
                    data = random.sample(data, options.sample_size)

                for d in data:
                    noutput += 1
                    for dd in d:
                        nfeatures += 1
                        options.stdout.write(str(dd) + "\n")

            else:
                assert False, "please supply either a filename "
                "with ids to filter with (--map-tsv-file) or a sample-size."

        elif options.filter_method in ("proteincoding", "lincrna", "processed-pseudogene"):
            # extract entries by transcript/gene biotype.
            # This filter uses a test on the source field (ENSEMBL pre v78)
            # a regular expression on the attributes (ENSEMBL >= v78).
            tag = {
                "proteincoding": "protein_coding",
                "processed-pseudogene": "processed_pseudogene",
                "lincrna": "lincRNA",
            }[options.filter_method]
            rx = re.compile('"%s"' % tag)
            if not options.invert_filter:
                f = lambda x: x.source == tag or rx.search(x.attributes)
            else:
                f = lambda x: x.source != tag and not rx.search(x.attributes)

            for gff in GTF.iterator(options.stdin):
                ninput += 1
                if f(gff):
                    options.stdout.write(str(gff) + "\n")
                    noutput += 1
                else:
                    ndiscarded += 1

    elif options.method == "exons2introns":

        for gffs in GTF.flat_gene_iterator(GTF.iterator(options.stdin)):

            ninput += 1

            cds_ranges = GTF.asRanges(gffs, "CDS")
            exon_ranges = GTF.asRanges(gffs, "exon")
            input_ranges = Intervals.combine(cds_ranges + exon_ranges)

            if len(input_ranges) > 1:
                last = input_ranges[0][1]
                output_ranges = []
                for start, end in input_ranges[1:]:
                    output_ranges.append((last, start))
                    last = end

                if options.intron_border:
                    b = options.intron_border
                    output_ranges = [(x[0] + b, x[1] - b) for x in output_ranges]

                if options.intron_min_length:
                    l = options.intron_min_length
                    output_ranges = [x for x in output_ranges if x[1] - x[0] > l]

                for start, end in output_ranges:

                    entry = GTF.Entry()
                    entry.copy(gffs[0])
                    entry.clearAttributes()
                    entry.transcript_id = "merged"
                    entry.feature = "intron"
                    entry.start = start
                    entry.end = end
                    options.stdout.write("%s\n" % str(entry))
                    nfeatures += 1
                noutput += 1
            else:
                ndiscarded += 1

    elif options.method == "set-score-to-distance":

        for gffs in GTF.transcript_iterator(GTF.iterator(options.stdin)):
            ninput += 1
            strand = Genomics.convertStrand(gffs[0].strand)
            all_start, all_end = min([x.start for x in gffs]), max([x.end for x in gffs])

            if strand != ".":
                t = 0
                if strand == "-":
                    gffs.reverse()
                for gff in gffs:
                    gff.score = t
                    t += gff.end - gff.start

                if strand == "-":
                    gffs.reverse()
            for gff in gffs:
                options.stdout.write("%s\n" % str(gff))
                nfeatures += 1
            noutput += 1

    elif options.method == "remove-overlapping":

        index = GTF.readAndIndex(GTF.iterator(IOTools.openFile(options.filename_gff, "r")))

        for gffs in GTF.transcript_iterator(GTF.iterator(options.stdin)):
            ninput += 1
            found = False
            for e in gffs:
                if index.contains(e.contig, e.start, e.end):
                    found = True
                    break

            if found:
                ndiscarded += 1
            else:
                noutput += 1
                for e in gffs:
                    nfeatures += 1
                    options.stdout.write("%s\n" % str(e))

    elif options.method == "intersect-transcripts":

        for gffs in GTF.gene_iterator(GTF.iterator(options.stdin), strict=options.strict):

            ninput += 1
            r = []
            for g in gffs:
                if options.with_utr:
                    ranges = GTF.asRanges(g, "exon")
                else:
                    ranges = GTF.asRanges(g, "CDS")
                r.append(ranges)

            result = r[0]
            for x in r[1:]:
                result = Intervals.intersect(result, x)

            entry = GTF.Entry()
            entry.copy(gffs[0][0])
            entry.clearAttributes()
            entry.transcript_id = "merged"
            entry.feature = "exon"
            for start, end in result:
                entry.start = start
                entry.end = end
                options.stdout.write("%s\n" % str(entry))
                nfeatures += 1

            noutput += 1

    elif "rename-duplicates" == options.method:
        # note: this will only rename entries with "CDS" in feature column

        assert options.duplicate_feature in ["gene", "transcript", "both"], (
            "for renaming duplicates, --duplicate-feature must be set to one " "of 'gene', transcript' or 'both'"
        )

        gene_ids = list()
        transcript_ids = list()
        gtfs = list()

        for gtf in GTF.iterator(options.stdin):
            gtfs.append(gtf)
            if gtf.feature == "CDS":
                gene_ids.append(gtf.gene_id)
                transcript_ids.append(gtf.transcript_id)

        dup_gene = [item for item in set(gene_ids) if gene_ids.count(item) > 1]
        dup_transcript = [item for item in set(transcript_ids) if transcript_ids.count(item) > 1]

        E.info("Number of duplicated gene_ids: %i" % len(dup_gene))
        E.info("Number of duplicated transcript_ids: %i" % len(dup_transcript))

        gene_dict = dict(zip(dup_gene, ([0] * len(dup_gene))))
        transcript_dict = dict(zip(dup_transcript, ([0] * len(dup_transcript))))

        for gtf in gtfs:
            if gtf.feature == "CDS":
                if options.duplicate_feature in ["both", "gene"]:
                    if gtf.gene_id in dup_gene:
                        gene_dict[gtf.gene_id] = gene_dict[gtf.gene_id] + 1
                        gtf.setAttribute("gene_id", gtf.gene_id + "." + str(gene_dict[gtf.gene_id]))

                if options.duplicate_feature in ["both", "transcript"]:
                    if gtf.transcript_id in dup_transcript:
                        transcript_dict[gtf.transcript_id] = transcript_dict[gtf.transcript_id] + 1
                        gtf.setAttribute(
                            "transcript_id", gtf.transcript_id + "." + str(transcript_dict[gtf.transcript_id])
                        )

            options.stdout.write("%s\n" % gtf)

    elif options.method in ("merge-exons", "merge-introns", "merge-transcripts"):
        for gffs in GTF.flat_gene_iterator(GTF.iterator(options.stdin), strict=options.strict):

            ninput += 1

            cds_ranges = GTF.asRanges(gffs, "CDS")
            exon_ranges = GTF.asRanges(gffs, "exon")

            # sanity checks
            strands = set([x.strand for x in gffs])
            contigs = set([x.contig for x in gffs])
            if len(strands) > 1:
                raise ValueError("can not merge gene '%s' on multiple strands: %s" % (gffs[0].gene_id, str(strands)))

            if len(contigs) > 1:
                raise ValueError("can not merge gene '%s' on multiple contigs: %s" % (gffs[0].gene_id, str(contigs)))

            strand = Genomics.convertStrand(gffs[0].strand)

            if cds_ranges and options.with_utr:
                cds_start, cds_end = cds_ranges[0][0], cds_ranges[-1][1]
                midpoint = (cds_end - cds_start) / 2 + cds_start

                utr_ranges = []
                for start, end in Intervals.truncate(exon_ranges, cds_ranges):
                    if end - start > 3:
                        if strand == ".":
                            feature = "UTR"
                        elif strand == "+":
                            if start < midpoint:
                                feature = "UTR5"
                            else:
                                feature = "UTR3"
                        elif strand == "-":
                            if start < midpoint:
                                feature = "UTR3"
                            else:
                                feature = "UTR5"
                        utr_ranges.append((feature, start, end))
                output_feature = "CDS"
                output_ranges = cds_ranges
            else:
                output_feature = "exon"
                output_ranges = exon_ranges
                utr_ranges = []

            result = []

            try:
                biotypes = [x["gene_biotype"] for x in gffs]
                biotype = ":".join(set(biotypes))
            except (KeyError, AttributeError):
                biotype = None

            if options.method == "merge-exons":
                # need to combine per feature - skip
                # utr_ranges = Intervals.combineAtDistance(
                # utr_ranges,
                # options.merge_exons_distance)

                output_ranges = Intervals.combineAtDistance(output_ranges, options.merge_exons_distance)

                for feature, start, end in utr_ranges:
                    entry = GTF.Entry()
                    entry.copy(gffs[0])
                    entry.clearAttributes()
                    entry.feature = feature
                    entry.transcript_id = "merged"
                    if biotype:
                        entry.addAttribute("gene_biotype", biotype)
                    entry.start = start
                    entry.end = end
                    result.append(entry)

                for start, end in output_ranges:

                    entry = GTF.Entry()
                    entry.copy(gffs[0])
                    entry.clearAttributes()
                    entry.transcript_id = "merged"
                    if biotype:
                        entry.addAttribute("gene_biotype", biotype)
                    entry.feature = output_feature
                    entry.start = start
                    entry.end = end
                    result.append(entry)

            elif options.method == "merge-transcripts":

                entry = GTF.Entry()
                entry.copy(gffs[0])
                entry.clearAttributes()
                entry.transcript_id = entry.gene_id
                if biotype:
                    entry.addAttribute("gene_biotype", biotype)
                entry.start = output_ranges[0][0]
                entry.end = output_ranges[-1][1]
                result.append(entry)

            elif options.method == "merge-introns":

                if len(output_ranges) >= 2:
                    entry = GTF.Entry()
                    entry.copy(gffs[0])
                    entry.clearAttributes()
                    entry.transcript_id = entry.gene_id
                    if biotype:
                        entry.addAttribute("gene_biotype", biotype)
                    entry.start = output_ranges[0][1]
                    entry.end = output_ranges[-1][0]
                    result.append(entry)
                else:
                    ndiscarded += 1
                    continue

            result.sort(key=lambda x: x.start)

            for x in result:
                options.stdout.write("%s\n" % str(x))
                nfeatures += 1
            noutput += 1

    elif options.method == "find-retained-introns":

        for gene in GTF.gene_iterator(GTF.iterator(options.stdin)):
            ninput += 1
            found_any = False
            for intron in find_retained_introns(gene):
                found_any = True
                options.stdout.write("%s\n" % str(intron))
                nfeatures += 1
            if found_any:
                noutput += 1

    elif options.method == "genes-to-unique-chunks":

        for gene in GTF.flat_gene_iterator(GTF.iterator(options.stdin)):
            ninput += 1
            for exon in gene_to_blocks(gene):
                options.stdout.write("%s\n" % str(exon))
                nfeatures += 1
            noutput += 1

    else:
        raise ValueError("unknown method '%s'" % options.method)

    E.info("ninput=%i, noutput=%i, nfeatures=%i, ndiscarded=%i" % (ninput, noutput, nfeatures, ndiscarded))
    E.Stop()