Esempio n. 1
0
    def processChunk(prediction_id, gene_id, contig, strand, regions):
        """Record the third codon positions of one prediction.

        ``regions`` is a re-iterable collection of ``(start, end)``
        intervals.  For every third codon position a tuple
        ``(prediction_id, gene_id, contig, strand, pos, pos + 1)`` is
        appended to ``locations`` from the enclosing scope.  The reading
        frame is carried over from one region to the next.

        Nothing is recorded if ``gene_id`` is None.  If the combined
        length of the regions is not a multiple of three, a warning is
        written to ``options.stdlog`` (for ``options.loglevel >= 1``).

        NOTE(review): on the forward strand the regions are processed in
        the order given - presumably the caller supplies them sorted;
        confirm.
        """
        if gene_id is None:
            return

        is_negative = Genomics.IsNegativeStrand(strand)

        if is_negative:
            # Mirror the intervals so that the walk below always proceeds
            # in the direction of translation, counting from 0.
            coordinate_offset = max(end for _, end in regions)
            regions = sorted(
                (coordinate_offset - end, coordinate_offset - start)
                for start, end in regions)
        else:
            coordinate_offset = 0

        # number of bases of an incomplete codon left over by the
        # previous region
        offset = 0
        for start, end in regions:
            start -= offset
            for x in range(start + 2, end, 3):
                if is_negative:
                    # map back to the original coordinates; the factor -1
                    # results from the open/closed bracket notation
                    c = coordinate_offset - x - 1
                else:
                    c = x
                locations.append(
                    (prediction_id, gene_id, contig, strand, c, c + 1))
            offset = (end - start) % 3

        if offset != 0 and options.loglevel >= 1:
            options.stdlog.write(
                "# WARNING: prediction=%s, gene=%s on %s:%s : frame did not add up\n"
                % (prediction_id, gene_id, contig, strand))
Esempio n. 2
0
def annotateRegulons(iterator, fasta, tss, options):
    """Write a regulon interval around the start or end of each transcript.

    For every gene obtained from ``GTF.gene_iterator`` a window reaching
    ``options.upstream`` bases upstream and ``options.downstream`` bases
    downstream (relative to the strand of the gene) is placed around the
    transcription start site if ``tss`` is true, or around the
    transcription termination site otherwise.  Windows are clipped to
    ``[0, contig length]``.

    If ``options.merge_promotors`` is set, the windows of a gene are
    combined with ``Intervals.combine`` and numbered consecutively
    instead of carrying transcript identifiers.

    Each window is written to ``options.stdout`` as a GTF entry with
    source ``regulon``; summary counts are logged via ``E.info``.
    """

    gene_iterator = GTF.gene_iterator(iterator)

    upstream, downstream = options.upstream, options.downstream

    ngenes = ntranscripts = nregulons = 0

    for gene in gene_iterator:
        ngenes += 1
        template = gene[0][0]
        is_negative_strand = Genomics.IsNegativeStrand(template.strand)
        lcontig = fasta.getLength(template.contig)

        regulons, transcript_ids = [], []
        for transcript in gene:
            ntranscripts += 1
            mi = min([x.start for x in transcript])
            ma = max([x.end for x in transcript])

            # the anchor is the transcript start (tss) or end (tts) in
            # the direction of transcription
            if tss:
                anchor = ma if is_negative_strand else mi
            else:
                anchor = mi if is_negative_strand else ma

            # upstream lies at larger coordinates on the negative strand
            if is_negative_strand:
                left, right = anchor - downstream, anchor + upstream
            else:
                left, right = anchor - upstream, anchor + downstream

            # clip to contig boundaries
            left = min(lcontig, max(0, left))
            right = min(lcontig, max(0, right))

            regulons.append((left, right))
            transcript_ids.append(transcript[0].transcript_id)

        if options.merge_promotors:
            # merging may change number and order of the intervals, so
            # the transcript identifiers are replaced by running numbers
            regulons = Intervals.combine(regulons)
            transcript_ids = ["%i" % n
                              for n in range(1, len(regulons) + 1)]

        gtf = GTF.Entry()
        gtf.fromGTF(template, template.gene_id, template.gene_id)
        gtf.source = "regulon"

        for index, (start, end) in enumerate(regulons):
            gtf.start, gtf.end = start, end
            gtf.transcript_id = transcript_ids[index]
            options.stdout.write(str(gtf) + "\n")
            nregulons += 1

    E.info("ngenes=%i, ntranscripts=%i, nregulons=%i" %
           (ngenes, ntranscripts, nregulons))
Esempio n. 3
0
def annotateTTS(iterator, fasta, options):
    """Write a window of ``options.promotor`` bases at each transcript end.

    For every transcript of every gene from ``GTF.gene_iterator`` a
    window is placed beyond the transcription termination site: below
    the smallest exon start on the negative strand, above the largest
    exon end otherwise.  Near a contig boundary the window is shifted
    into the transcript so that it stays within ``[0, contig length]``.

    If ``options.merge_promotors`` is set, the windows of a gene are
    combined with ``Intervals.combine`` and numbered consecutively.

    Windows are written to ``options.stdout`` as GTF entries with source
    ``tts``; counts go to ``options.stdlog`` if ``options.loglevel >= 1``.
    """

    ngenes = ntranscripts = npromotors = 0

    for gene in GTF.gene_iterator(iterator):
        ngenes += 1
        template = gene[0][0]
        is_negative_strand = Genomics.IsNegativeStrand(template.strand)
        lcontig = fasta.getLength(template.contig)

        tts, transcript_ids = [], []
        for transcript in gene:
            ntranscripts += 1
            mi = min([x.start for x in transcript])
            ma = max([x.end for x in transcript])
            transcript_ids.append(transcript[0].transcript_id)

            # If the termination site sits directly at the start/end of
            # the contig, the window lies within an exon; otherwise it
            # is outside of the transcript.
            if is_negative_strand:
                window = (max(0, mi - options.promotor),
                          max(options.promotor, mi))
            else:
                window = (min(ma, lcontig - options.promotor),
                          min(lcontig, ma + options.promotor))
            tts.append(window)

        if options.merge_promotors:
            # merging may change number and order of the windows, so
            # they are renamed with running numbers
            tts = Intervals.combine(tts)
            transcript_ids = ["%i" % n for n in range(1, len(tts) + 1)]

        gtf = GTF.Entry()
        gtf.fromGTF(template, template.gene_id, template.gene_id)
        gtf.source = "tts"

        for index, (start, end) in enumerate(tts):
            gtf.start, gtf.end = start, end
            gtf.transcript_id = transcript_ids[index]
            options.stdout.write(str(gtf) + "\n")
            npromotors += 1

    if options.loglevel >= 1:
        options.stdlog.write(
            "# ngenes=%i, ntranscripts=%i, ntss=%i\n" %
            (ngenes, ntranscripts, npromotors))
Esempio n. 4
0
def annotateExons(iterator, fasta, options):
    """Write every distinct exon of a gene together with its usage.

    For each gene from ``GTF.gene_iterator`` the exons of all
    transcripts are collected by their ``(start, end)`` coordinates.
    Each distinct exon is written to ``options.stdout`` as a GTF entry
    carrying the attributes

    ntranscripts
        number of transcripts in the gene
    nused
        number of transcripts using the exon
    pos
        comma-separated ``rank:total`` pairs giving the rank of the exon
        in each transcript (counted in the direction of transcription)
        and the number of exons in that transcript

    Genes whose distinct exons overlap each other are counted and the
    totals are written to ``options.stdlog`` if ``options.loglevel >= 1``.
    Note that the exon lists of the transcripts are sorted in place.
    ``fasta`` is not used.
    """

    ninput = noutput = noverlapping = 0

    for this in GTF.gene_iterator(iterator):
        ninput += 1
        first = this[0][0]
        is_negative_strand = Genomics.IsNegativeStrand(first.strand)

        # (start, end) -> list of (rank in transcript, exons in transcript)
        usage = collections.defaultdict(list)
        for exons in this:
            # order the exons in the direction of transcription
            exons.sort(key=lambda x: x.start)
            if is_negative_strand:
                exons.reverse()

            total = len(exons)
            for rank, exon in enumerate(exons, 1):
                usage[(exon.start, exon.end)].append((rank, total))

        template = GTF.Entry()
        template.fromGTF(first, first.gene_id, first.gene_id)
        template.addAttribute("ntranscripts", len(this))

        gtfs = []
        for (start, end), pos in usage.items():
            entry = GTF.Entry().copy(template)
            entry.start, entry.end = start, end
            entry.addAttribute("nused", len(pos))
            entry.addAttribute("pos", ",".join("%i:%i" % p for p in pos))
            gtfs.append(entry)

        gtfs.sort(key=lambda entry: entry.start)

        for entry in gtfs:
            options.stdout.write(str(entry) + "\n")

        # the gene has overlapping exons if merging reduces their number
        coordinates = [(entry.start, entry.end) for entry in gtfs]
        if len(Intervals.combine(coordinates)) != len(coordinates):
            noverlapping += 1

        noutput += 1

    if options.loglevel >= 1:
        options.stdlog.write(
            "# ninput=%i, noutput=%i, noverlapping=%i\n" % (ninput, noutput, noverlapping))
Esempio n. 5
0
def transform_third_codon(start, end, intervals_with_gff):
    """Return the third codon positions within the window (start, end).

    ``intervals_with_gff`` yields ``(istart, iend, gff)`` triples.  The
    ``frame`` of each gff entry is interpreted as the number of
    nucleotides between the start of the interval and the next codon
    start; a frame of ``"."`` raises ValueError.

    The single-base intervals of all third codon positions are passed
    through ``Intervals.combineIntervals`` and the result is returned.
    """
    positions = []
    for istart, iend, gff in intervals_with_gff:

        if gff.frame == ".":
            raise ValueError("need a frame for third codon positions.")

        frame = int(gff.frame)

        # Work in window-relative coordinates that run in the direction
        # of translation: measured from ``start`` on the forward strand
        # and backwards from ``end`` on the negative strand.
        reverse = Genomics.IsNegativeStrand(gff.strand)
        if reverse:
            istart, iend = end - iend, end - istart
        else:
            istart, iend = istart - start, iend - start

        # clip at the window start, shifting the frame by the number of
        # bases cut off
        if istart < 0:
            frame = (frame + istart) % 3
            istart = 0

        # step back to the start of the codon covering the first base
        # and then forward to its third position
        if frame != 0:
            istart -= 3 - frame
        istart += 2

        # clip at the window end
        iend = min(iend, end - start)

        for x in range(istart, iend, 3):
            # translate back into the original coordinates
            c = (end - x - 1) if reverse else (start + x)
            positions.append((c, c + 1))

    return Intervals.combineIntervals(positions)
Esempio n. 6
0
    def buildSequenceVariants(self, seq, strand, pos, snp):
        '''apply the variant described by snp to seq at position pos.

        seq is expected to be oriented according to strand already;
        strand is only used to convert the bases given in snp with
        ``Genomics.complement`` where necessary.

        A reference base of ``*`` marks an indel; its genotype is a
        ``/``-separated list of ``+<bases>`` (insertion), ``-<bases>``
        (deletion) or ``*`` (no change) alleles.  Any other reference
        base marks a substitution; its genotype is expanded with
        ``Genomics.resolveAmbiguousNA``.

        Only sequences that differ from seq are returned.

        Raises ValueError if the reference base or the deleted bases do
        not match seq, or if an indel allele has an unknown sign.

        returns is_homozygous, seqs
        '''
        on_reverse = Genomics.IsNegativeStrand(strand)

        ref = snp.reference_base
        if ref != "*" and on_reverse:
            ref = Genomics.complement(ref)

        altered = []

        if ref != "*":
            # --- substitution -------------------------------------------
            if seq[pos].upper() != ref.upper():
                raise ValueError("base mismatch at snp %i, expected %s, got %s in %s at position %i; snp=%s" %
                                 (snp.pos, ref, seq[pos], seq, pos,
                                  ";".join(map(str, snp))))

            bases = Genomics.resolveAmbiguousNA(snp.genotype)
            is_homozygous = len(bases) == 1

            for base in bases:
                if on_reverse:
                    base = Genomics.complement(base)

                chars = list(seq)
                chars[pos] = base
                candidate = "".join(chars)
                if candidate != seq:
                    altered.append(candidate)

            return is_homozygous, altered

        # --- insertion / deletion ---------------------------------------
        is_homozygous = False
        for variant in snp.genotype.split("/"):

            chars = list(seq)
            sign = variant[0]

            # samtools denotes an insertion/deletion as happening after
            # the position while python works before/at the position,
            # hence pos + 1 on the forward strand.
            if sign == "+":
                toinsert = variant[1:].upper()
                if on_reverse:
                    chars.insert(pos, Genomics.complement(toinsert))
                else:
                    chars.insert(pos + 1, toinsert)

            elif sign == "-":
                todelete = variant[1:].upper()
                ndelete = len(todelete)
                if on_reverse:
                    # delete to the left of pos, clipped at the start
                    xstart, xend = max(0, pos - ndelete), pos
                    todelete = todelete[:min(ndelete, pos)]
                else:
                    # delete to the right of pos, clipped at self.mSize
                    xstart = pos + 1
                    xend = min(self.mSize, pos + 1 + ndelete)
                    todelete = todelete[:self.mSize - (pos + 1)]

                deleted = "".join(chars[xstart:xend])
                if on_reverse:
                    deleted = Genomics.complement(deleted)

                if deleted != todelete:
                    raise ValueError("base mismatch at indel %i, expected %s, got %s in %s at position %i(%i:%i); is_negative_strand=%s, snp=%s" %
                                     (snp.pos, todelete, deleted, seq, pos, xstart, xend,
                                      on_reverse,
                                      ";".join(map(str, snp))))
                del chars[xstart:xend]

            elif sign == "*":
                # allele without a change
                is_homozygous = True

            else:
                raise ValueError("unknown variant sign '%s'" % sign)

            candidate = "".join(chars)
            if candidate != seq:
                altered.append(candidate)

        return is_homozygous, altered
Esempio n. 7
0
def annotateGenes(iterator, fasta, options):
    """annotate gene structures

    This method outputs intervals for first/middle/last exon/intron,
    UTRs and flanking regions.

    This method annotates per transcript. In order to achieve a unique tiling,
    use only a single transcript per gene and remove any overlap between
    genes.

    Exons are labelled ``first_exon``/``middle_exon``/``last_exon`` if
    ``"exons"`` is in ``options.detail`` and ``exon`` otherwise; introns
    are treated likewise with ``"introns"``.  Flanks of ``options.flank``
    bases are tiled in steps of ``options.increment`` and labelled
    ``upstream_<distance>``/``downstream_<distance>``; flank segments
    reaching beyond the contig are dropped.  Within a gene, intervals
    with the same label are merged with ``Intervals.combine`` before
    being written to ``options.stdout`` sorted by start.

    Genes on contigs unknown to ``fasta`` and transcripts without any
    ``exon`` feature are skipped; both are counted in ``nskipped``.
    """

    def _add(results, transcript, interval, anno):
        """append a GTF entry labelled anno covering interval."""
        gtf = GTF.Entry()
        gtf.contig = transcript[0].contig
        gtf.gene_id = transcript[0].gene_id
        gtf.transcript_id = transcript[0].transcript_id
        gtf.strand = transcript[0].strand
        gtf.feature = anno
        gtf.start, gtf.end = interval
        results.append(gtf)

    gene_iterator = GTF.gene_iterator(iterator)

    ngenes, ntranscripts, nskipped = 0, 0, 0

    increment = options.increment

    introns_detail = "introns" in options.detail
    exons_detail = "exons" in options.detail

    for gene in gene_iterator:
        ngenes += 1
        is_negative_strand = Genomics.IsNegativeStrand(gene[0][0].strand)
        try:
            lcontig = fasta.getLength(gene[0][0].contig)
        except KeyError:
            nskipped += 1
            continue

        results = []

        for transcript in gene:

            ntranscripts += 1

            exons = sorted((x.start, x.end)
                           for x in transcript if x.feature == "exon")
            if not exons:
                # nothing to annotate - without this guard the exon
                # look-ups below fail with an IndexError
                nskipped += 1
                continue

            # introns are the gaps between consecutive exons
            introns = [(left[1], right[0])
                       for left, right in zip(exons, exons[1:])]

            # add flank
            start, end = exons[0][0], exons[-1][1]
            upstream, downstream = [], []
            for x in range(0, options.flank, increment):
                upstream.append((start - increment, start))
                start -= increment
                downstream.append((end, end + increment))
                end += increment

            # remove out-of-bounds coordinates
            upstream = [x for x in upstream if x[0] >= 0]
            downstream = [x for x in downstream if x[1] <= lcontig]

            # switch to the direction of transcription
            if is_negative_strand:
                exons.reverse()
                introns.reverse()
                upstream, downstream = downstream, upstream

            # add exons
            if exons_detail:
                _add(results, transcript, exons[0], "first_exon")
                if len(exons) > 1:
                    _add(results, transcript, exons[-1], "last_exon")
                for e in exons[1:-1]:
                    _add(results, transcript, e, "middle_exon")
            else:
                for e in exons:
                    _add(results, transcript, e, "exon")

            # add introns
            if introns_detail:
                if len(introns) > 0:
                    _add(results, transcript, introns[0], "first_intron")
                if len(introns) > 1:
                    _add(results, transcript, introns[-1], "last_intron")
                for i in introns[1:-1]:
                    _add(results, transcript, i, "middle_intron")
            else:
                for i in introns:
                    _add(results, transcript, i, "intron")

            for x, u in enumerate(upstream):
                _add(results, transcript, u,
                     "upstream_%i" % (increment * (x + 1)))

            for x, u in enumerate(downstream):
                _add(results, transcript, u,
                     "downstream_%i" % (increment * (x + 1)))

        # groupby only groups adjacent items, so sort by its key first
        results.sort(key=lambda x: x.feature)

        cache = []
        for key, vals in itertools.groupby(results, key=lambda x: x.feature):
            v = list(vals)
            intervals = [(x.start, x.end) for x in v]
            intervals = Intervals.combine(intervals)

            for start, end in intervals:
                r = GTF.Entry()
                r.copy(v[0])
                r.start, r.end = start, end
                cache.append(r)

        cache.sort(key=lambda x: x.start)
        for r in cache:
            options.stdout.write("%s\n" % str(r))

    E.info("ngenes=%i, ntranscripts=%i, nskipped=%i\n" %
           (ngenes, ntranscripts, nskipped))
Esempio n. 8
0
def annotateGREATDomains(iterator, fasta, options):
    """build great domains

    A basal region is defined for every gene and then extended by up to
    ``options.radius`` bases towards the neighbouring basal regions.

    The basal region of a transcript reaches ``options.upstream`` bases
    upstream and ``options.downstream`` bases downstream of its TSS and
    is clipped to the contig.  The basal region of a gene spans the
    basal regions of all its transcripts.

    Basal regions are grouped with ``GTF.iterator_overlaps``.  Each
    region is always extended in the downstream direction; it is
    extended upstream only if its upstream boundary does not lie inside
    another basal region of the same group.  Extension stops at the end
    of the basal regions of the previous group, the start of the next
    group, or the contig boundaries.

    The extended regions are written to ``options.stdout`` as GTF
    entries with source ``greatdomain``; counts are logged via
    ``E.info``.
    """

    gene_iterator = GTF.gene_iterator(iterator)

    counter = E.Counter()

    radius = options.radius
    outfile = options.stdout

    regions = []
    ####################################################################
    # define basal regions for each gene
    # take all basal regions per transcript and merge them
    # Thus, the basal region of a gene might be larger than the sum
    # of options.upstream + options.downstream
    for gene in gene_iterator:
        counter.genes += 1
        is_negative_strand = Genomics.IsNegativeStrand(gene[0][0].strand)

        lcontig = fasta.getLength(gene[0][0].contig)
        regulons = []

        # collect every basal region per transcript
        for transcript in gene:
            counter.transcripts += 1
            mi, ma = min([x.start for x in transcript]), max(
                [x.end for x in transcript])
            # add range to both sides of tss
            if is_negative_strand:
                interval = ma - options.downstream, ma + options.upstream
            else:
                interval = mi - options.upstream, mi + options.downstream

            # clip to contig boundaries
            interval = (min(lcontig, max(0, interval[0])),
                        min(lcontig, max(0, interval[1])))

            regulons.append(interval)

        # take first/last entry
        start, end = min(x[0] for x in regulons), max(x[1] for x in regulons)

        gtf = GTF.Entry()
        gtf.fromGTF(gene[0][0], gene[0][0].gene_id, gene[0][0].gene_id)
        gtf.source = "greatdomain"
        gtf.start, gtf.end = start, end
        regions.append(gtf)

    ####################################################################
    # extend basal regions
    regions.sort(key=lambda x: (x.contig, x.start))

    # iterate within groups of overlapping basal regions
    groups = list(GTF.iterator_overlaps(iter(regions)))
    counter.groups = len(groups)

    last_end = 0
    reset = False

    for region_id, group in enumerate(groups):

        # collect basal intervals in group; taken before any region of
        # the group is extended
        intervals = [(x.start, x.end) for x in group]

        def overlapsBasalRegion(pos):
            # relies on intervals being ordered by start
            for start, end in intervals:
                if start == pos or end == pos:
                    continue
                if start <= pos < end:
                    return True
                if start > pos:
                    return False
            return False

        # deal with boundary cases - end of contig
        if region_id < len(groups) - 1:
            nxt = groups[region_id + 1]
            if nxt[0].contig == group[0].contig:
                next_start = min([x.start for x in nxt])
            else:
                next_start = fasta.getLength(group[0].contig)
                reset = True
        else:
            next_start = fasta.getLength(group[0].contig)
            reset = True

        # last_end = end of the basal regions of the previous group
        # next_start = start of the basal regions of the next group

        # extend region to previous/next group always extend
        # downstream, but upstream only extend if basal region of an
        # interval is not already overlapping another basal region
        # within the group
        save_end = 0
        for gtf in group:
            save_end = max(save_end, gtf.end)
            if gtf.strand == "+":
                if not overlapsBasalRegion(gtf.start):
                    gtf.start = max(gtf.start - radius, last_end)
                # always extend downstream
                gtf.end = min(gtf.end + radius, next_start)
            else:
                # always extend downstream
                gtf.start = max(gtf.start - radius, last_end)
                if not overlapsBasalRegion(gtf.end):
                    gtf.end = min(gtf.end + radius, next_start)
            outfile.write(str(gtf) + "\n")
            counter.regulons += 1

        if len(group) > 1:
            counter.overlaps += len(group)
        else:
            counter.nonoverlaps += 1

        # start from 0 again when the next group is on a new contig
        if reset:
            last_end = 0
            reset = False
        else:
            last_end = save_end

    E.info("%s" % str(counter))
Esempio n. 9
0
         margin_sbjct_from, margin_sbjct_to,
         query_token, sbjct_token, sbjct_strand ) = segment

        try:
            lgenome = fasta.getLength( sbjct_token )
        except KeyError:
            nunknown += 1
            if sbjct_token not in unknown: unknown[sbjct_token] = 0
            unknown[sbjct_token] += 1
            continue
            
        min_sbjct_to = min( min_sbjct_to, lgenome )
        margin_sbjct_to = min( margin_sbjct_to, lgenome )

        if options.forward_coordinates:
            if Genomics.IsNegativeStrand( sbjct_strand ):
                margin_sbjct_from, margin_sbjct_to = lgenome-margin_sbjct_to, lgenome-margin_sbjct_from
                min_sbjct_from, min_sbjct_to = lgenome-min_sbjct_to, lgenome-min_sbjct_from            
        
        if options.no_sequence:
            fragment = ""
        else:
            # get genomic sequence
            fragment = fasta.getSequence( sbjct_token, sbjct_strand,
                                          margin_sbjct_from, margin_sbjct_to,
                                          as_array = False)
        
        if peptide_sequences.has_key( query_token ):
            peptide_sequence = peptide_sequences[query_token]
        else:
            peptide_sequence = None
Esempio n. 10
0
def processEntries(name, entries, options, fasta, contigs):
    """Write the entries of one transcript as tab-separated rows.

    ``entries`` is sorted in place into the order of transcription,
    using the strand of its first element.  If any entry reaches beyond
    the contig length given in ``contigs``, a warning is issued, nothing
    is written and False is returned.

    With ``options.convert_to_cds`` a single row spanning the summed
    length of all entries is written.  Otherwise one row per entry is
    written, containing its rank, its cumulative position within the
    transcript and its coordinates.  If ``options.reset_coordinates``
    is set, the coordinates of the entries are shifted in place so that
    the smallest start becomes 0.

    ``fasta`` is not used.  Returns True if output was written.
    """

    is_negative = Genomics.IsNegativeStrand(entries[0].strand)
    contig = entries[0].contig
    lcontig = contigs[contig]

    # sort in-order in transcript
    entries.sort(key=lambda x: x.start)
    if is_negative:
        entries.reverse()

    for gff in entries:
        if gff.end > lcontig or gff.start >= lcontig:
            E.warn("coordinates for %s on %s out of bounds (%i:%i > %i)" %
                   (str(gff.mAttributes), contig, gff.start, gff.end, lcontig))
            return False

    write = options.stdout.write

    if options.convert_to_cds:
        total = 0
        for gff in entries:
            total += gff.end - gff.start

        row = (name, entries[0].contig, "+", 1, 0, 0, total, 0, total)
        write("\t".join(map(str, row)) + "\n")
        return True

    if not options.reset_coordinates:
        offset = 0
    elif is_negative:
        # entries run from high to low coordinates, so the last one
        # has the smallest start
        offset = -(entries[-1].start)
    else:
        offset = -(entries[0].start)

    cds_start = cds_end = 0
    for rank, gff in enumerate(entries, 1):
        cds_end += gff.end - gff.start

        if offset:
            gff.start += offset
            gff.end += offset

        row = (name, gff.contig, gff.strand, gff.frame, rank,
               cds_start, cds_end, gff.start, gff.end)
        write("\t".join(map(str, row)) + "\n")

        cds_start = cds_end

    return True