Python GTF.Entry 예제들, CGAT.GTF.Entry Python 예제들

예제 #1

0

파일 보기

파일: annotator_distance_test.py 프로젝트: logust79/cgat-apps

    def setUp(self):
        """Write a regular gene workspace and segments that thin out within each window.

        The workspace file (``self.workspace``) receives one 10kb gene starting
        every 100kb on chr1/+.  The segments file (``self.segments``) receives,
        for every 100kb window starting at 110000, short segments (50-150 bases)
        whose spacing grows by a random amount, so they are densest at the start
        of each window (the original comment calls this "concentrated at 5' end").
        """
        AnnotatorDistanceCheck.setUp(self)

        # ``with`` guarantees the handle is closed even if writing fails.
        with open(self.workspace, "w") as outfile:
            e = GTF.Entry()
            e.contig, e.strand, e.gene_id, e.transcript_id = "chr1", "+", "gene1", "trans1"

            # 10kb genes, one starting every 100kb, up to 10Mb
            for x in range(100000, 10000000, 100000):
                e.gene_id, e.transcript_id = "gene%i" % x, "trans1"
                e.start, e.end = x, x + 10000
                outfile.write(str(e) + "\n")

        # segments: spacing starts at 200 and grows, so density drops along the window
        with open(self.segments, "w") as outfile:
            e = GTF.Entry()
            e.contig, e.strand = "chr1", "+"
            for x in range(110000, 10000000, 100000):
                y = x
                inc = 200
                while y < x + 100000:
                    e.gene_id, e.transcript_id = "gene%i" % (y), "trans1"
                    e.start, e.end = y, y + random.randint(50, 150)
                    outfile.write(str(e) + "\n")
                    y += inc
                    inc += random.randint(0, 100)

예제 #2

0

파일 보기

파일: annotator_distance_test.py 프로젝트: logust79/cgat-apps

    def setUp(self):
        """Create a temporary directory with a gene workspace and uniform segments.

        Sets ``self.tmpdir``, ``self.workspace`` and ``self.segments``.  The
        workspace holds one 10kb gene starting every 100kb on chr1/+; the
        segments file holds one segment of random length (50-150 bases)
        starting every 1kb from 0 up to 10Mb.
        """
        self.tmpdir = tempfile.mkdtemp()
        self.workspace = os.path.join(self.tmpdir, "workspace.gtf")
        self.segments = os.path.join(self.tmpdir, "segments.gtf")

        # ``with`` guarantees the handle is closed even if writing fails.
        with open(self.workspace, "w") as outfile:
            e = GTF.Entry()
            e.contig, e.strand, e.gene_id, e.transcript_id = "chr1", "+", "gene1", "trans1"

            # 10kb genes, one starting every 100kb, up to 10Mb
            for x in range(100000, 10000000, 100000):
                e.gene_id, e.transcript_id = "gene%i" % x, "trans1"
                e.start, e.end = x, x + 10000
                outfile.write(str(e) + "\n")

        # segments: uniformly distributed every 1kb with random length
        with open(self.segments, "w") as outfile:
            e = GTF.Entry()
            e.contig, e.strand = "chr1", "+"
            for x in range(0, 10000000, 1000):
                e.gene_id, e.transcript_id = "gene%i" % x, "trans1"
                e.start, e.end = x, x + random.randint(50, 150)
                outfile.write(str(e) + "\n")

예제 #3

0

파일 보기

def targetScanParse(infile, lnc_gtf):
    '''
    Turn tab-separated targetScan output into "MRE" GTF entries.

    ``lnc_gtf`` is read first to build a lookup of GTF entries keyed by
    transcript_id (later entries of the same transcript overwrite earlier
    ones).  The first line of ``infile`` is skipped as a header; for every
    other line one entry is yielded.  Columns used: 0 (quoted target id),
    1 (miRNA name, prefixed with "mmu-"), 3 and 4 (alignment start/end,
    relative to the target start) and 8 (seed class).

    Raises KeyError if a target id is missing from ``lnc_gtf``.
    '''
    targets = {}
    gtf_handle = IOTools.openFile(lnc_gtf)
    for transcript in GTF.transcript_iterator(GTF.iterator(gtf_handle)):
        for record in transcript:
            copied = GTF.Entry().copy(record)
            targets[copied.transcript_id] = copied
    gtf_handle.close()

    for line_number, raw in enumerate(infile, 1):
        fields = raw.split("\t")
        if line_number <= 1:
            # header line
            continue

        MRE = GTF.Entry()
        gene_id = fields[0].strip('"')
        target = targets[gene_id]
        align_start = int(fields[3])
        align_end = int(fields[4])
        miRNA = "mmu-%s" % fields[1]
        seed_class = fields[8]

        MRE.contig = target.contig
        MRE.feature = "MRE"
        MRE.start = target.start + align_start
        MRE.end = MRE.start + (align_end - align_start)
        MRE.source = target.source
        MRE.strand = target.strand
        MRE.addAttribute('miRNA', miRNA)
        MRE.addAttribute('target', gene_id)

        # fall back to "." when the target carries no exon number
        try:
            MRE.addAttribute('exon_number', target.asDict()['exon_number'])
        except KeyError:
            E.info("No exon number data in GTF for %s" % gene_id)
            MRE.addAttribute('exon_number', '.')

        if target.source != "protein_coding":
            try:
                MRE.addAttribute('exon_status',
                                 target.asDict()['exon_status'])
            except KeyError:
                E.info("No exon status data in GTF for  %s" % gene_id)
                MRE.addAttribute('exon_status', '.')
        else:
            MRE.addAttribute('exon_status', "protein_coding")

        location = (MRE.contig, MRE.start, MRE.end)
        MRE.transcript_id = "%s_%s:%i-%i" % ((gene_id,) + location)
        MRE.gene_id = "%s_%s:%i-%i" % ((miRNA,) + location)
        MRE.addAttribute('seed_class', seed_class)

        yield MRE

예제 #4

0

파일 보기

파일: fasta2gff.py 프로젝트: gsc0107/cgat

def main(argv=None):
    """Write randomly placed, fixed-size genome fragments, one entry per line.

    Coordinates are drawn with ``IndexedFasta.getRandomCoordinates``; for
    fragments on the "-" strand the coordinates are mirrored using the contig
    length.  With ``--as-gtf`` each entry also receives a gene_id and
    transcript_id built from a running number.

    :param argv: command line arguments; when None, ``E.Start`` falls back to
        its own default.
    """

    parser = E.OptionParser(
        version="%prog version: $Id: fasta2gff.py 2861 2010-02-23 17:36:32Z andreas $")

    parser.add_option("-g", "--genome-file", dest="genome_file", type="string",
                      help="filename with genome.")

    parser.add_option("-a", "--as-gtf", dest="as_gtf", action="store_true",
                      help="output as gtf.")

    parser.add_option("-f", "--fragment-size", dest="fragment_size", type="int",
                      help="fixed size of fragments [default=%default].")

    # the help text used to repeat the --fragment-size description
    parser.add_option("-s", "--sample-size", dest="sample_size", type="int",
                      help="number of fragments to sample [default=%default].")

    parser.set_defaults(
        as_gtf=False,
        genome_file=None,
        fragment_size=1000,
        sample_size=10000,
        pattern_id="%08i",
    )

    # honour an explicitly supplied argv instead of silently ignoring it
    if argv is None:
        (options, args) = E.Start(parser)
    else:
        (options, args) = E.Start(parser, argv=argv)

    fasta = IndexedFasta.IndexedFasta(options.genome_file)
    contigs = fasta.getContigSizes()

    # both output modes use the same entry type; only the ids differ
    entry = GTF.Entry()
    entry.feature = "exon"
    entry.source = "random"

    for n in range(options.sample_size):

        entry.contig, entry.strand, entry.start, entry.end = fasta.getRandomCoordinates(
            options.fragment_size)

        if entry.strand == "-":
            # mirror the coordinates on the contig
            contig_length = contigs[entry.contig]
            entry.start, entry.end = (contig_length - entry.end,
                                      contig_length - entry.start)

        if options.as_gtf:
            entry.gene_id = options.pattern_id % n
            entry.transcript_id = entry.gene_id

        options.stdout.write(str(entry) + "\n")

    E.Stop()

예제 #5

0

파일 보기

파일: gtf2gff.py 프로젝트: gsc0107/cgat

def annotateExons(iterator, fasta, options):
    """Write one entry per distinct exon interval of each gene.

    Every output entry is annotated with the number of transcripts in the
    gene (``ntranscripts``), how many transcripts use the interval
    (``nused``) and, per use, ``rank:total`` exon positions (``pos``).
    Exons are ranked in transcription order (reversed on the negative
    strand).  ``fasta`` is accepted but not used here.
    """

    ninput = noutput = noverlapping = 0

    for gene in GTF.gene_iterator(iterator):
        ninput += 1
        usage = collections.defaultdict(list)
        ntranscripts = len(gene)

        is_negative_strand = Genomics.IsNegativeStrand(gene[0][0].strand)

        for transcript in gene:
            # order exons in direction of transcription
            transcript.sort(key=lambda exon: exon.start)
            if is_negative_strand:
                transcript.reverse()

            total = len(transcript)
            for rank, exon in enumerate(transcript, 1):
                usage[(exon.start, exon.end)].append((rank, total))

        # template is taken after sorting, as the first exon may have changed
        template = GTF.Entry()
        template.fromGTF(gene[0][0], gene[0][0].gene_id, gene[0][0].gene_id)
        template.addAttribute("ntranscripts", ntranscripts)

        annotated = []
        for coords, positions in usage.items():
            record = GTF.Entry().copy(template)
            record.start, record.end = coords
            record.addAttribute("nused", len(positions))
            record.addAttribute(
                "pos", ",".join(["%i:%i" % p for p in positions]))
            annotated.append(record)

        annotated.sort(key=lambda record: record.start)

        for record in annotated:
            options.stdout.write("%s\n" % str(record))

        # a gene has overlapping exons if merging changes the interval count
        spans = [(record.start, record.end) for record in annotated]
        if len(Intervals.combine(spans)) != len(spans):
            noverlapping += 1

        noutput += 1

    if options.loglevel >= 1:
        options.stdlog.write(
            "# ninput=%i, noutput=%i, noverlapping=%i\n" % (ninput, noutput, noverlapping))

예제 #6

0

파일 보기

def UTR5(transcript):
    """Return the 5' UTR of *transcript* as a list of GTF entries.

    The exonic ranges are truncated by the CDS ranges; of the remainder,
    the parts lying before the CDS in transcription direction are kept
    (at or after the last CDS end on the "-" strand, at or before the first
    CDS start otherwise).  Each part is returned as a copy of the
    transcript's first "exon" entry with adjusted coordinates.  Returns an
    empty list if the transcript has no CDS.
    """

    exon_ranges = GTF.asRanges(transcript, "exon")
    cds = GTF.asRanges(transcript, "CDS")

    noncoding = Intervals.truncate(exon_ranges, cds)

    if len(cds) == 0:
        return list()

    if transcript[0].strand == "-":
        last_cds_end = cds[-1][1]
        utr5 = [part for part in noncoding if part[0] >= last_cds_end]
    else:
        first_cds_start = cds[0][0]
        utr5 = [part for part in noncoding if part[-1] <= first_cds_start]

    # first exon entry serves as the template for all returned entries
    for candidate in transcript:
        if candidate.feature == "exon":
            template_exon = candidate
            break

    result = []
    for part in utr5:
        entry = GTF.Entry().fromGTF(template_exon)
        entry.start, entry.end = part[0], part[1]
        result.append(entry)

    return result

예제 #7

0

파일 보기

                def writeGFF(blocks, first, filename):
                    """Write one "synteny" entry per block to *filename*.

                    :param blocks: iterable of block objects providing
                        ``contig1/mFrom1/mTo1``, ``contig2/mFrom2/mTo2`` and
                        ``mBlockId``.
                    :param first: if true, use the first genome's coordinates
                        of each block, otherwise the second genome's.
                    :param filename: path of the file to (over)write.

                    A progress message is written to ``outfile`` from the
                    enclosing scope.
                    """

                    outfile.write("writing gff entries to %s\n" % filename)

                    entry = GTF.Entry()
                    entry.source = "gpipe"
                    entry.feature = "synteny"

                    # ``with`` closes the file even if writing an entry fails
                    with open(filename, "w") as outfile_gff:
                        for block in blocks:
                            # NOTE(review): other snippets set ``contig`` and use
                            # addAttribute(); confirm ``name``/``info`` are
                            # honoured by str(entry).
                            if first:
                                entry.name = block.contig1
                                entry.start = block.mFrom1
                                entry.end = block.mTo1
                            else:
                                entry.name = block.contig2
                                entry.start = block.mFrom2
                                entry.end = block.mTo2

                            entry.info = "Block=%i" % block.mBlockId

                            outfile_gff.write(str(entry) + "\n")

예제 #8

0

파일 보기

    def update( self, bed ):
        """Classify a single bed interval by treating it as a one-exon transcript."""

        # wrap the bed interval in a GTF entry flagged as an exon
        exon = GTF.Entry()
        exon.fromBed( bed )
        exon.feature = 'exon'

        single_exon_transcript = [exon]
        gtf2table.Classifier.update( self, single_exon_transcript )

예제 #9

0

파일 보기

파일: annotator_distance_test.py 프로젝트: logust79/cgat-apps

    def setUp( self ):
        """Write a workspace of three two-exon genes on chr1.

        gene1 and gene2 are on the "+" strand, gene3 on the "-" strand; every
        exon is 1kb long and all exons share the transcript id "trans1".
        """

        AnnotatorDistanceCheck.setUp(self)

        # (strand, gene_id, exon intervals) in output order
        genes = (
            ( "+", "gene1", ( (0, 1000), (3000, 4000) ) ),
            ( "+", "gene2", ( (10000, 11000), (13000, 14000) ) ),
            ( "-", "gene3", ( (20000, 21000), (23000, 24000) ) ),
        )

        # ``with`` guarantees the handle is closed even if writing fails.
        with open( self.workspace, "w" ) as outfile:
            e = GTF.Entry()
            for strand, gene_id, exons in genes:
                e.contig, e.strand, e.gene_id, e.transcript_id = "chr1", strand, gene_id, "trans1"
                for start, end in exons:
                    e.start, e.end = start, end
                    outfile.write( str(e) + "\n" )

예제 #10

0

파일 보기

파일: gtf2gff.py 프로젝트: yangjl/cgat

def annotateRegulons( iterator, fasta, tss, options ):
    """Write a regulon interval around the start or end of each transcript.

    If *tss* is true the interval is anchored at the transcription start
    site, otherwise at the termination site.  It extends
    ``options.upstream`` bases upstream and ``options.downstream`` bases
    downstream (strand-aware) and is clipped to ``[0, contig length]``.
    With ``options.merge_promotors`` the intervals of a gene are merged and
    renumbered from 1.
    """

    ngenes = ntranscripts = nregulons = 0

    upstream, downstream = options.upstream, options.downstream

    for gene in GTF.gene_iterator( iterator ):
        ngenes += 1
        is_negative_strand = Genomics.IsNegativeStrand( gene[0][0].strand )
        lcontig = fasta.getLength( gene[0][0].contig )

        # in genomic coordinates, "upstream" lies to the right on the minus strand
        if is_negative_strand:
            left, right = downstream, upstream
        else:
            left, right = upstream, downstream

        regulons = []
        transcript_ids = []
        for transcript in gene:
            ntranscripts += 1
            mi = min( [x.start for x in transcript] )
            ma = max( [x.end for x in transcript] )

            if tss:
                # transcription start: rightmost base on the minus strand
                anchor = ma if is_negative_strand else mi
            else:
                # transcription end: leftmost base on the minus strand
                anchor = mi if is_negative_strand else ma

            lower = min( lcontig, max( 0, anchor - left ) )
            upper = min( lcontig, max( 0, anchor + right ) )

            regulons.append( (lower, upper) )
            transcript_ids.append( transcript[0].transcript_id )

        if options.merge_promotors:
            # merging may change number and order, so ids are renumbered
            regulons = Intervals.combine( regulons )
            transcript_ids = ["%i" % (x+1) for x in range(len(regulons))]

        gtf = GTF.Entry()
        gtf.fromGTF( gene[0][0], gene[0][0].gene_id, gene[0][0].gene_id )
        gtf.source = "regulon"

        for (start, end), transcript_id in zip( regulons, transcript_ids ):
            gtf.start, gtf.end = start, end
            gtf.transcript_id = transcript_id
            options.stdout.write( "%s\n" % str(gtf) )
            nregulons += 1

    E.info( "ngenes=%i, ntranscripts=%i, nregulons=%i" % (ngenes, ntranscripts, nregulons) )

예제 #11

0

파일 보기

파일: bed2table.py 프로젝트: gsc0107/cgat

    def update(self, bed):
        """Classify a single bed interval by treating it as a one-exon transcript."""

        # wrap the bed interval in a GTF entry flagged as an exon
        exon = GTF.Entry()
        exon.fromBed(bed)
        exon.feature = 'exon'

        single_exon_transcript = [exon]
        GeneModelAnalysis.Classifier.update(self, single_exon_transcript)

예제 #12

0

파일 보기

파일: snp2counts_test.py 프로젝트: logust79/cgat-apps

    def setUp(self):
        """Build a synthetic multi-exon gene and a matching indexed genome.

        Creates ``self.mNExons`` exons of ``self.mExonSize`` bases separated
        by introns of ``self.mIntronSize`` bases, starting at
        ``self.mOffset``, and stores them in ``self.mExons``.  A sequence
        with "123" repeats for exonic bases, "GT...AG" introns and C/G
        flanks is written to an indexed fasta (``self.mFasta``).
        ``self.mSplitCodonsPrev`` / ``self.mSplitCodonsNext`` record, for
        positions of codons split by an intron, the coordinate on the other
        side of the intron.
        """

        self.mExons = []

        self.mSplitCodonsNext = {}
        self.mSplitCodonsPrev = {}

        # NOTE(review): mSpliceSize is set but not read in this method; the
        # intron filler below uses the literal 4.
        self.mSpliceSize = 4
        self.mExonSize = 100
        self.mIntronSize = 900
        self.strand = "+"
        self.mNExons = 9
        self.mOffset = 1000
        # cumulative exonic length seen so far, used to derive the frame
        length = 0
        self.frame = 0
        # distance between the starts of consecutive exons
        self.mIncrement = self.mIntronSize + self.mExonSize

        # exonic sequence only: "123" repeated, one character per exonic base
        seq = list("123" * int((self.mNExons * self.mExonSize) / 3))

        exon_id = 0

        start = self.mOffset
        for x in range(self.mNExons):

            e = GTF.Entry()
            # NOTE(review): strand is hard-coded to "+" rather than self.strand
            e.contig, e.strand, e.gene_id, e.transcript_id = "chr1", "+", "gene1", "trans1"
            e.start, e.end = start, start + self.mExonSize
            # bases needed at the exon start to complete the codon begun upstream
            e.frame = (3 - (length % 3)) % 3
            length += e.end - e.start
            self.mExons.append(e)
            if e.frame != 0:
                # first e.frame bases of this exon map to the end of the
                # previous exon (start - mIntronSize)
                for y in range(0, e.frame):
                    self.mSplitCodonsPrev[start + y] = start - self.mIntronSize
                # last (3 - e.frame) bases of the previous exon map to the
                # start of this exon
                for y in range(0, 3 - e.frame):
                    self.mSplitCodonsNext[
                        start - self.mIntronSize - y - 1] = start

            exon_id += 1
            if exon_id < self.mNExons:
                # index right after exon_id exons and (exon_id - 1) introns
                p = exon_id * self.mExonSize + self.mIntronSize * (exon_id - 1)
                # each insertion at p pushes the previous one right, so the
                # pieces are inserted in reverse: the result reads GT + T... + AG
                seq[p:p] = list("AG")
                seq[p:p] = list("T" * (self.mIntronSize - 4))
                seq[p:p] = list("GT")

            start += self.mIncrement
            # print str(e)
        # print self.mSplitCodonsNext
        # print self.mSplitCodonsPrev
        # prepend mOffset "C" characters (slice assignment iterates the string)
        seq[0:0] = "C" * self.mOffset
        # appended as one multi-character list element; the join below flattens it
        seq.append("G" * self.mOffset)
        # the temporary file is closed immediately; only its name is reused as
        # the path for the indexed fasta database
        tmpfile = tempfile.NamedTemporaryFile()
        tmpfile.close()

        seq = "".join(seq)
        self.mSequence = seq
        self.contigSize = len(seq)
        IndexedFasta.createDatabase(tmpfile.name, iter([("chr1", seq), ]))
        self.mFasta = IndexedFasta.IndexedFasta(tmpfile.name)

예제 #13

0

파일 보기

파일: gtf2gff.py 프로젝트: gsc0107/cgat

 def _add(interval, anno):
     """Append an entry of feature type *anno* covering *interval* to ``results``.

     Contig, ids and strand are taken from the first entry of ``transcript``;
     both ``transcript`` and ``results`` come from the enclosing scope.
     """
     template = transcript[0]
     segment = GTF.Entry()
     segment.contig = template.contig
     segment.gene_id = template.gene_id
     segment.transcript_id = template.transcript_id
     segment.strand = template.strand
     segment.feature = anno
     segment.start, segment.end = interval
     results.append(segment)

예제 #14

0

파일 보기

파일: gff2table.py 프로젝트: kathrinjansen/cgat

    def test_entry(frame, strand, xfrom, xto, start, end, ref):
        """Check ``transform_third_codon`` for a single entry against *ref*.

        Builds an entry with the given *frame* and *strand* spanning
        ``[xfrom, xto)``, runs ``transform_third_codon(start, end, ...)`` on
        it and prints a diagnostic if the result differs from *ref*.
        """

        entry = GTF.Entry()
        entry.frame = frame
        entry.strand = strand
        entry.start = xfrom
        entry.end = xto

        intervals = transform_third_codon(start, end, [(xfrom, xto, entry)])
        if ref != intervals:
            # report both values; the old message only printed the (always
            # True) result of the comparison
            print("failed: expected %s, got %s" % (str(ref), str(intervals)))

예제 #15

0

파일 보기

파일: gtf2gff.py 프로젝트: gsc0107/cgat

def annotateTTS(iterator, fasta, options):
    """Write an interval of ``options.promotor`` bases next to each termination site.

    For every transcript the interval is placed beyond the transcript end
    in transcription direction (left of the minimum start on the negative
    strand, right of the maximum end otherwise) and clipped at the contig
    boundaries.  With ``options.merge_promotors`` the intervals of a gene
    are merged and renumbered from 1.

    Entries specified with ``--restrict-source`` are annotated.
    """

    ngenes = ntranscripts = npromotors = 0

    for gene in GTF.gene_iterator(iterator):
        ngenes += 1
        is_negative_strand = Genomics.IsNegativeStrand(gene[0][0].strand)
        lcontig = fasta.getLength(gene[0][0].contig)
        tts = []
        transcript_ids = []
        for transcript in gene:
            ntranscripts += 1
            mi = min([x.start for x in transcript])
            ma = max([x.end for x in transcript])
            transcript_ids.append(transcript[0].transcript_id)

            flank = options.promotor
            # if the tts is directly at the start/end of the contig, the
            # interval will be within an exon.  otherwise, it is outside.
            if is_negative_strand:
                window = (max(0, mi - flank), max(flank, mi))
            else:
                window = (min(ma, lcontig - flank), min(lcontig, ma + flank))
            tts.append(window)

        if options.merge_promotors:
            # merging may change number and order, so ids are renumbered
            tts = Intervals.combine(tts)
            transcript_ids = ["%i" % (x + 1) for x in range(len(tts))]

        gtf = GTF.Entry()
        gtf.fromGTF(gene[0][0], gene[0][0].gene_id, gene[0][0].gene_id)
        gtf.source = "tts"

        for (start, end), transcript_id in zip(tts, transcript_ids):
            gtf.start, gtf.end = start, end
            gtf.transcript_id = transcript_id
            options.stdout.write("%s\n" % str(gtf))
            npromotors += 1

    if options.loglevel >= 1:
        options.stdlog.write(
            "# ngenes=%i, ntranscripts=%i, ntss=%i\n" %
            (ngenes, ntranscripts, npromotors))

예제 #16

0

파일 보기

def CDS(transcript):
    """Return the CDS parts of *transcript* as entries with feature "exon".

    Each "CDS" entry is copied into a new GTF entry whose feature is reset
    to "exon".  Returns an empty list if the transcript has no CDS entries.
    """

    coding = [entry for entry in transcript if entry.feature == "CDS"]

    if not coding:
        return []

    relabelled = []
    for segment in coding:
        copied = GTF.Entry().fromGTF(segment)
        copied.feature = "exon"
        relabelled.append(copied)

    return relabelled

예제 #17

0

파일 보기

파일: annotator_distance_test.py 프로젝트: logust79/cgat-apps

    def setUp(self):
        """Write genes with growing intergenic gaps and segments filling each gap.

        The workspace receives 10kb genes on chr1/+; the gap after each gene
        grows by a random increment (0-10000).  Each gap is filled with
        segments of 50-150 bases whose spacing grows by a random amount
        (0-100) per step.
        """
        AnnotatorDistanceCheck.setUp(self)

        w = GTF.Entry()
        w.contig, w.strand, w.gene_id, w.transcript_id = "chr1", "+", "gene1", "trans1"
        e = GTF.Entry()
        e.contig, e.strand = "chr1", "+"

        # ``with`` guarantees both handles are closed even if writing fails.
        with open(self.workspace, "w") as work_outfile, \
                open(self.segments, "w") as segs_outfile:

            # 10kb genes, size of intergenic space grows by random increment
            x = 0
            w_inc = 0
            while x < 10000000:

                # the gene goes to the workspace via ``w``; previously the
                # coordinates were set on ``e`` and ``e`` was written, so the
                # ids assigned to ``w`` never reached the file
                w.gene_id, w.transcript_id = "gene%i" % x, "trans1"
                w.start, w.end = x, x + 10000
                work_outfile.write(str(w) + "\n")

                x += 10000
                w_inc += random.randint(0, 10000)
                end = x + w_inc
                y = x
                # spacing starts at 0, so the first position of a gap can be
                # written more than once until the increment becomes positive
                s_inc = 0
                while y < end:
                    e.gene_id, e.transcript_id = "gene%i" % (y), "trans1"
                    e.start, e.end = y, y + random.randint(50, 150)
                    segs_outfile.write(str(e) + "\n")
                    y += s_inc
                    s_inc += random.randint(0, 100)

                x = end

예제 #18

0

파일 보기

    def process(self, contig, start, end, reads, qualities):
        """Write one "exon" entry for the block ``contig:start-end``.

        The entry's score is the mean of *reads* formatted with two
        decimals; it is written to ``self.mOutFile``.  *qualities* is
        accepted but not used here.
        """

        block = GTF.Entry()
        block.start = start
        block.end = end
        # NOTE(review): ``id`` is not a parameter or local of this method; unless
        # the module defines its own ``id``, this formats the builtin function.
        # Confirm which counter was intended.
        block.gene_id = self.mIdFormat % id
        block.transcript_id = block.gene_id
        block.contig = contig
        block.feature = "exon"
        block.source = "maq"

        summary = Stats.Summary(reads)
        mean_coverage = summary['mean']
        block.score = "%5.2f" % mean_coverage

        self.mOutFile.write(str(block) + "\n")

예제 #19

0

파일 보기

def buildRepeatTrack( infile, outfile ):
    '''build a repeat track as negative control.

    Reads the gzipped GFF file *infile* twice: once to count the entries and
    once to write a random sample of ``PARAMS["ancestral_repeats_samplesize"]``
    of them to the gzipped file *outfile*.  Each sampled entry gets its
    zero-padded input position as both ids.
    '''

    # first pass: count entries; the handle is closed explicitly
    inf = gzip.open( infile, "r" )
    try:
        nrepeats = sum( 1 for gff in GFF.iterator( inf ) )
    finally:
        inf.close()

    sample = set( random.sample( xrange( nrepeats ), PARAMS["ancestral_repeats_samplesize"] ) )

    # second pass: write the sampled entries; try/finally closes both
    # handles even if conversion or writing fails
    gtf = GTF.Entry()
    inf = gzip.open( infile, "r" )
    try:
        outf = gzip.open( outfile, "w" )
        try:
            for x, gff in enumerate( GFF.iterator( inf ) ):
                if x not in sample:
                    continue
                gtf.fromGFF( gff, "%08i" % x, "%08i" % x )
                outf.write( "%s\n" % str(gtf) )
        finally:
            outf.close()
    finally:
        inf.close()

    E.debug( "created sample of %i repeats out of %i in %s" % (len(sample), nrepeats, outfile))

예제 #20

0

파일 보기

def tts(transcript, upstream=500, downstream=500):
    """Return a window around the transcription termination site.

    The window spans *upstream* bases before and *downstream* bases after
    the transcript end (maximum exon end on the "+" strand, minimum exon
    start otherwise).  It is returned as a one-element list holding a copy
    of the first exon entry with adjusted coordinates.

    Raises IndexError if the transcript has no "exon" entries.
    """

    exons = [entry for entry in transcript if entry.feature == "exon"]
    template = exons[0]

    if template.strand != "+":
        # termination site is the leftmost exon start
        end = min(exon.start for exon in exons) + upstream
        start = end - upstream - downstream
    else:
        # termination site is the rightmost exon end
        start = max(exon.end for exon in exons) - upstream
        end = start + upstream + downstream

    window = GTF.Entry().fromGTF(template)
    window.start, window.end = start, end

    return [window]

예제 #21

0

파일 보기

def convert_set(gffs, gene_pattern, transcript_pattern, options):
    '''Write *gffs* as GTF, building gene_id and transcript_id from patterns.

    :param gffs: iterable of GFF entries; each is modified in place.
    :param gene_pattern: %-format string applied to the entry's attribute
        dictionary to build the gene_id.
    :param transcript_pattern: %-format string applied to the entry's
        attribute dictionary to build the transcript_id.
    :param options: object whose ``stdout`` receives the GTF lines.
    '''

    for gff in gffs:

        gff.gene_id = str(gene_pattern) % gff.asDict()
        # use the transcript pattern here; the gene pattern was applied twice
        # before, leaving ``transcript_pattern`` without any effect
        gff.transcript_id = str(transcript_pattern) % gff.asDict()

        gtf_entry = GTF.Entry()

        gtf_entry.copy(gff)
        if "Parent" in gtf_entry:
            # flatten the parent collection into a single comma-separated value
            gtf_entry['Parent'] = ",".join(gtf_entry['Parent'])

        options.stdout.write(str(gtf_entry) + "\n")

예제 #22

0

파일 보기

def flank3(transcript, length=500):
    """Return the *length* bases flanking the 3' end of *transcript*.

    The flank starts at the maximum exon end on the "+" strand and ends at
    the minimum exon start otherwise.  It is returned as a one-element list
    holding a copy of the first exon entry with adjusted coordinates.

    Raises IndexError if the transcript has no "exon" entries.
    """

    exons = [entry for entry in transcript if entry.feature == "exon"]
    template = exons[0]

    if template.strand != "+":
        # flank lies to the left of the leftmost exon
        end = min(exon.start for exon in exons)
        start = end - length
    else:
        # flank lies to the right of the rightmost exon
        start = max(exon.end for exon in exons)
        end = start + length

    flank = GTF.Entry().fromGTF(template)
    flank.start, flank.end = start, end

    return [flank]

예제 #23

0

파일 보기

def filterMREsTSV(input_file, filter_set):
    '''
    Filter MREs in a GTF file based on a collection of miRNA IDs.

    Yields a copy of every entry whose ``miRNA`` attribute is contained in
    *filter_set*.  The input file is closed when the generator finishes,
    is closed early or fails.

    Raises KeyError if an entry has no ``miRNA`` attribute.
    '''

    mre_file = IOTools.openFile(input_file, "rb")
    try:
        for transcript in GTF.transcript_iterator(GTF.iterator(mre_file)):
            for mre in transcript:
                if mre.asDict()['miRNA'] in filter_set:
                    entry = GTF.Entry()
                    entry.copy(mre)
                    yield entry
    finally:
        # also runs when the consumer stops iterating early
        mre_file.close()

예제 #24

0

파일 보기

파일: PipelineRI.py 프로젝트: sudlab/pipeline_retained_introns

def findRetainedIntrons(infile, outfile):
    """Write introns of a gene that are retained in another of its transcripts.

    For every ordered pair of transcripts of a gene, introns present in the
    first but not in the second are collected if the second transcript has
    no introns of its own that are missing from the first and the intron
    lies strictly inside the second transcript's exon span.  The collected
    introns are merged per gene and written to *outfile* as "exon" entries
    copied from the gene's first entry.
    """

    # ``with`` closes both files, including when an error occurs
    with IOTools.openFile(outfile, "w") as outf, \
            IOTools.openFile(infile) as inf:

        for gene in GTF.gene_iterator(GTF.iterator(inf)):

            # sort exons and derive introns once per transcript instead of
            # once per transcript pair
            prepared = []
            for transcript in gene:
                exons = sorted(
                    [entry for entry in transcript if entry.feature == "exon"],
                    key=lambda x: x.start)
                prepared.append((exons, set(GTF.toIntronIntervals(exons))))

            introns_out = []

            # now find if any of the transcripts are retained intron
            # versions of any of the others
            for first, second in itertools.product(prepared, prepared):
                first_introns = first[1]
                second_exons, second_introns = second

                if len(first_introns - second_introns) > 0 and \
                   len(second_introns - first_introns) == 0:
                    # keep introns strictly inside the span of the second
                    # transcript; a list (not ``filter``) so that it can be
                    # iterated on Python 3 as well
                    novel_introns = [
                        intron for intron in first_introns - second_introns
                        if intron[0] > second_exons[0].start and
                        intron[1] < second_exons[-1].end]

                    introns_out.extend(novel_introns)

            introns_out = Intervals.combine(introns_out)
            template = gene[0][0]
            template.feature = "exon"
            for gff in introns_out:
                entry = GTF.Entry().copy(template)
                entry.start = gff[0]
                entry.end = gff[1]
                outf.write("%s\n" % str(entry))

예제 #25

0

파일 보기

def introns(transcript):
    """Return the introns of *transcript* as a list of GTF entries.

    Intron intervals come from ``GTF.toIntronIntervals``; each is returned
    as a copy of the transcript's first "exon" entry with the intron's
    coordinates.
    """

    intron_intervals = GTF.toIntronIntervals(transcript)

    # first exon entry serves as the template for all returned entries
    for candidate in transcript:
        if candidate.feature == "exon":
            template_exon = candidate
            break

    result = []
    for interval in intron_intervals:
        entry = GTF.Entry().fromGTF(template_exon)
        entry.start, entry.end = interval[0], interval[1]
        result.append(entry)

    return result

예제 #26

0

파일 보기

def filter_overlapping_genes(infile, outfile):
    '''Filter out exons that overlap with exons from another gene.

    Runs a bedtools/gtf2gtf shell pipeline on *infile*, then rewrites the
    result to *outfile* with an ``exon_id`` attribute numbering the exons
    of each transcript from 1.
    '''

    tmp1 = P.getTempFilename()
    tmp2 = P.getTempFilename(shared=True)

    # The first command in the statement truncates exons that overlap
    # on opposite strands. The second handles exons that overlap on the
    # same strand: its first part identifies exons that overlap on the
    # same strand, its second part removes them from the geneset.
    statement = ''' bedtools subtract -a %(infile)s -b %(infile)s -S > %(tmp1)s;

                    checkpoint;

                    bedtools merge -i <( sort -k1,1 -k4,4n %(tmp1)s)
                                   -c 6 -o count -d -2
                  | awk '$4>1'
                  | bedtools subtract -a %(tmp1)s -b stdin
                  | python %(scriptsdir)s/gtf2gtf.py
                           --method=set-transcript-to-gene
                           -L %(outfile)s.log
                  | python %(scriptsdir)s/gtf2gtf.py
                           --method=sort -L %(outfile)s.log
                  | gzip > %(tmp2)s;

                    checkpoint;

                    rm %(tmp1)s'''

    # NOTE(review): P.run() is called without arguments; presumably it picks
    # up ``statement`` and the %(...)s values from this function's local
    # names - confirm before renaming any local variable.
    P.run()

    # renumber exons as new exons have probably been created.
    with IOTools.openFile(outfile, "w") as outf:
        for transcript in GTF.transcript_iterator(
                GTF.iterator(IOTools.openFile(tmp2))):
            # 1-based position of the exon within its transcript
            nexon = 0
            for exon in transcript:
                nexon += 1
                exon = GTF.Entry().fromGTF(exon)
                exon["exon_id"] = int(nexon)
                outf.write(str(exon) + "\n")

    # the intermediate gzipped result is no longer needed
    os.unlink(tmp2)

예제 #27

0

파일 보기

파일: annotator_distance_test.py 프로젝트: logust79/cgat-apps

    def setUp( self ):
        """Write a workspace of two genes with three single-exon transcripts each.

        Gene ``x`` starts at ``x * 10000``; its transcripts are 100 bases long
        and start 1000 bases apart.  The strand alternates between genes,
        starting with "+".
        """

        AnnotatorDistanceCheck.setUp(self)

        e = GTF.Entry()
        e.contig, e.strand = "chr1", "+"

        inc, size = 1000, 100
        # ``with`` guarantees the handle is closed even if writing fails.
        with open( self.workspace, "w" ) as outfile:
            for x in range( 0, 2 ):
                start = x * 10000

                for y in range( 0, 3 ):
                    e.gene_id, e.transcript_id = "gene_%i" % x, "transcript_%i" % y
                    e.start, e.end = start, start + size
                    outfile.write( str(e) + "\n" )
                    start += inc

                # flip the strand for the next gene
                e.strand = "-" if e.strand == "+" else "+"

예제 #28

0

파일 보기

파일: gtf2gff.py 프로젝트: yangjl/cgat

def addSegment( feature, start, end, template, options ):
    """Write a generic segment of type *feature* to ``options.stdout``.

    :param feature: feature name of the new segment.
    :param start: segment start.
    :param end: segment end; nothing is written if ``start >= end``.
    :param template: entry to copy contig, strand and ids from, or a tuple of
        two entries, in which case the first is copied and the gene_id of
        the second is stored as ``downstream_gene_id``.
    :param options: object whose ``stdout`` receives the entry.
    :return: 1 if a segment was written, 0 otherwise.
    """
    if start >= end:
        return 0

    entry = GTF.Entry()

    # isinstance works on Python 2 and 3 (types.TupleType exists only in
    # Python 2) and also accepts tuple subclasses
    if isinstance( template, tuple ):
        entry.copy( template[0] )
        entry.clearAttributes()
        entry.addAttribute( "downstream_gene_id", template[1].gene_id )
    else:
        entry.copy( template )
        entry.clearAttributes()

    entry.start, entry.end = start, end
    entry.feature = feature
    # the score is blanked for everything that is not an exon-like feature
    if feature not in ("exon", "CDS", "UTR", "UTR3", "UTR5"):
        entry.score = "."
    options.stdout.write( str(entry) + "\n" )

    return 1

예제 #29

0

파일 보기

파일: pipeline_hvc.py 프로젝트: gsc0107/cgat

def exportSegments(infiles, outfile):
    """Export the segments assigned to a track from the database as GTF.

    The track name is *outfile* without its ".gtf" suffix and is used as a
    column of the ``assignments`` table.  *infiles* is accepted but not
    used here.  Attributes other than gene_id and transcript_id are not
    exported.
    """

    track = outfile[:-len(".gtf")]

    # NOTE(review): ``track`` is interpolated into the SQL as a column name
    # (identifiers cannot be bound as parameters); it must come from a
    # trusted file name.
    statement = (
        "SELECT DISTINCT contig, end, feature, frame, s.gene_id, score, "
        "source, start, strand, transcript_id "
        "FROM segments AS s, assignments AS a "
        "WHERE a.gene_id = s.gene_id and a.%(track)s" % {"track": track})

    dbhandle = sqlite3.connect(PARAMS["database"])
    try:
        cc = dbhandle.cursor()
        cc.execute(statement)

        # the output is only created once the query has succeeded, and is
        # closed even if writing fails
        with open(outfile, "w") as outf:
            for row in cc:
                gtf = GTF.Entry()
                # column order matches the SELECT list above
                (gtf.contig, gtf.end, gtf.feature, gtf.frame, gtf.gene_id,
                 gtf.score, gtf.source, gtf.start, gtf.strand,
                 gtf.transcript_id) = row
                outf.write(str(gtf) + "\n")
    finally:
        # the connection was previously never closed
        dbhandle.close()

예제 #30

0

파일 보기

def main():
    '''
    main function
    '''
    parser = E.OptionParser(
        version=
        "%prog version: $Id: gtf2tsv.py 2887 2010-04-07 08:48:04Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option(
        "-o",
        "--only-attributes",
        dest="only_attributes",
        action="store_true",
        help="output attributes as separate columns [default=%default].")
    parser.add_option(
        "-f",
        "--full",
        dest="full",
        action="store_true",
        help="output attributes as separate columns [default=%default].")
    parser.add_option(
        "-i",
        "--invert",
        dest="invert",
        action="store_true",
        help="convert tab-separated table back to gtf [default=%default].")
    parser.add_option(
        "-m",
        "--map",
        dest="map",
        type="choice",
        choices=("transcript2gene", "peptide2gene", "peptide2transcript"),
        help="output a map mapping transcripts to genes [default=%default].")

    parser.set_defaults(
        only_attributes=False,
        full=False,
        invert=False,
        map=None,
    )

    (options, args) = E.Start(parser)

    if options.full:

        # output full table with column for each attribute
        attributes = set()
        data = []
        for gtf in GTF.iterator(options.stdin):
            data.append(gtf)
            attributes = attributes.union(set(gtf.keys()))

        # remove gene_id and transcript_id, as they are used
        # explicitely later
        attributes.difference_update(["gene_id", "transcript_id"])

        attributes = sorted(list(attributes))

        if options.only_attributes:
            header = ["gene_id", "transcript_id"] + attributes
        else:
            header = [
                "contig",
                "source",
                "feature",
                "start",
                "end",
                "score",
                "strand",
                "frame",
                "gene_id",
                "transcript_id",
            ] + attributes

        options.stdout.write("\t".join(header) + "\n")

        if options.only_attributes:
            for gtf in data:
                options.stdout.write("\t".join(
                    map(str, (
                        gtf.gene_id,
                        gtf.transcript_id,
                    ))))
                for a in attributes:
                    if a in ("gene_id", "transcript_id"): continue
                    try:
                        val = getattr(gtf, a)
                    except AttributeError:
                        val = ""
                    options.stdout.write("\t%s" % val)
                options.stdout.write("\n")
        else:
            for gtf in data:
                options.stdout.write("\t".join(
                    map(str, (
                        gtf.contig,
                        gtf.source,
                        gtf.feature,
                        gtf.start,
                        gtf.end,
                        gtf.score,
                        gtf.strand,
                        gtf.frame,
                        gtf.gene_id,
                        gtf.transcript_id,
                    ))))
                for a in attributes:
                    try:
                        val = getattr(gtf, a)
                    except AttributeError:
                        val = ""
                    options.stdout.write("\t%s" % val)
                options.stdout.write("\n")

    elif options.invert:

        gtf = GTF.Entry()
        header = None
        for line in options.stdin:
            if line.startswith("#"): continue
            data = line[:-1].split("\t")
            if not header:
                header = data
                map_header2column = dict([(y, x)
                                          for x, y in enumerate(header)])
                continue

            # fill gtf entry with data
            try:
                gtf.contig = data[map_header2column["contig"]]
                gtf.source = data[map_header2column["source"]]
                gtf.feature = data[map_header2column["feature"]]
                # subtract -1 to start for 0-based coordinates
                gtf.start = int(data[map_header2column["start"]])
                gtf.end = int(data[map_header2column["end"]])
                gtf.score = data[map_header2column["score"]]
                gtf.strand = data[map_header2column["strand"]]
                gtf.frame = data[map_header2column["frame"]]
                gtf.gene_id = data[map_header2column["gene_id"]]
                gtf.transcript_id = data[map_header2column["transcript_id"]]
                gtf.parseInfo(data[map_header2column["attributes"]], line)
            except KeyError, msg:
                raise KeyError("incomplete entry %s: %s: %s" %
                               (str(data), str(map_header2column), msg))
            # output gtf entry in gtf format
            options.stdout.write("%s\n" % str(gtf))