def makeTranscriptome(tranFN, outFN):
    """Write the spliced sequence of every transcript in tranFN to outFN.

    tranFN is read as tab-separated text.  The columns used here are:
        0  transcript id
        1  chromosome
        2  strand (converted with bioLibCG.switchStrandFormat)
        8  comma-separated exon starts
        9  comma-separated exon ends
        10 gene id

    For each transcript the exon sequences are fetched from the 'hg19'
    GenomeFetch object, concatenated in the order the exons are listed, and
    written to outFN as a '> tID:gID:length' header line followed by the
    sequence and an empty line.
    """
    p = bioLibCG.cgPrint()
    p.show = False
    gf = GenomeFetch.GenomeFetch('hg19')

    # Context managers guarantee both handles are closed even when a line
    # fails to parse or a sequence fetch raises part-way through the file.
    with open(outFN, 'w') as fOut, open(tranFN, 'r') as f:
        for line in f:
            line = line.strip()
            if not line:
                # A blank (e.g. trailing) line carries no transcript.
                continue
            ls = line.split('\t')

            tID, gID = ls[0], ls[10]
            tChrom, tStrand = ls[1], bioLibCG.switchStrandFormat(ls[2])

            # rstrip(',') drops the list terminator without eating a digit
            # when the terminator happens to be absent.
            # Starts are shifted by +1 while ends are used as given.
            exonStarts = [int(x) + 1 for x in ls[8].rstrip(',').split(',')]
            exonEnds = [int(x) for x in ls[9].rstrip(',').split(',')]

            seqList = [
                gf.getSequence(bioLibCG.makeTcc(tChrom, tStrand, eStart, eEnd))
                for eStart, eEnd in zip(exonStarts, exonEnds)
            ]
            mRNA = ''.join(seqList)

            # reverse direction if negative strand
            # NOTE(review): this reverses the string but does not complement
            # it; whether that is right depends on what gf.getSequence returns
            # for a negative-strand tcc -- confirm against GenomeFetch.
            if tStrand == '-1':
                mRNA = mRNA[::-1]

            fOut.write('> %s:%s:%s\n' % (tID, gID, len(mRNA)))
            fOut.write(mRNA + '\n\n')
def makeTranscriptome(tranFN, outFN):
        """Dump one spliced sequence record per transcript line of tranFN.

        Input lines are split on tabs; this function reads column 0
        (transcript id), 1 (chromosome), 2 (strand, passed through
        bioLibCG.switchStrandFormat), 8 / 9 (comma-separated exon starts /
        ends) and 10 (gene id).

        Exon sequences come from a GenomeFetch object for 'hg19' and are
        joined in listed order.  Each record written to outFN is a header
        '> tID:gID:length', the sequence, and an empty line.
        """
        p = bioLibCG.cgPrint()
        p.show = False
        gf = GenomeFetch.GenomeFetch('hg19')

        # 'with' closes both files on every exit path, including exceptions
        # raised while parsing a line or fetching a sequence.
        with open(outFN, 'w') as fOut, open(tranFN, 'r') as f:
                for line in f:
                        line = line.strip()
                        if not line:
                                # skip blank lines instead of failing on ls[1]
                                continue
                        ls = line.split('\t')

                        tChrom = ls[1]
                        tStrand = bioLibCG.switchStrandFormat(ls[2])
                        tID = ls[0]
                        gID = ls[10]

                        # Strip the trailing comma explicitly; slicing off the
                        # last character would truncate a number if the comma
                        # were missing.  Starts get +1, ends are kept as read.
                        starts = ls[8].rstrip(',').split(',')
                        ends = ls[9].rstrip(',').split(',')
                        exonPairs = zip([int(x) + 1 for x in starts],
                                        [int(x) for x in ends])

                        seqList = []
                        for eStart, eEnd in exonPairs:
                                tcc = bioLibCG.makeTcc(tChrom, tStrand, eStart, eEnd)
                                seqList.append(gf.getSequence(tcc))
                        mRNA = ''.join(seqList)

                        # reverse direction if negative strand
                        # NOTE(review): plain reversal, no complement; verify
                        # this matches what gf.getSequence returns for
                        # negative-strand coordinates.
                        if tStrand == '-1':
                                mRNA = mRNA[::-1]

                        fOut.write('> %s:%s:%s\n' % (tID, gID, len(mRNA)))
                        fOut.write(mRNA + '\n\n')
def checkMessy(tranFN):
        """Count transcripts in tranFN that lack a 5' and/or 3' UTR.

        tranFN is read as tab-separated text.  Columns used: 2 (strand, passed
        through bioLibCG.switchStrandFormat), 3 / 4 (transcript start / end),
        5 / 6 (coding start / end) and 15 (tested for the substring
        '_coding').  End columns are decremented by one before use.

        A UTR is treated as absent when the coding boundary on that side
        coincides with the transcript start or with one past the transcript
        end.  Which side is 5' and which is 3' is swapped for any strand
        other than '1'.

        Prints, space separated: total transcripts, transcripts without a
        5' UTR, transcripts without a 3' UTR, and the latter two counts
        restricted to transcripts flagged as coding.

        NOTE(review): makeContextWig reads the coding flag from column 13,
        this function from column 15 -- confirm which is intended.
        """
        p = bioLibCG.cgPrint()
        p.show = False  # debug output stays off

        total = 0
        no5 = 0
        no3 = 0
        no5Coding = 0
        no3Coding = 0

        # 'with' closes the input file; the original never closed it.
        with open(tranFN, 'r') as f:
                for line in f:
                        line = line.strip()
                        if not line:
                                continue  # tolerate blank lines
                        ls = line.split('\t')

                        strand = bioLibCG.switchStrandFormat(ls[2])
                        tStart, tEnd = int(ls[3]), int(ls[4]) - 1
                        cStart, cEnd = int(ls[5]), int(ls[6]) - 1
                        codingStatus = '_coding' in ls[15]

                        # The same two boundary tests serve both UTRs; only
                        # their 5'/3' assignment depends on the strand.
                        txEdges = (tStart, tEnd + 1)
                        lowSideEmpty = cStart in txEdges
                        highSideEmpty = (cEnd + 1) in txEdges
                        if strand == '1':
                                lacks5, lacks3 = lowSideEmpty, highSideEmpty
                        else:
                                lacks5, lacks3 = highSideEmpty, lowSideEmpty

                        if lacks5:
                                p.tell('5 is none')
                                no5 += 1
                                if codingStatus:
                                        no5Coding += 1

                        if lacks3:
                                p.tell('3 is none')
                                no3 += 1
                                if codingStatus:
                                        no3Coding += 1

                        total += 1

        # Single pre-formatted argument prints the same text as the former
        # 'print a, b, c, d, e' statement.
        print('%s %s %s %s %s' % (total, no5, no3, no5Coding, no3Coding))
# Example #4
# 0
def checkMessy(tranFN):
    """Tally how many transcripts in tranFN have an empty 5' or 3' UTR.

    Each tab-separated line supplies: column 2 = strand (converted with
    bioLibCG.switchStrandFormat), columns 3 / 4 = transcript start / end,
    columns 5 / 6 = coding start / end, column 15 = a field checked for the
    substring '_coding'.  Both end values have one subtracted before use.

    The UTR on a given side counts as missing when the coding boundary on
    that side equals the transcript start or one past the transcript end.
    For strand '1' the low-coordinate side is the 5' UTR; for any other
    strand the sides are swapped.

    Output (one printed line, space separated):
        total, missing-5', missing-3', missing-5' and coding,
        missing-3' and coding

    NOTE(review): the coding flag is taken from column 15 here but from
    column 13 in makeContextWig -- one of the two may be wrong.
    """
    p = bioLibCG.cgPrint()
    p.show = False  # keep p.tell quiet

    nTotal = nNo5 = nNo3 = nNo5Coding = nNo3Coding = 0

    # The context manager releases the file handle the original leaked.
    with open(tranFN, 'r') as f:
        for line in f:
            line = line.strip()
            if not line:
                continue  # a blank line holds no transcript
            ls = line.split('\t')

            strand = bioLibCG.switchStrandFormat(ls[2])
            tStart, tEnd = int(ls[3]), int(ls[4]) - 1
            cStart, cEnd = int(ls[5]), int(ls[6]) - 1
            codingStatus = '_coding' in ls[15]

            # Two boundary tests cover all four strand/UTR combinations.
            leftMissing = cStart == tStart or cStart == tEnd + 1
            rightMissing = cEnd + 1 == tStart or cEnd + 1 == tEnd + 1
            if strand == '1':
                missing5, missing3 = leftMissing, rightMissing
            else:
                missing5, missing3 = rightMissing, leftMissing

            if missing5:
                p.tell('5 is none')
                nNo5 += 1
                if codingStatus:
                    nNo5Coding += 1

            if missing3:
                p.tell('3 is none')
                nNo3 += 1
                if codingStatus:
                    nNo3Coding += 1

            nTotal += 1

    # Produces the same text as the former 'print a, b, c, d, e' statement.
    print('%s %s %s %s %s' % (nTotal, nNo5, nNo3, nNo5Coding, nNo3Coding))
def makeContextWig(tranFN, wigDir, chrom, strand, species = 'hg19'):
        """Label every position covered by transcripts on chrom/strand.

        tranFN is read as tab-separated text.  Columns used: 0 (transcript
        id), 1 (chromosome), 2 (strand, passed through
        bioLibCG.switchStrandFormat), 3 / 4 (transcript start / end), 5 / 6
        (coding start / end), 8 / 9 (comma-separated exon starts / ends) and
        13 (tested for the substring '_coding').  Every end value has one
        subtracted before use.  Lines whose chromosome or converted strand
        differ from the chrom / strand arguments are skipped.

        For each remaining transcript, positions are tagged 5UTR, EXON,
        INTRON or 3UTR, prefixed with 'C_' when the transcript is flagged
        coding and 'NC_' otherwise.  Tags from overlapping transcripts
        accumulate per position and are finally reduced to a comma-joined
        set of unique tags.  The resulting dict is handed to
        writeWigDictToWig together with chrom, strand, species, the name
        'context', wigDir and the default value 'INTER'.
        """
        p = bioLibCG.cgPrint()
        p.show = False  # debug output stays off for the whole run
        coord_id = {}
        debugSpot = 23631989  # position reported through p.tell when filled

        # 'with' closes the input file; the original never closed it.
        with open(tranFN, 'r') as f:
                for line in f:
                        line = line.strip()
                        if not line:
                                continue  # tolerate blank lines
                        ls = line.split('\t')

                        tChrom, tStrand = ls[1], bioLibCG.switchStrandFormat(ls[2])
                        if tChrom != chrom or tStrand != strand:
                                continue

                        tStart, tEnd = int(ls[3]), int(ls[4]) - 1
                        cStart, cEnd = int(ls[5]), int(ls[6]) - 1
                        # rstrip(',') removes the list terminator without
                        # truncating a number when the terminator is absent.
                        exonStarts = [int(x) for x in ls[8].rstrip(',').split(',')]
                        exonEnds = [int(x) - 1 for x in ls[9].rstrip(',').split(',')]
                        exonPairs = list(zip(exonStarts, exonEnds))
                        codingStatus = '_coding' in ls[13]
                        tID = ls[0]

                        # An intron is the gap between consecutive exons.
                        intronPairs = [(prev[1] + 1, nxt[0] - 1)
                                       for prev, nxt in zip(exonPairs, exonPairs[1:])]

                        #take care of messy UTRs and assign utr ranges
                        # The flank on a side is empty when the coding
                        # boundary equals the transcript start or one past
                        # the transcript end.
                        lowEmpty = cStart == tStart or cStart == tEnd + 1
                        highEmpty = cEnd + 1 == tStart or cEnd + 1 == tEnd + 1
                        if lowEmpty:
                                lowRange = ()
                        else:
                                lowRange = (tStart, cStart - 1)
                        if highEmpty:
                                highRange = ()
                        else:
                                highRange = (cEnd + 1, tEnd)

                        # For strand '1' the low-coordinate flank is the
                        # 5' UTR; otherwise the flanks swap roles.
                        if strand == '1':
                                empty5, range5 = lowEmpty, lowRange
                                empty3, range3 = highEmpty, highRange
                        else:
                                empty5, range5 = highEmpty, highRange
                                empty3, range3 = lowEmpty, lowRange
                        if empty5:
                                p.tell('5 is none')
                        if empty3:
                                p.tell('3 is none')

                        p.tell('ranges', range5, range3)
                        p.tell('intronRange', intronPairs)
                        utr5 = compareData.subtractTwoRanges([range5], intronPairs)
                        utr3 = compareData.subtractTwoRanges([range3], intronPairs)

                        p.tell('utr', utr5, utr3)

                        p.tell('exon before', exonPairs)
                        exonPairs = compareData.subtractTwoRanges(exonPairs, [range5])
                        exonPairs = compareData.subtractTwoRanges(exonPairs, [range3])
                        p.tell('exon after', exonPairs)

                        # One table-driven loop replaces four copy-pasted
                        # ones; the fill order (5UTR, exons, introns, 3UTR)
                        # and every message string are unchanged.
                        prefix = 'C_' if codingStatus else 'NC_'
                        regions = (
                                (utr5, 'filling utr5', '*** 5UTR', '5UTR '),
                                (exonPairs, 'filling exons', '*** exon', 'EXON '),
                                (intronPairs, 'filling introns', '*** INTRON', 'INTRON '),
                                (utr3, 'filling utr3', ' *** 3UTR', '3UTR '),
                        )
                        for pairs, fillMsg, debugMsg, suffix in regions:
                                tag = prefix + suffix
                                for pair in pairs:
                                        p.tell(fillMsg, pair[0], pair[1])
                                        for i in xrange(pair[0], pair[1] + 1):
                                                if i == debugSpot: p.tell(debugMsg, codingStatus, tID)
                                                coord_id[i] = coord_id.get(i, '') + tag

        #uniqify, stringify
        # Only values of existing keys are replaced, so iterating the dict
        # while assigning is safe.
        for i in coord_id:
                coord_id[i] = ','.join(set(coord_id[i].strip().split(' ')))

        #write wig to file
        writeWigDictToWig(coord_id, chrom, strand, species, 'context', wigDir, 'INTER')
# Example #6
# 0
def makeContextWig(tranFN, wigDir, chrom, strand, species='hg19'):
    """Build a per-position context annotation for one chromosome strand.

    Input lines of tranFN are split on tabs.  Columns read: 0 = transcript
    id, 1 = chromosome, 2 = strand (converted with
    bioLibCG.switchStrandFormat), 3 / 4 = transcript start / end, 5 / 6 =
    coding start / end, 8 / 9 = comma-separated exon starts / ends, 13 = a
    field checked for the substring '_coding'.  One is subtracted from every
    end value.  Transcripts on a different chromosome or strand than the
    chrom / strand arguments are ignored.

    Each covered position receives one tag per overlapping transcript
    region: '5UTR', 'EXON', 'INTRON' or '3UTR', prefixed 'C_' for coding and
    'NC_' for non-coding transcripts.  After all transcripts are processed
    the tags at each position are de-duplicated and comma-joined, and the
    dict is passed to writeWigDictToWig with chrom, strand, species, the
    name 'context', wigDir and the default value 'INTER'.
    """
    p = bioLibCG.cgPrint()
    p.show = False  # silence p.tell for the whole run
    coord_id = {}
    debugSpot = 23631989  # position reported through p.tell when filled

    def emptyOrRange(isEmpty, lo, hi):
        # () marks an absent UTR, otherwise an inclusive (lo, hi) range.
        if isEmpty:
            return ()
        return (lo, hi)

    # The context manager releases the file handle the original leaked.
    with open(tranFN, 'r') as f:
        for line in f:
            line = line.strip()
            if not line:
                continue  # a blank line holds no transcript
            ls = line.split('\t')

            tChrom, tStrand = ls[1], bioLibCG.switchStrandFormat(ls[2])
            if tChrom != chrom or tStrand != strand:
                continue

            tID = ls[0]
            codingStatus = '_coding' in ls[13]
            tStart, tEnd = int(ls[3]), int(ls[4]) - 1
            cStart, cEnd = int(ls[5]), int(ls[6]) - 1
            # Strip the trailing comma explicitly; slicing off the last
            # character would truncate a number if the comma were missing.
            exonStarts = [int(x) for x in ls[8].rstrip(',').split(',')]
            exonEnds = [int(x) - 1 for x in ls[9].rstrip(',').split(',')]
            exonPairs = list(zip(exonStarts, exonEnds))

            # Introns span the gaps between neighbouring exons.
            intronPairs = [
                (exonPairs[k - 1][1] + 1, exonPairs[k][0] - 1)
                for k in range(1, len(exonPairs))
            ]

            #take care of messy UTRs and assign utr ranges
            # A flank is empty when the coding boundary on that side sits at
            # the transcript start or one past the transcript end.
            leftEmpty = cStart == tStart or cStart == tEnd + 1
            rightEmpty = cEnd + 1 == tStart or cEnd + 1 == tEnd + 1
            leftRange = emptyOrRange(leftEmpty, tStart, cStart - 1)
            rightRange = emptyOrRange(rightEmpty, cEnd + 1, tEnd)

            # Strand '1': left flank is 5', right flank is 3'; any other
            # strand swaps them.
            if strand == '1':
                flanks = ((leftEmpty, leftRange), (rightEmpty, rightRange))
            else:
                flanks = ((rightEmpty, rightRange), (leftEmpty, leftRange))
            (no5, range5), (no3, range3) = flanks
            if no5:
                p.tell('5 is none')
            if no3:
                p.tell('3 is none')

            p.tell('ranges', range5, range3)
            p.tell('intronRange', intronPairs)
            utr5 = compareData.subtractTwoRanges([range5], intronPairs)
            utr3 = compareData.subtractTwoRanges([range3], intronPairs)

            p.tell('utr', utr5, utr3)

            p.tell('exon before', exonPairs)
            exonPairs = compareData.subtractTwoRanges(exonPairs, [range5])
            exonPairs = compareData.subtractTwoRanges(exonPairs, [range3])
            p.tell('exon after', exonPairs)

            # Single loop over a region table instead of four near-identical
            # loops; order and all message strings match the original.
            codingPrefix = 'C_' if codingStatus else 'NC_'
            fillPlan = (
                (utr5, 'filling utr5', '*** 5UTR', '5UTR '),
                (exonPairs, 'filling exons', '*** exon', 'EXON '),
                (intronPairs, 'filling introns', '*** INTRON', 'INTRON '),
                (utr3, 'filling utr3', ' *** 3UTR', '3UTR '),
            )
            for pairs, fillMsg, debugMsg, suffix in fillPlan:
                tag = codingPrefix + suffix
                for pair in pairs:
                    p.tell(fillMsg, pair[0], pair[1])
                    for i in xrange(pair[0], pair[1] + 1):
                        if i == debugSpot:
                            p.tell(debugMsg, codingStatus, tID)
                        coord_id[i] = coord_id.get(i, '') + tag

    #uniqify, stringify
    # Values of existing keys are replaced in place; the key set does not
    # change, so iterating the dict directly is safe.
    for i in coord_id:
        coord_id[i] = ','.join(set(coord_id[i].strip().split(' ')))

    #write wig to file
    writeWigDictToWig(coord_id, chrom, strand, species, 'context', wigDir,
                      'INTER')