Esempio n. 1
0
def makeTypeWig(tranFN, wigDir, chrom, strand, species):
    '''Write a wig of "transcriptID:type" labels for one chromosome/strand.

    Reads the tab-separated transcript file tranFN and keeps the rows whose
    chromosome (column 2) and converted strand (column 3) equal chrom and
    strand.  Every coordinate from column 4 through column 5 minus one
    (inclusive) is labelled "<column 1>:<column 14>".  Labels that share a
    coordinate are de-duplicated and comma-joined, and the resulting dict is
    passed to writeWigDictToWig under the name 'tType'.

    NOTE: labels are accumulated space-separated, so an ID or type that
    itself contains a space is split into several labels.
    '''
    coord_id = {}
    # The context manager guarantees the input is closed, even when a row
    # fails to parse.
    with open(tranFN, 'r') as f:
        for line in f:
            ls = line.strip().split('\t')

            tChrom, tStrand = ls[1], bioLibCG.switchStrandFormat(ls[2])
            if tChrom != chrom or tStrand != strand:
                continue

            # The end column is decremented by one (flagged by the original
            # author as a 0-base conversion).
            tStart, tEnd = int(ls[3]), int(ls[4]) - 1
            label = '%s:%s ' % (ls[0], ls[13])
            for i in xrange(tStart, tEnd + 1):
                coord_id[i] = coord_id.get(i, '') + label

    # Collapse each coordinate's label run into unique, comma-joined labels.
    for i, ids in coord_id.iteritems():
        coord_id[i] = ','.join(set(ids.strip().split(' ')))

    writeWigDictToWig(coord_id, chrom, strand, species, 'tType', wigDir,
                      'None')
Esempio n. 2
0
def makeTranscriptome(tranFN, outFN):
    '''Write the spliced sequence of every transcript in tranFN to outFN.

    For each tab-separated row the exon starts (column 9, each incremented by
    one) and exon ends (column 10) are paired up.  Each exon's sequence is
    fetched from the 'hg19' GenomeFetch and the pieces are concatenated in
    file order.  When the converted strand is '-1' the joined string is
    reversed (this function itself does no complementing).  Each record is
    written as "> <column 1>:<column 11>:<length>", then the sequence, then a
    blank line.
    '''
    # Kept as-is: a silenced cgPrint instance that is not used further below.
    p = bioLibCG.cgPrint()
    p.show = False
    gf = GenomeFetch.GenomeFetch('hg19')

    # Nested context managers close both files even if a row raises.
    with open(outFN, 'w') as fOut:
        with open(tranFN, 'r') as f:
            for line in f:
                ls = line.strip().split('\t')
                tChrom, tStrand = ls[1], bioLibCG.switchStrandFormat(ls[2])
                # [:-1] drops the final character of each list before the
                # split (presumably a trailing comma).
                exonStarts = [int(x) + 1 for x in ls[8][:-1].split(',')]
                exonEnds = [int(x) for x in ls[9][:-1].split(',')]
                tID, gID = ls[0], ls[10]

                seqList = [
                    gf.getSequence(
                        bioLibCG.makeTcc(tChrom, tStrand, eStart, eEnd))
                    for eStart, eEnd in zip(exonStarts, exonEnds)
                ]
                mRNA = ''.join(seqList)

                # reverse direction if negative strand
                if tStrand == '-1':
                    mRNA = mRNA[::-1]

                fOut.write('> %s:%s:%s\n' % (tID, gID, len(mRNA)))
                fOut.write(mRNA + '\n\n')
def makeTranscriptome(tranFN, outFN):
    '''Dump one spliced sequence record per transcript row of tranFN.

    Exon starts come from column 9 (plus one each) and exon ends from
    column 10; both lists lose their last character before being split on
    commas.  The exon sequences are fetched from the 'hg19' GenomeFetch,
    joined in file order and, for converted strand '-1', reversed (no
    complementing happens in this function).  Output records look like
    "> <column 1>:<column 11>:<length>" followed by the sequence and an
    empty line.
    '''
    # Kept as-is: a silenced cgPrint instance that is not used further below.
    p = bioLibCG.cgPrint()
    p.show = False
    gf = GenomeFetch.GenomeFetch('hg19')

    # Both handles are released by the context managers, also on errors.
    with open(outFN, 'w') as fOut:
        with open(tranFN, 'r') as f:
            for line in f:
                ls = line.strip().split('\t')
                tChrom, tStrand = ls[1], bioLibCG.switchStrandFormat(ls[2])
                exonStarts = [int(x) + 1 for x in ls[8][:-1].split(',')]
                exonEnds = [int(x) for x in ls[9][:-1].split(',')]
                tID, gID = ls[0], ls[10]

                seqList = []
                for eStart, eEnd in zip(exonStarts, exonEnds):
                    tcc = bioLibCG.makeTcc(tChrom, tStrand, eStart, eEnd)
                    seqList.append(gf.getSequence(tcc))
                mRNA = ''.join(seqList)

                # reverse direction if negative strand
                if tStrand == '-1':
                    mRNA = mRNA[::-1]

                fOut.write('> %s:%s:%s\n' % (tID, gID, len(mRNA)))
                fOut.write(mRNA + '\n\n')
Esempio n. 4
0
def convertEnsemblBiomart(fN, outFN):
    '''Rewrite each row of fN as a 16-column, single-block transcript row.

    Output columns, in order: input column 1, 'chr' + column 2, the converted
    strand of column 3, columns 4 and 5, column 5 twice more (the two
    coding-position columns are overwritten with the end), the block count 1,
    column 4 + ',', column 5 + ',', column 8, 'none', 'none',
    column 9 + '_noncoding', 'None' and 'noncoding_noncoding'.
    '''
    # Context managers close both files even if a malformed row raises.
    with open(outFN, 'w') as fOut:
        with open(fN, 'r') as f:
            for line in f:
                ls = line.strip().split('\t')

                chrom = 'chr' + ls[1]
                strand = bioLibCG.switchStrandFormat(ls[2])
                tStart, tEnd = ls[3], ls[4]

                fields = [
                    ls[0], chrom, strand, tStart, tEnd,
                    tEnd, tEnd,                    # coding start / end
                    1,                             # number of blocks
                    tStart + ',', tEnd + ',',      # exon starts / ends
                    ls[7],
                    'none', 'none',
                    ls[8] + '_noncoding',          # transcript type
                    'None',
                    'noncoding_noncoding',         # gene type
                ]
                fOut.write('\t'.join([str(x) for x in fields]) + '\n')
Esempio n. 5
0
def get3UTRFromTranscriptome(tranFN, outFN, wholeGene=False):
    '''Write one tcc per transcript row: its 3' UTR, or the whole transcript.

    Transcript bounds are column 4 and column 5 minus one; coding bounds are
    column 6 and column 7 minus one.  With wholeGene set, the transcript
    bounds are written.  Otherwise the 3' UTR is (coding end + 1, transcript
    end) when the converted strand is '1', and (transcript start, coding
    start - 1) for any other strand.  No check is made that the resulting
    range is non-empty.
    '''
    # Context managers close both files even if a row fails to parse.
    with open(outFN, 'w') as fOut:
        with open(tranFN, 'r') as f:
            for line in f:
                ls = line.strip().split('\t')
                tChrom, tStrand = ls[1], bioLibCG.switchStrandFormat(ls[2])
                tStart, tEnd = int(ls[3]), int(ls[4]) - 1
                cStart, cEnd = int(ls[5]), int(ls[6]) - 1

                if wholeGene:
                    lo, hi = tStart, tEnd
                elif tStrand == '1':
                    lo, hi = cEnd + 1, tEnd
                else:
                    lo, hi = tStart, cStart - 1

                utrTcc = bioLibCG.makeTcc(tChrom, tStrand, lo, hi)
                fOut.write('%s\n' % utrTcc)
Esempio n. 6
0
def makeTypeWig(tranFN, wigDir, chrom, strand, species):
    '''Build the 'tType' wig: per-coordinate "transcriptID:type" labels.

    Only rows of tranFN on the requested chrom (column 2) and strand
    (column 3 after switchStrandFormat) are used.  Each such row labels the
    coordinates column 4 .. column 5 - 1 (inclusive) with
    "<column 1>:<column 14>".  Per coordinate the labels are made unique and
    comma-joined before being written through writeWigDictToWig.

    NOTE: labels are collected space-separated, so a label containing a
    space ends up split into several entries.
    '''
    coord_id = {}
    # Use a context manager so the transcript file is always closed.
    with open(tranFN, 'r') as f:
        for line in f:
            ls = line.strip().split('\t')

            tChrom, tStrand = ls[1], bioLibCG.switchStrandFormat(ls[2])
            if tChrom != chrom or tStrand != strand:
                continue

            # End column minus one (the original author marks this as a
            # 0-base conversion).
            tStart, tEnd = int(ls[3]), int(ls[4]) - 1
            label = '%s:%s ' % (ls[0], ls[13])
            for i in xrange(tStart, tEnd + 1):
                coord_id[i] = coord_id.get(i, '') + label

    # unique, string
    for i, ids in coord_id.iteritems():
        coord_id[i] = ','.join(set(ids.strip().split(' ')))

    # write wig to file
    writeWigDictToWig(coord_id, chrom, strand, species, 'tType', wigDir,
                      'None')
Esempio n. 7
0
def makeGeneWig(tranFN, wigDir, chrom, strand, species='hg19'):
    '''Write a wig of gene IDs covering each coordinate of one chrom/strand.

    Rows of tranFN are kept when column 2 equals chrom and the converted
    column 3 equals strand.  Each kept row tags the coordinates
    column 4 .. column 5 - 1 (inclusive) with its gene ID (column 11).  Per
    coordinate the IDs are de-duplicated, empty IDs dropped, and the rest
    comma-joined; the dict is then written via writeWigDictToWig under the
    name 'ALL'.

    species defaults to 'hg19', the value that used to be hard-coded, so
    existing four-argument calls behave as before.

    NOTE: '$' separates IDs during accumulation (gene IDs may contain
    spaces), so an ID containing '$' would be split.
    '''
    coord_id = {}
    # The context manager makes sure the transcript file is closed.
    with open(tranFN, 'r') as f:
        for line in f:
            ls = line.strip().split('\t')

            tChrom, tStrand = ls[1], bioLibCG.switchStrandFormat(ls[2])
            if tChrom != chrom or tStrand != strand:
                continue

            tagged = '%s$' % ls[10]
            tStart, tEnd = int(ls[3]), int(ls[4]) - 1
            for i in xrange(tStart, tEnd + 1):
                coord_id[i] = coord_id.get(i, '') + tagged

    # unique, string; the trailing '$' yields an empty piece that is dropped
    for i, ids in coord_id.iteritems():
        coord_id[i] = ','.join(
            [x for x in set(ids.strip().split('$')) if x != ''])

    # write wig to file
    writeWigDictToWig(coord_id, chrom, strand, species, 'ALL', wigDir, 'None')
Esempio n. 8
0
def convertEnsemblBiomart(fN, outFN):
    '''Convert every row of fN into a 16-column noncoding transcript row.

    The output row holds: column 1, 'chr' + column 2, the converted strand
    (column 3), start (column 4), end (column 5), the end repeated for both
    coding-position columns, a block count of 1, "start," and "end," as the
    exon lists, column 8, 'none', 'none', column 9 + '_noncoding', 'None'
    and 'noncoding_noncoding'.
    '''
    # Both files are closed by the context managers, also when a row raises.
    with open(outFN, 'w') as fOut:
        with open(fN, 'r') as f:
            for line in f:
                ls = line.strip().split('\t')

                chrom = 'chr' + ls[1]
                strand = bioLibCG.switchStrandFormat(ls[2])
                start, end = ls[3], ls[4]
                numBlocks = 1
                tType = ls[8] + '_noncoding'

                row = [ls[0], chrom, strand, start, end, end, end, numBlocks,
                       start + ',', end + ',', ls[7], 'none', 'none', tType,
                       'None', 'noncoding_noncoding']
                fOut.write('\t'.join([str(x) for x in row]) + '\n')
Esempio n. 9
0
def getTccs(fN):
    '''Print one tcc per row of the tab-separated file fN.

    Chromosome, start and end are taken from columns 1-3 and the strand from
    column 6 (converted with switchStrandFormat); start and end are passed
    to makeTcc as the raw strings read from the file.
    '''
    # Context manager closes the input once the loop ends or a row raises.
    with open(fN, 'r') as f:
        for line in f:
            ls = line.strip().split('\t')
            chrom, start, end = ls[0], ls[1], ls[2]
            strand = bioLibCG.switchStrandFormat(ls[5])

            # Single parenthesised argument: same output as the print statement.
            print(bioLibCG.makeTcc(chrom, strand, start, end))
Esempio n. 10
0
def getTccs(fN):
    '''Print the tcc of every row in fN.

    Each tab-separated row supplies chromosome (column 1), start (column 2),
    end (column 3) and strand (column 6, run through switchStrandFormat).
    Start and end are handed to makeTcc unconverted, as strings.
    '''
    # The with-block guarantees the file handle is released.
    with open(fN, 'r') as f:
        for line in f:
            ls = line.strip().split('\t')
            strand = bioLibCG.switchStrandFormat(ls[5])
            tcc = bioLibCG.makeTcc(ls[0], strand, ls[1], ls[2])
            print(tcc)
Esempio n. 11
0
def collectIDs2(fN, fN2, fN3):
    '''Used for getting the # repeat reads on target results

    Steps:
      1. Collect every comma-separated ID found in column 10 of fN.
      2. From fN2, for rows whose column 19 is not "F" and whose column 1 is
         one of those IDs, collect column 12; the collected values are
         turned into a chrom -> strand -> coordinates map by getHitMap.
      3. From fN3 (read name, strand, chrom, start in columns 1-4), keep the
         read names having a mapped coordinate within start-3 .. start+2.
      4. Re-read fN3 and print "<read>\\t<count>" for each kept read name,
         where count is the number of fN3 rows carrying that name.
    '''
    idSet = set()
    with open(fN, 'r') as f:
        for line in f:
            ls = line.strip().split('\t')
            idSet.update(ls[9].split(','))

    idSetDeg = set()
    with open(fN2, 'r') as f:
        for line in f:
            ls = line.strip().split('\t')

            if ls[18] == "F":
                continue

            if ls[0] in idSet:
                idSetDeg.add(ls[11])

    chrom_strand_coord = getHitMap(list(idSetDeg))

    readNames = set()
    with open(fN3, 'r') as f:
        for line in f:
            ls = line.strip().split('\t')
            chrom, start = ls[2], int(ls[3])
            strand = bioLibCG.switchStrandFormat(ls[1])

            # A chromosome/strand missing from the map cannot match at any
            # offset, so look it up once per row.
            try:
                hits = chrom_strand_coord[chrom][strand]
            except KeyError:
                continue

            # NOTE(review): the window is start-3 .. start+2 (asymmetric);
            # confirm this is intended.
            for i in range(start - 3, start + 3):
                if i in hits:
                    readNames.add(ls[0])
                    break

    # now go back through and count the times the read appears
    readName_count = {}
    with open(fN3, 'r') as f:
        for line in f:
            name = line.strip().split('\t')[0]
            if name in readNames:
                readName_count[name] = readName_count.get(name, 0) + 1

    for read, count in readName_count.iteritems():
        print('%s\t%s' % (read, count))
Esempio n. 12
0
def collectIDs2(fN, fN2, fN3):
    '''Used for getting the # repeat reads on target results

    fN:  column 10 holds comma-separated IDs; all of them are collected.
    fN2: rows with column 19 != "F" whose column 1 is a collected ID
         contribute their column 12 to the input of getHitMap, which
         returns a chrom -> strand -> coordinates map.
    fN3: read name, strand, chrom and start in columns 1-4.  A read name is
         kept when any coordinate in start-3 .. start+2 is in the map; in a
         second pass the rows per kept read name are counted and printed as
         "<read>\\t<count>".
    '''
    idSet = set()
    with open(fN, 'r') as f:
        for line in f:
            ls = line.strip().split('\t')
            for oneID in ls[9].split(','):
                idSet.add(oneID)

    idSetDeg = set()
    with open(fN2, 'r') as f:
        for line in f:
            ls = line.strip().split('\t')
            if ls[18] != "F" and ls[0] in idSet:
                idSetDeg.add(ls[11])

    chrom_strand_coord = getHitMap(list(idSetDeg))

    readNames = set()
    with open(fN3, 'r') as f:
        for line in f:
            ls = line.strip().split('\t')
            chrom, start = ls[2], int(ls[3])
            strand = bioLibCG.switchStrandFormat(ls[1])

            # Unknown chromosome/strand: nothing can match, skip the row.
            try:
                hits = chrom_strand_coord[chrom][strand]
            except KeyError:
                continue

            # NOTE(review): range(start - 3, start + 3) stops at start+2;
            # confirm the asymmetric window is intended.
            for i in range(start - 3, start + 3):
                if i in hits:
                    readNames.add(ls[0])
                    break

    # now go back through and count the times the read appears
    readName_count = {}
    with open(fN3, 'r') as f:
        for line in f:
            name = line.strip().split('\t')[0]
            if name in readNames:
                readName_count[name] = readName_count.get(name, 0) + 1

    for read, count in readName_count.iteritems():
        print('%s\t%s' % (read, count))
Esempio n. 13
0
def getCoords(fN, rn = None, tn = None):
    '''Print an eight-column, tab-separated line for every loaded tcc.

    The 'tcc' attribute is loaded from fN through a cgNexusFlat.Nexus (rn
    and tn are forwarded to load unchanged).  Each tcc is split into chrom,
    strand, start and end, and printed as: chrom, start, end, 'None', '0',
    the strand after switchStrandFormat, start, end.
    '''
    oNX = cgNexusFlat.Nexus(fN, cgOriginRNAFlat.OriginRNA)
    oNX.load(['tcc'], [rn, tn])

    for key in oNX.tcc:
        chrom, rawStrand, start, end = bioLibCG.tccSplit(oNX.tcc[key])
        strand = bioLibCG.switchStrandFormat(rawStrand)

        # name and score are constants; the thick range is the whole interval
        columns = (chrom, start, end, 'None', '0', strand, start, end)
        print('\t'.join(map(str, columns)))
Esempio n. 14
0
def getCoords(fN, rn=None, tn=None):
    '''Print every tcc stored in fN as an eight-field tab-separated line.

    A cgNexusFlat.Nexus over fN loads the 'tcc' attribute (rn and tn are
    passed straight to load).  For each entry the line contains chrom,
    start, end, the literal name 'None', the literal score '0', the strand
    converted by switchStrandFormat, and start/end once more as the thick
    range.
    '''
    oNX = cgNexusFlat.Nexus(fN, cgOriginRNAFlat.OriginRNA)
    oNX.load(['tcc'], [rn, tn])

    for entry in oNX.tcc:
        chrom, strand, start, end = bioLibCG.tccSplit(oNX.tcc[entry])
        strand = bioLibCG.switchStrandFormat(strand)

        row = [chrom, start, end, 'None', '0', strand]
        row.extend([start, end])  # thickStart, thickEnd
        print('\t'.join([str(value) for value in row]))
Esempio n. 15
0
def get20mers(aluFN):
    '''Count the length-20 frames of every region listed in aluFN.

    Column 1 of each tab-separated row is "chrom:start-end" and column 3 the
    strand (converted with switchStrandFormat).  The region's sequence is
    fetched from the 'hg19' GenomeFetch and cut by
    bioLibCG.returnFrames(seq, 20); each returned frame is tallied.  Prints
    "<frame>\\t<count>" for every distinct frame.
    '''
    gf = GenomeFetch.GenomeFetch('hg19')
    seq_count = {}
    # Context manager closes the input even if a fetch or parse fails.
    with open(aluFN, 'r') as f:
        for line in f:
            ls = line.strip().split('\t')

            # Split "chrom:start-end" once instead of three times.
            coordParts = ls[0].split(':')
            chrom = coordParts[0]
            span = coordParts[1].split('-')
            start, end = span[0], span[1]

            strand = bioLibCG.switchStrandFormat(ls[2])
            tcc = bioLibCG.makeTcc(chrom, strand, start, end)
            frames = bioLibCG.returnFrames(gf.getSequence(tcc), 20)

            # returnFrames can hand back the int 1 instead of frames
            # (presumably when no frame fits - TODO confirm); skip those.
            if frames == 1:
                continue
            for smallSeq in frames:
                seq_count[smallSeq] = seq_count.get(smallSeq, 0) + 1

    for seq, count in seq_count.items():
        print('%s\t%s' % (seq, count))
Esempio n. 16
0
def get20mers(aluFN):
    '''Tally all 20-long frames over the regions in aluFN and print them.

    Each row provides "chrom:start-end" in column 1 and a strand in
    column 3.  The sequence for that tcc comes from the 'hg19' GenomeFetch;
    bioLibCG.returnFrames(seq, 20) supplies the frames, which are counted
    across all rows and printed as "<frame>\\t<count>".
    '''
    gf = GenomeFetch.GenomeFetch('hg19')
    seq_count = {}
    # The with-block releases the file handle on every exit path.
    with open(aluFN, 'r') as f:
        for line in f:
            ls = line.strip().split('\t')

            # "chrom:start-end" -> chrom, start, end (split only once)
            pieces = ls[0].split(':')
            chrom = pieces[0]
            bounds = pieces[1].split('-')
            start, end = bounds[0], bounds[1]

            strand = bioLibCG.switchStrandFormat(ls[2])
            seq = gf.getSequence(bioLibCG.makeTcc(chrom, strand, start, end))
            frames = bioLibCG.returnFrames(seq, 20)

            # The int 1 is returnFrames' non-list result (presumably "no
            # frames" - TODO confirm); such regions are skipped.
            if frames == 1:
                continue
            for smallSeq in frames:
                seq_count[smallSeq] = seq_count.get(smallSeq, 0) + 1

    for seq, count in seq_count.items():
        print('%s\t%s' % (seq, count))
Esempio n. 17
0
def makeTranscriptWig(tranFN, wigDir, chrom, strand, species='hg19'):
    '''Write a wig of transcript IDs covering each coordinate.

    Rows of tranFN whose column 2 equals chrom and whose converted column 3
    equals strand tag the coordinates column 4 .. column 5 - 1 (inclusive)
    with their transcript ID (column 1).  IDs per coordinate are
    de-duplicated and comma-joined, then written through writeWigDictToWig
    under the name 'transcript'.

    NOTE: IDs are accumulated space-separated, so an ID containing a space
    would be split.
    '''
    coord_id = {}
    # Context manager: the transcript file is closed on every exit path.
    with open(tranFN, 'r') as f:
        for line in f:
            ls = line.strip().split('\t')

            tChrom, tStrand = ls[1], bioLibCG.switchStrandFormat(ls[2])
            if tChrom != chrom or tStrand != strand:
                continue

            tagged = '%s ' % ls[0]
            tStart, tEnd = int(ls[3]), int(ls[4]) - 1
            for i in xrange(tStart, tEnd + 1):
                coord_id[i] = coord_id.get(i, '') + tagged

    # unique, string
    for i, ids in coord_id.iteritems():
        coord_id[i] = ','.join(set(ids.strip().split(' ')))

    # write wig to file
    writeWigDictToWig(coord_id, chrom, strand, species, 'transcript', wigDir,
                      'None')
def updateMultipleTccs(oDir, mappedFN):
    '''Attach every mapped location to its OriginRNA and commit the result.

    mappedFN (bowtie output, per the original comment) is tab-separated:
    integer ID, strand, chromosome, first coordinate and the read text in
    columns 1-5.  Each row yields a tcc spanning first coordinate ..
    first coordinate + len(column 5) - 1, grouped per ID.  Every object
    loaded from oDir then gets its ID's tcc list as .tccs and the collection
    is committed.

    Raises KeyError when a loaded ID has no row in mappedFN.
    '''
    # parse bowtie
    oID_tccs = {}
    with open(mappedFN, 'r') as f:
        for line in f:
            ls = line.strip().split('\t')
            oID = int(ls[0])
            strand = bioLibCG.switchStrandFormat(ls[1])
            chrom, firstCoord = ls[2], int(ls[3])
            secondCoord = firstCoord + len(ls[4]) - 1
            tcc = bioLibCG.makeTcc(chrom, strand, firstCoord, secondCoord)

            oID_tccs.setdefault(oID, []).append(tcc)

    oDC = cgDB.dataController(oDir, cgOriginRNA.OriginRNA)
    id_oRNA = oDC.load()

    for oID, oRNA in id_oRNA.items():
        oRNA.tccs = oID_tccs[oID]

    oDC.commit(id_oRNA)
Esempio n. 19
0
def updateMultipleTccs(oDir, mappedFN):
    '''Store the list of mapped tccs on each OriginRNA found in oDir.

    Each tab-separated row of mappedFN (bowtie output, per the original
    comment) gives an integer ID, strand, chromosome, first coordinate and
    read text.  The row's tcc ends at first coordinate + len(read text) - 1.
    All tccs of an ID are gathered in file order, assigned to the .tccs
    attribute of the matching loaded object, and everything is committed.

    Raises KeyError when a loaded ID never occurs in mappedFN.
    '''
    # parse bowtie
    oID_tccs = {}
    with open(mappedFN, 'r') as f:
        for line in f:
            ls = line.strip().split('\t')
            oID = int(ls[0])
            strand = bioLibCG.switchStrandFormat(ls[1])
            chrom = ls[2]
            firstCoord = int(ls[3])
            secondCoord = firstCoord + len(ls[4]) - 1

            oID_tccs.setdefault(oID, []).append(
                bioLibCG.makeTcc(chrom, strand, firstCoord, secondCoord))

    oDC = cgDB.dataController(oDir, cgOriginRNA.OriginRNA)
    id_oRNA = oDC.load()

    for oID, oRNA in id_oRNA.items():
        oRNA.tccs = oID_tccs[oID]

    oDC.commit(id_oRNA)
Esempio n. 20
0
def makeGeneWig(tranFN, wigDir, chrom, strand, species='hg19'):
    '''Build the 'ALL' wig: gene IDs present at each coordinate.

    For the rows of tranFN on the given chrom (column 2) and strand
    (column 3 after switchStrandFormat), the gene ID in column 11 is attached
    to every coordinate from column 4 through column 5 - 1.  IDs are made
    unique per coordinate, empty ones discarded, and the rest comma-joined
    before writeWigDictToWig is called.

    species defaults to 'hg19', which was previously hard-coded, so callers
    passing four arguments are unaffected.

    NOTE: '$' is the internal separator (gene IDs may contain spaces); an
    ID containing '$' would be split.
    '''
    coord_id = {}
    # Context manager: the transcript file is closed on every exit path.
    with open(tranFN, 'r') as f:
        for line in f:
            ls = line.strip().split('\t')

            tChrom, tStrand = ls[1], bioLibCG.switchStrandFormat(ls[2])
            if tChrom != chrom or tStrand != strand:
                continue

            gID = ls[10]
            tStart, tEnd = int(ls[3]), int(ls[4]) - 1
            for i in xrange(tStart, tEnd + 1):
                coord_id[i] = coord_id.get(i, '') + '%s$' % gID

    # unique, string; the empty piece after the last '$' is filtered out
    for i, ids in coord_id.iteritems():
        uniqueIDs = set(ids.strip().split('$'))
        coord_id[i] = ','.join([x for x in uniqueIDs if x != ''])

    # write wig to file
    writeWigDictToWig(coord_id, chrom, strand, species, 'ALL', wigDir, 'None')
Esempio n. 21
0
def makeTranscriptWig(tranFN, wigDir, chrom, strand, species='hg19'):
    '''Build the 'transcript' wig: transcript IDs present at each coordinate.

    Only rows of tranFN matching chrom (column 2) and strand (column 3 after
    switchStrandFormat) contribute.  A row's ID (column 1) is attached to the
    coordinates column 4 .. column 5 - 1, inclusive.  Per coordinate the IDs
    are de-duplicated and comma-joined, and the dict is written with
    writeWigDictToWig.

    NOTE: a space separates IDs while accumulating, so an ID with an
    embedded space is split.
    '''
    coord_id = {}
    # The with-block guarantees the transcript file gets closed.
    with open(tranFN, 'r') as f:
        for line in f:
            ls = line.strip().split('\t')

            tChrom, tStrand = ls[1], bioLibCG.switchStrandFormat(ls[2])
            if tChrom != chrom or tStrand != strand:
                continue

            tID = ls[0]
            tStart, tEnd = int(ls[3]), int(ls[4]) - 1
            for i in xrange(tStart, tEnd + 1):
                coord_id[i] = coord_id.get(i, '') + '%s ' % tID

    # unique, string
    for i, ids in coord_id.iteritems():
        coord_id[i] = ','.join(set(ids.strip().split(' ')))

    # write wig to file
    writeWigDictToWig(coord_id, chrom, strand, species, 'transcript', wigDir,
                      'None')
Esempio n. 22
0
def updateTccAndSNR(mFN, alignmentFN):
    '''Overwrite tcc and snrSS of the Nexus in mFN from an alignment file.

    The n-th row of alignmentFN (0-based) updates entry n: its tcc is built
    from the chromosome (column 3), the converted strand (column 2), a start
    of column 4 + 1 and an end of start + len(column 5); its snrSS is set to
    10.0.  The Nexus is saved after all rows are processed.
    '''
    mNX = cgNexusFlat.Nexus(mFN, cgOriginRNAFlat.OriginRNA)
    mNX.load(['tcc', 'snrSS'])

    # Context manager closes the alignment file; enumerate supplies the row
    # index that is used as the entry ID.
    with open(alignmentFN, 'r') as f:
        for i, line in enumerate(f):
            ls = line.strip().split('\t')
            chrom = ls[2]
            strand = bioLibCG.switchStrandFormat(ls[1])
            start = int(ls[3]) + 1  # 1BASE conversion
            # NOTE(review): end = start + length lies one past an inclusive
            # end (updateMultipleTccs subtracts 1) - confirm which is meant.
            end = start + len(ls[4])

            mNX.tcc[i] = bioLibCG.makeTcc(chrom, strand, start, end)
            mNX.snrSS[i] = 10.0

    mNX.save()
Esempio n. 23
0
def checkMessy(tranFN):
    '''Count transcripts whose coding bounds leave no 5' or 3' UTR.

    Transcript bounds are column 4 and column 5 - 1; coding bounds are
    column 6 and column 7 - 1.  A UTR counts as missing when the coding
    boundary on that side equals the transcript start or the transcript
    end + 1.  For converted strand '1' the 5' side is checked with the
    coding start and the 3' side with the coding end + 1; any other strand
    swaps the two.

    Prints five space-separated numbers: rows read, rows missing the 5' UTR,
    rows missing the 3' UTR, and the latter two restricted to rows whose
    column 16 contains '_coding'.

    The exon columns are no longer parsed: the exon/intron pairs and UTR
    ranges derived from them were computed but never used.
    '''
    p = bioLibCG.cgPrint()
    # debug output stays silenced
    p.show = False

    total = no5 = no3 = no5Coding = no3Coding = 0
    # Context manager closes the transcript file on every exit path.
    with open(tranFN, 'r') as f:
        for line in f:
            ls = line.strip().split('\t')
            strand = bioLibCG.switchStrandFormat(ls[2])
            tStart, tEnd = int(ls[3]), int(ls[4]) - 1
            cStart, cEnd = int(ls[5]), int(ls[6]) - 1
            codingStatus = '_coding' in ls[15]

            # A coding boundary sitting on either transcript edge means
            # there is no UTR on that side.
            edges = (tStart, tEnd + 1)
            startFlush = cStart in edges
            endFlush = (cEnd + 1) in edges
            if strand == '1':
                missing5, missing3 = startFlush, endFlush
            else:
                missing5, missing3 = endFlush, startFlush

            if missing5:
                p.tell('5 is none')
                no5 += 1
                if codingStatus:
                    no5Coding += 1

            if missing3:
                p.tell('3 is none')
                no3 += 1
                if codingStatus:
                    no3Coding += 1

            total += 1

    print('%s %s %s %s %s' % (total, no5, no3, no5Coding, no3Coding))
Esempio n. 24
0
def createCollapsedGeneSets(tranFN, outFN, acceptableTypes = 'EXON', onlyCoding = True):
    '''get areas occupied by all transcripts in a gene

    For every transcript row of tranFN the exon pairs, intron pairs and 5'/3'
    UTR ranges are derived; the pairs whose type ('EXON', 'INTRON', '5UTR',
    '3UTR') is listed in the comma-separated acceptableTypes are added to an
    IntervalSet kept per gene name (column 11).  With onlyCoding set, rows
    whose column 14 lacks '_coding' contribute no intervals.

    Genes seen on more than one (chromosome, strand) combination are dropped
    and reported with a "FAILED" line on stdout.  For each remaining gene one
    row is written to outFN: gene name, chromosome, strand, comma-joined
    interval lower bounds, comma-joined interval upper bounds.
    '''
    acceptableTypes = acceptableTypes.strip().split(',')

    geneName_intervalSet = {}
    geneName_info = {}
    # Context manager closes the transcript file on every exit path.
    with open(tranFN, 'r') as f:
        for line in f:
            ls = line.strip().split('\t')
            tChrom, tStrand = ls[1], bioLibCG.switchStrandFormat(ls[2])
            tStart, tEnd = int(ls[3]), int(ls[4]) - 1
            cStart, cEnd = int(ls[5]), int(ls[6]) - 1
            exonStarts = [int(x) for x in ls[8][:-1].split(',')]
            exonEnds = [int(x) - 1 for x in ls[9][:-1].split(',')]
            exonPairs = zip(exonStarts, exonEnds)
            codingStatus = '_coding' in ls[13]
            geneName = ls[10]

            # An intron is the gap between each pair of consecutive exons.
            intronPairs = [(prev[1] + 1, nxt[0] - 1)
                           for prev, nxt in zip(exonPairs, exonPairs[1:])]

            # take care of messy UTRs and assign utr ranges: a coding
            # boundary on a transcript edge means no UTR on that side.
            edges = (tStart, tEnd + 1)
            if cStart in edges:
                lowSide = ()
            else:
                lowSide = (tStart, cStart - 1)
            if (cEnd + 1) in edges:
                highSide = ()
            else:
                highSide = (cEnd + 1, tEnd)
            if tStrand == '1':
                range5, range3 = lowSide, highSide
            else:
                range5, range3 = highSide, lowSide

            utr5 = compareData.subtractTwoRanges([range5], intronPairs)
            utr3 = compareData.subtractTwoRanges([range3], intronPairs)

            exonPairs = compareData.subtractTwoRanges(exonPairs, [range5])
            exonPairs = compareData.subtractTwoRanges(exonPairs, [range3])

            # Recorded for every row, including ones filtered out below.
            geneName_info.setdefault(geneName, set()).add((tChrom, tStrand))

            if onlyCoding and not codingStatus:
                continue

            pairs__type = [(exonPairs, 'EXON'), (intronPairs, 'INTRON'),
                           (utr5, '5UTR'), (utr3, '3UTR')]
            for pairs, regionType in pairs__type:
                if regionType not in acceptableTypes:
                    continue
                for pair in pairs:
                    # create geneset if it does not exist
                    if geneName not in geneName_intervalSet:
                        geneName_intervalSet[geneName] = IntervalSet()

                    # +1 on the upper bound, as in the original Interval call
                    geneName_intervalSet[geneName].add(
                        Interval(pair[0], pair[1] + 1))

    for geneName, info in geneName_info.iteritems():
        # if it spans different chromosomes/strands...
        if len(info) > 1 and geneName in geneName_intervalSet:
            del geneName_intervalSet[geneName]
            print('%s %s FAILED' % (geneName, info))

    with open(outFN, 'w') as fOut:
        for geneName, iSet in geneName_intervalSet.iteritems():
            gStarts = []
            gEnds = []
            for interv in iSet:
                gStarts.append(str(interv.lower_bound))
                gEnds.append(str(interv.upper_bound))

            # Exactly one (chrom, strand) remains for every surviving gene.
            chrom, strand = geneName_info[geneName].pop()
            outString = [geneName, chrom, strand,
                         ','.join(gStarts), ','.join(gEnds)]
            fOut.write('\t'.join([str(x) for x in outString]) + '\n')
Esempio n. 25
0
def getSplicingUnitOccupancy(tranFN, wigDir1, wigDir2, chrom, strand, maxCut):
    '''get the number of spots in each data set, and the number that overlap

    wigDir2 has to be hela cuz strand flip: its wig is loaded for the
    opposite strand (bioLibCG.switchStrand(strand)).

    For every transcript row of tranFN on chrom/strand whose column 14
    contains '_coding', the exon coordinates left after subtracting the
    5'/3' UTR ranges are visited once each (coordinates shared between
    transcripts are counted a single time).  For each cut in
    1 .. maxCut - 1 three counters are kept: coordinates whose wig-1 value
    equals cut, whose wig-2 value equals cut, and where both do.  Prints one
    tab-separated line per cut: the three counters, "<chrom>:<strand>" and
    the cut.

    Intron coordinates and noncoding transcripts are no longer walked: the
    original iterated over them without doing anything.  The unused
    intron-based UTR subtraction was dropped as well.
    '''
    maxCut = int(maxCut)

    oppStrand = bioLibCG.switchStrand(strand)
    coord_value1 = cgWig.loadSingleWig(wigDir1, chrom, strand, 'ALL')
    coord_value2 = cgWig.loadSingleWig(wigDir2, chrom, oppStrand, 'ALL')

    # 0, 0, 0 = num1, num2, numOverlap
    covered = set()
    cutoff_overlap = dict((i, [0, 0, 0]) for i in range(maxCut))
    # Context manager closes the transcript file on every exit path.
    with open(tranFN, 'r') as f:
        for line in f:
            ls = line.strip().split('\t')
            tChrom, tStrand = ls[1], bioLibCG.switchStrandFormat(ls[2])
            if tChrom != chrom or tStrand != strand:
                continue
            tStart, tEnd = int(ls[3]), int(ls[4]) - 1
            cStart, cEnd = int(ls[5]), int(ls[6]) - 1
            exonStarts = [int(x) for x in ls[8][:-1].split(',')]
            exonEnds = [int(x) - 1 for x in ls[9][:-1].split(',')]
            exonPairs = zip(exonStarts, exonEnds)

            # Only coding transcripts contribute to the counters.
            if '_coding' not in ls[13]:
                continue

            # take care of messy UTRs and assign utr ranges: a coding
            # boundary on a transcript edge means no UTR on that side.
            edges = (tStart, tEnd + 1)
            if cStart in edges:
                lowSide = ()
            else:
                lowSide = (tStart, cStart - 1)
            if (cEnd + 1) in edges:
                highSide = ()
            else:
                highSide = (cEnd + 1, tEnd)
            if strand == '1':
                range5, range3 = lowSide, highSide
            else:
                range5, range3 = highSide, lowSide

            exonPairs = compareData.subtractTwoRanges(exonPairs, [range5])
            exonPairs = compareData.subtractTwoRanges(exonPairs, [range3])

            for pair in exonPairs:
                for i in xrange(pair[0], pair[1] + 1):
                    # multiple transcripts will have same exons
                    if i in covered:
                        continue
                    covered.add(i)
                    val1 = coord_value1.get(i, 0)
                    val2 = coord_value2.get(i, 0)

                    # Exact match per cut (not a ">= cut" threshold).
                    for cut in range(1, maxCut):
                        in1 = (val1 == cut)
                        in2 = (val2 == cut)

                        if in1 and in2:
                            cutoff_overlap[cut][2] += 1
                        if in1:
                            cutoff_overlap[cut][0] += 1
                        if in2:
                            cutoff_overlap[cut][1] += 1

    for i in range(1, maxCut):
        cutoff_overlap[i].extend(['%s:%s' % (chrom, strand), i])
        print('\t'.join([str(x) for x in cutoff_overlap[i]]))
Esempio n. 26
0
def makeContextWig(tranFN, wigDir, chrom, strand, species='hg19'):
    '''Write a 'context' wig tagging each transcribed position of one chrom/strand.

    Every position covered by a matching transcript in tranFN is tagged with
    the kind of region it falls in: 5UTR, EXON, INTRON or 3UTR.  The tag is
    prefixed with C_ when column 14 (ls[13]) of the transcript line contains
    '_coding' and with NC_ otherwise.  Positions hit by several transcripts
    end up with the comma-joined set of distinct tags.

    tranFN  -- tab-separated transcript table, one transcript per line
    wigDir  -- passed through to writeWigDictToWig
    chrom   -- only lines whose column 2 equals this are used
    strand  -- only lines whose column 3, converted with
               bioLibCG.switchStrandFormat, equals this are used
    species -- passed through to writeWigDictToWig
    '''
    p = bioLibCG.cgPrint()
    p.show = False  # debug tracing off; the p.tell calls below are kept for it
    debugSpot = 23631989  # position reported through p.tell while debugging
    coord_id = {}

    # 'with' guarantees the transcript file is closed, even on a parse error.
    with open(tranFN, 'r') as f:
        for line in f:
            ls = line.strip().split('\t')
            tChrom, tStrand = ls[1], bioLibCG.switchStrandFormat(ls[2])
            if tChrom != chrom or tStrand != strand:
                continue
            # End columns are decremented so all ranges are inclusive on both
            # sides (the fill loop below runs to pair[1] + 1).
            tStart, tEnd = int(ls[3]), int(ls[4]) - 1
            cStart, cEnd = int(ls[5]), int(ls[6]) - 1
            # The exon columns end with a trailing comma, hence the [:-1].
            exonStarts = [int(x) for x in ls[8][:-1].split(',')]
            exonEnds = [int(x) - 1 for x in ls[9][:-1].split(',')]
            exonPairs = list(zip(exonStarts, exonEnds))
            codingStatus = '_coding' in ls[13]
            tID = ls[0]

            # An intron is the gap between two consecutive exons.
            intronPairs = [(prev[1] + 1, nxt[0] - 1)
                           for prev, nxt in zip(exonPairs, exonPairs[1:])]

            # Candidate UTR spans on the low- and high-coordinate side; an
            # empty tuple means that side has no UTR.
            if cStart == tStart or cStart == tEnd + 1:
                lowUtr = ()
            else:
                lowUtr = (tStart, cStart - 1)
            if cEnd + 1 == tStart or cEnd + 1 == tEnd + 1:
                highUtr = ()
            else:
                highUtr = (cEnd + 1, tEnd)
            # On strand '1' the low side is the 5' end, otherwise the 3' end.
            if strand == '1':
                range5, range3 = lowUtr, highUtr
            else:
                range5, range3 = highUtr, lowUtr
            if not range5:
                p.tell('5 is none')
            if not range3:
                p.tell('3 is none')

            p.tell('ranges', range5, range3)
            p.tell('intronRange', intronPairs)
            # UTRs are the UTR spans minus introns; exons lose their UTR part.
            utr5 = compareData.subtractTwoRanges([range5], intronPairs)
            utr3 = compareData.subtractTwoRanges([range3], intronPairs)
            p.tell('utr', utr5, utr3)

            p.tell('exon before', exonPairs)
            exonPairs = compareData.subtractTwoRanges(exonPairs, [range5])
            exonPairs = compareData.subtractTwoRanges(exonPairs, [range3])
            p.tell('exon after', exonPairs)

            prefix = 'C_' if codingStatus else 'NC_'
            # (ranges, tag suffix, progress message, debug message), in the
            # order the tags are appended for a position.
            sections = (
                (utr5, '5UTR', 'filling utr5', '*** 5UTR'),
                (exonPairs, 'EXON', 'filling exons', '*** exon'),
                (intronPairs, 'INTRON', 'filling introns', '*** INTRON'),
                (utr3, '3UTR', 'filling utr3', ' *** 3UTR'),
            )
            for pairs, suffix, fillMsg, debugMsg in sections:
                tag = prefix + suffix + ' '
                for pair in pairs:
                    p.tell(fillMsg, pair[0], pair[1])
                    # One range test replaces a comparison on every position.
                    if pair[0] <= debugSpot <= pair[1]:
                        p.tell(debugMsg, codingStatus, tID)
                    for pos in xrange(pair[0], pair[1] + 1):
                        coord_id[pos] = coord_id.get(pos, '') + tag

    # Collapse the space-separated tag string of each position into the
    # comma-joined set of distinct tags.
    for pos in coord_id:
        coord_id[pos] = ','.join(set(coord_id[pos].strip().split(' ')))

    writeWigDictToWig(coord_id, chrom, strand, species, 'context', wigDir,
                      'INTER')
Esempio n. 27
0
def getSplicingUnitLengths(tranFN, wigDir, chrom, strand):
    '''Print the total number of distinct exonic positions of coding transcripts.

    Every line of tranFN is used (there is no chrom/strand filter here).  For
    transcripts whose column 14 (ls[13]) contains '_coding', the exon ranges
    with both UTR spans removed are added to a per-chromosome, per-strand set
    of positions, so overlapping exons are counted once.  Two lines are
    printed: the exon total and the intron total.  Intron positions are not
    collected, so the intron total is always 0.

    tranFN -- tab-separated transcript table, one transcript per line
    wigDir -- unused
    chrom  -- unused
    strand -- only decides which UTR span is removed from the exons first
    '''
    exonChr_strand_coord = {}

    # 'with' guarantees the transcript file is closed, even on a parse error.
    with open(tranFN, 'r') as f:
        for line in f:
            ls = line.strip().split('\t')
            tChrom, tStrand = ls[1], bioLibCG.switchStrandFormat(ls[2])
            # End columns are decremented so all ranges are inclusive.
            tStart, tEnd = int(ls[3]), int(ls[4]) - 1
            cStart, cEnd = int(ls[5]), int(ls[6]) - 1
            # The exon columns end with a trailing comma, hence the [:-1].
            exonStarts = [int(x) for x in ls[8][:-1].split(',')]
            exonEnds = [int(x) - 1 for x in ls[9][:-1].split(',')]
            exonPairs = list(zip(exonStarts, exonEnds))

            # Only coding transcripts contribute; decide once per transcript
            # instead of once per position.
            if '_coding' not in ls[13]:
                continue

            # Candidate UTR spans on the low- and high-coordinate side; an
            # empty tuple means that side has no UTR.
            if cStart == tStart or cStart == tEnd + 1:
                lowUtr = ()
            else:
                lowUtr = (tStart, cStart - 1)
            if cEnd + 1 == tStart or cEnd + 1 == tEnd + 1:
                highUtr = ()
            else:
                highUtr = (cEnd + 1, tEnd)
            if strand == '1':
                range5, range3 = lowUtr, highUtr
            else:
                range5, range3 = highUtr, lowUtr

            exonPairs = compareData.subtractTwoRanges(exonPairs, [range5])
            exonPairs = compareData.subtractTwoRanges(exonPairs, [range3])

            positions = exonChr_strand_coord.setdefault(
                tChrom, {}).setdefault(tStrand, set())
            for pair in exonPairs:
                positions.update(xrange(pair[0], pair[1] + 1))

    eLength = 0
    for byStrand in exonChr_strand_coord.values():
        for positions in byStrand.values():
            eLength += len(positions)

    # Nothing is ever recorded for introns, so their total stays 0.
    iLength = 0

    print('total Exon Length (all exons overlapped) %s' % eLength)
    print('total intron Length (all introns overlapped) %s' % iLength)
def getSplicingUnitOccupancy(tranFN, wigDir1, wigDir2, chrom, strand, maxCut):
    """Count coding-exon positions per wig value in two data sets and in both.

    The first wig is loaded for (chrom, strand) and the second for the
    opposite strand (bioLibCG.switchStrand(strand)); per the original note,
    wigDir2 has to be the HeLa data set because of that strand flip.  Each
    exonic position of a matching coding transcript is visited once.  For
    every cut in 1..maxCut-1 the function counts positions whose value equals
    the cut in data set 1, in data set 2, and in both.

    One tab-separated line is printed per cut:
    num1, num2, numOverlap, 'chrom:strand', cut.

    tranFN  -- tab-separated transcript table, one transcript per line
    wigDir1 -- first wig directory, read on `strand`
    wigDir2 -- second wig directory, read on the opposite strand
    chrom, strand -- only transcripts matching both are used
    maxCut  -- exclusive upper bound of the cuts; converted with int()
    """
    maxCut = int(maxCut)

    oppStrand = bioLibCG.switchStrand(strand)
    coord_value1 = cgWig.loadSingleWig(wigDir1, chrom, strand, "ALL")
    coord_value2 = cgWig.loadSingleWig(wigDir2, chrom, oppStrand, "ALL")

    # cut -> [num1, num2, numOverlap]
    cutoff_overlap = dict((cut, [0, 0, 0]) for cut in range(maxCut))
    cuts = range(1, maxCut)
    covered = set()  # positions already counted; transcripts share exons

    # 'with' guarantees the transcript file is closed, even on a parse error.
    with open(tranFN, "r") as f:
        for line in f:
            ls = line.strip().split("\t")
            tChrom, tStrand = ls[1], bioLibCG.switchStrandFormat(ls[2])
            if tChrom != chrom or tStrand != strand:
                continue
            # End columns are decremented so all ranges are inclusive.
            tStart, tEnd = int(ls[3]), int(ls[4]) - 1
            cStart, cEnd = int(ls[5]), int(ls[6]) - 1
            # The exon columns end with a trailing comma, hence the [:-1].
            exonStarts = [int(x) for x in ls[8][:-1].split(",")]
            exonEnds = [int(x) - 1 for x in ls[9][:-1].split(",")]
            exonPairs = list(zip(exonStarts, exonEnds))

            # Only coding transcripts are counted; decide once per transcript
            # instead of once per position.
            if "_coding" not in ls[13]:
                continue

            # Candidate UTR spans on the low- and high-coordinate side; an
            # empty tuple means that side has no UTR.
            if cStart == tStart or cStart == tEnd + 1:
                lowUtr = ()
            else:
                lowUtr = (tStart, cStart - 1)
            if cEnd + 1 == tStart or cEnd + 1 == tEnd + 1:
                highUtr = ()
            else:
                highUtr = (cEnd + 1, tEnd)
            if strand == "1":
                range5, range3 = lowUtr, highUtr
            else:
                range5, range3 = highUtr, lowUtr

            exonPairs = compareData.subtractTwoRanges(exonPairs, [range5])
            exonPairs = compareData.subtractTwoRanges(exonPairs, [range3])

            for pair in exonPairs:
                for pos in xrange(pair[0], pair[1] + 1):
                    if pos in covered:
                        continue
                    covered.add(pos)
                    val1 = coord_value1.get(pos, 0)
                    val2 = coord_value2.get(pos, 0)

                    for cut in cuts:
                        # Exact match per cut; swap in ">= cut" here for
                        # cumulative thresholds.
                        in1 = val1 == cut
                        in2 = val2 == cut
                        counts = cutoff_overlap[cut]
                        if in1:
                            counts[0] += 1
                        if in2:
                            counts[1] += 1
                        if in1 and in2:
                            counts[2] += 1

    for cut in cuts:
        row = cutoff_overlap[cut] + ["%s:%s" % (chrom, strand), cut]
        print("\t".join([str(x) for x in row]))
Esempio n. 29
0
def makeContextWig(tranFN, wigDir, chrom, strand, species='hg19'):
    '''Write a 'context' wig tagging each transcribed position of one chrom/strand.

    Every position covered by a matching transcript in tranFN is tagged with
    the kind of region it falls in: 5UTR, EXON, INTRON or 3UTR.  The tag is
    prefixed with C_ when column 14 (ls[13]) of the transcript line contains
    '_coding' and with NC_ otherwise.  Positions hit by several transcripts
    end up with the comma-joined set of distinct tags.

    tranFN  -- tab-separated transcript table, one transcript per line
    wigDir  -- passed through to writeWigDictToWig
    chrom   -- only lines whose column 2 equals this are used
    strand  -- only lines whose column 3, converted with
               bioLibCG.switchStrandFormat, equals this are used
    species -- passed through to writeWigDictToWig
    '''
    p = bioLibCG.cgPrint()
    p.show = False  # debug tracing off; the p.tell calls below are kept for it
    debugSpot = 23631989  # position reported through p.tell while debugging
    coord_id = {}

    # 'with' guarantees the transcript file is closed, even on a parse error.
    with open(tranFN, 'r') as f:
        for line in f:
            ls = line.strip().split('\t')
            tChrom, tStrand = ls[1], bioLibCG.switchStrandFormat(ls[2])
            if tChrom != chrom or tStrand != strand:
                continue
            # End columns are decremented so all ranges are inclusive on both
            # sides (the fill loop below runs to pair[1] + 1).
            tStart, tEnd = int(ls[3]), int(ls[4]) - 1
            cStart, cEnd = int(ls[5]), int(ls[6]) - 1
            # The exon columns end with a trailing comma, hence the [:-1].
            exonStarts = [int(x) for x in ls[8][:-1].split(',')]
            exonEnds = [int(x) - 1 for x in ls[9][:-1].split(',')]
            exonPairs = list(zip(exonStarts, exonEnds))
            codingStatus = '_coding' in ls[13]
            tID = ls[0]

            # An intron is the gap between two consecutive exons.
            intronPairs = [(prev[1] + 1, nxt[0] - 1)
                           for prev, nxt in zip(exonPairs, exonPairs[1:])]

            # Candidate UTR spans on the low- and high-coordinate side; an
            # empty tuple means that side has no UTR.
            if cStart == tStart or cStart == tEnd + 1:
                lowUtr = ()
            else:
                lowUtr = (tStart, cStart - 1)
            if cEnd + 1 == tStart or cEnd + 1 == tEnd + 1:
                highUtr = ()
            else:
                highUtr = (cEnd + 1, tEnd)
            # On strand '1' the low side is the 5' end, otherwise the 3' end.
            if strand == '1':
                range5, range3 = lowUtr, highUtr
            else:
                range5, range3 = highUtr, lowUtr
            if not range5:
                p.tell('5 is none')
            if not range3:
                p.tell('3 is none')

            p.tell('ranges', range5, range3)
            p.tell('intronRange', intronPairs)
            # UTRs are the UTR spans minus introns; exons lose their UTR part.
            utr5 = compareData.subtractTwoRanges([range5], intronPairs)
            utr3 = compareData.subtractTwoRanges([range3], intronPairs)
            p.tell('utr', utr5, utr3)

            p.tell('exon before', exonPairs)
            exonPairs = compareData.subtractTwoRanges(exonPairs, [range5])
            exonPairs = compareData.subtractTwoRanges(exonPairs, [range3])
            p.tell('exon after', exonPairs)

            prefix = 'C_' if codingStatus else 'NC_'
            # (ranges, tag suffix, progress message, debug message), in the
            # order the tags are appended for a position.
            sections = (
                (utr5, '5UTR', 'filling utr5', '*** 5UTR'),
                (exonPairs, 'EXON', 'filling exons', '*** exon'),
                (intronPairs, 'INTRON', 'filling introns', '*** INTRON'),
                (utr3, '3UTR', 'filling utr3', ' *** 3UTR'),
            )
            for pairs, suffix, fillMsg, debugMsg in sections:
                tag = prefix + suffix + ' '
                for pair in pairs:
                    p.tell(fillMsg, pair[0], pair[1])
                    # One range test replaces a comparison on every position.
                    if pair[0] <= debugSpot <= pair[1]:
                        p.tell(debugMsg, codingStatus, tID)
                    for pos in xrange(pair[0], pair[1] + 1):
                        coord_id[pos] = coord_id.get(pos, '') + tag

    # Collapse the space-separated tag string of each position into the
    # comma-joined set of distinct tags.
    for pos in coord_id:
        coord_id[pos] = ','.join(set(coord_id[pos].strip().split(' ')))

    writeWigDictToWig(coord_id, chrom, strand, species, 'context', wigDir,
                      'INTER')
Esempio n. 30
0
def makeContextDB(tranFN, chrom, strand, outFN):
    '''Write one "cID TYPE TCC" line per context region of matching transcripts.

    For every transcript in tranFN on the given chrom/strand, the 5'UTR,
    exon, intron and 3'UTR ranges are written to outFN in that order, as
    tab-separated lines of: a running integer id (starting at 0), the region
    type and a tcc string built by bioLibCG.makeTcc.  The type is prefixed
    with C_ when column 14 (ls[13]) contains '_coding' and NC_ otherwise.

    tranFN -- tab-separated transcript table, one transcript per line
    chrom, strand -- only transcripts matching both are used
    outFN  -- output path; overwritten
    '''
    nextId = 0

    # Nested 'with' blocks close both files even if a line fails to parse.
    with open(tranFN, 'r') as f:
        with open(outFN, 'w') as fOut:
            for line in f:
                ls = line.strip().split('\t')
                tChrom, tStrand = ls[1], bioLibCG.switchStrandFormat(ls[2])
                if tChrom != chrom or tStrand != strand:
                    continue
                # End columns are decremented so all ranges are inclusive.
                tStart, tEnd = int(ls[3]), int(ls[4]) - 1
                cStart, cEnd = int(ls[5]), int(ls[6]) - 1
                # The exon columns end with a trailing comma, hence the [:-1].
                exonStarts = [int(x) for x in ls[8][:-1].split(',')]
                exonEnds = [int(x) - 1 for x in ls[9][:-1].split(',')]
                exonPairs = list(zip(exonStarts, exonEnds))
                codingStatus = '_coding' in ls[13]

                # An intron is the gap between two consecutive exons.
                intronPairs = [(prev[1] + 1, nxt[0] - 1)
                               for prev, nxt in zip(exonPairs, exonPairs[1:])]

                # Candidate UTR spans on the low- and high-coordinate side;
                # an empty tuple means that side has no UTR.
                if cStart == tStart or cStart == tEnd + 1:
                    lowUtr = ()
                else:
                    lowUtr = (tStart, cStart - 1)
                if cEnd + 1 == tStart or cEnd + 1 == tEnd + 1:
                    highUtr = ()
                else:
                    highUtr = (cEnd + 1, tEnd)
                # On strand '1' the low side is the 5' end, else the 3' end.
                if strand == '1':
                    range5, range3 = lowUtr, highUtr
                else:
                    range5, range3 = highUtr, lowUtr

                # UTRs are the UTR spans minus introns; exons lose their
                # UTR part.
                utr5 = compareData.subtractTwoRanges([range5], intronPairs)
                utr3 = compareData.subtractTwoRanges([range3], intronPairs)
                exonPairs = compareData.subtractTwoRanges(exonPairs, [range5])
                exonPairs = compareData.subtractTwoRanges(exonPairs, [range3])

                prefix = 'C_' if codingStatus else 'NC_'
                sections = (
                    (utr5, '5UTR'),
                    (exonPairs, 'EXON'),
                    (intronPairs, 'INTRON'),
                    (utr3, '3UTR'),
                )
                for pairs, suffix in sections:
                    for pair in pairs:
                        # Both ends are shifted by +1 for the written tcc.
                        myTcc = bioLibCG.makeTcc(chrom, strand,
                                                 pair[0] + 1, pair[1] + 1)
                        fields = [str(nextId), prefix + suffix, myTcc]
                        fOut.write('\t'.join(fields) + '\n')
                        nextId += 1
def getSplicingUnitLengths(tranFN, wigDir, chrom, strand):
    '''Print the total number of distinct exonic positions of coding transcripts.

    Every line of tranFN is used (there is no chrom/strand filter here).  For
    transcripts whose column 14 (ls[13]) contains '_coding', the exon ranges
    with both UTR spans removed are added to a per-chromosome, per-strand set
    of positions, so overlapping exons are counted once.  Two lines are
    printed: the exon total and the intron total.  Intron positions are not
    collected, so the intron total is always 0.

    tranFN -- tab-separated transcript table, one transcript per line
    wigDir -- unused
    chrom  -- unused
    strand -- only decides which UTR span is removed from the exons first
    '''
    exonChr_strand_coord = {}

    # 'with' guarantees the transcript file is closed, even on a parse error.
    with open(tranFN, 'r') as f:
        for line in f:
            ls = line.strip().split('\t')
            tChrom, tStrand = ls[1], bioLibCG.switchStrandFormat(ls[2])
            # End columns are decremented so all ranges are inclusive.
            tStart, tEnd = int(ls[3]), int(ls[4]) - 1
            cStart, cEnd = int(ls[5]), int(ls[6]) - 1
            # The exon columns end with a trailing comma, hence the [:-1].
            exonStarts = [int(x) for x in ls[8][:-1].split(',')]
            exonEnds = [int(x) - 1 for x in ls[9][:-1].split(',')]
            exonPairs = list(zip(exonStarts, exonEnds))

            # Only coding transcripts contribute; decide once per transcript
            # instead of once per position.
            if '_coding' not in ls[13]:
                continue

            # Candidate UTR spans on the low- and high-coordinate side; an
            # empty tuple means that side has no UTR.
            if cStart == tStart or cStart == tEnd + 1:
                lowUtr = ()
            else:
                lowUtr = (tStart, cStart - 1)
            if cEnd + 1 == tStart or cEnd + 1 == tEnd + 1:
                highUtr = ()
            else:
                highUtr = (cEnd + 1, tEnd)
            if strand == '1':
                range5, range3 = lowUtr, highUtr
            else:
                range5, range3 = highUtr, lowUtr

            exonPairs = compareData.subtractTwoRanges(exonPairs, [range5])
            exonPairs = compareData.subtractTwoRanges(exonPairs, [range3])

            positions = exonChr_strand_coord.setdefault(
                tChrom, {}).setdefault(tStrand, set())
            for pair in exonPairs:
                positions.update(xrange(pair[0], pair[1] + 1))

    eLength = 0
    for byStrand in exonChr_strand_coord.values():
        for positions in byStrand.values():
            eLength += len(positions)

    # Nothing is ever recorded for introns, so their total stays 0.
    iLength = 0

    print('total Exon Length (all exons overlapped) %s' % eLength)
    print('total intron Length (all introns overlapped) %s' % iLength)
Esempio n. 32
0
def checkMessy(tranFN):
    '''Print how many transcripts in tranFN lack a 5' and/or 3' UTR.

    One line of five space-separated counts is printed:
    all transcripts, transcripts without a 5'UTR, without a 3'UTR, coding
    transcripts without a 5'UTR, coding transcripts without a 3'UTR.

    A transcript counts as coding when column 16 (ls[15]) contains '_coding'.
    NOTE(review): the other transcript readers in this file test ls[13];
    confirm which column this input format really uses.

    tranFN -- tab-separated transcript table, one transcript per line
    '''
    p = bioLibCG.cgPrint()
    p.show = False  # debug tracing off; the p.tell calls below are kept for it

    total = 0
    no5 = 0
    no3 = 0
    codingNo5 = 0
    codingNo3 = 0

    # 'with' guarantees the transcript file is closed, even on a parse error.
    with open(tranFN, 'r') as f:
        for line in f:
            ls = line.strip().split('\t')
            strand = bioLibCG.switchStrandFormat(ls[2])
            # End columns are decremented, as in the other transcript readers.
            tStart, tEnd = int(ls[3]), int(ls[4]) - 1
            cStart, cEnd = int(ls[5]), int(ls[6]) - 1
            codingStatus = '_coding' in ls[15]

            # A side has no UTR when the c-range boundary sits on either end
            # of the t-range.
            noLowUtr = cStart == tStart or cStart == tEnd + 1
            noHighUtr = cEnd + 1 == tStart or cEnd + 1 == tEnd + 1
            # On strand '1' the low side is the 5' end, otherwise the 3' end.
            if strand == '1':
                missing5, missing3 = noLowUtr, noHighUtr
            else:
                missing5, missing3 = noHighUtr, noLowUtr

            if missing5:
                p.tell('5 is none')
                no5 += 1
                if codingStatus:
                    codingNo5 += 1

            if missing3:
                p.tell('3 is none')
                no3 += 1
                if codingStatus:
                    codingNo3 += 1

            total += 1

    print('%s %s %s %s %s' % (total, no5, no3, codingNo5, codingNo3))
def makeContextDB(tranFN, chrom, strand, outFN):
    '''Write one "cID<TAB>TYPE<TAB>TCC" row per transcript region, used for making wigs.

    tranFN -- tab-separated transcript file. Columns read (0-based index):
              1 chromosome, 2 strand (passed through
              bioLibCG.switchStrandFormat), 3/4 transcript start/end,
              5/6 coding start/end, 8/9 comma-terminated exon starts/ends,
              13 type string (a transcript counts as coding when it
              contains '_coding').
    chrom, strand -- only transcripts whose chromosome and converted strand
              equal these values are processed; strand '1' is treated as
              the forward strand, anything else as the reverse strand.
    outFN  -- output path; overwritten.

    For every selected transcript the rows are written in the order
    5UTR pieces, exons (with both UTR ranges subtracted), introns, 3UTR
    pieces.  TYPE is prefixed with 'C_' for coding and 'NC_' for non-coding
    transcripts.  cID is a counter that starts at 0 and increases by one per
    row written.  Returns None.
    '''

    # Order in which the region classes are emitted for each transcript.
    nextID = 0

    # "with" guarantees both handles are closed even if a malformed line or a
    # helper raises part-way through the file.
    with open(tranFN, 'r') as f, open(outFN, 'w') as fOut:
        for line in f:
            ls = line.strip().split('\t')
            tChrom, tStrand = ls[1], bioLibCG.switchStrandFormat(ls[2])
            if tChrom != chrom or tStrand != strand:
                continue

            # End coordinates are shifted down by one so that every range
            # below is inclusive on both sides.
            tStart, tEnd = int(ls[3]), int(ls[4]) - 1
            cStart, cEnd = int(ls[5]), int(ls[6]) - 1
            # [:-1] drops the trailing comma of the exon lists.
            exonStarts = [int(x) for x in ls[8][:-1].split(',')]
            exonEnds = [int(x) - 1 for x in ls[9][:-1].split(',')]
            # list() keeps the pairs sliceable/re-iterable on Python 3 too.
            exonPairs = list(zip(exonStarts, exonEnds))
            codingStatus = '_coding' in ls[13]

            # An intron is the gap between two consecutive exons.
            intronPairs = [(prevExon[1] + 1, nextExon[0] - 1)
                           for prevExon, nextExon
                           in zip(exonPairs, exonPairs[1:])]

            # Take care of messy UTRs: a coding boundary sitting on either
            # transcript edge means there is no UTR on that side.
            if cStart == tStart or cStart == tEnd + 1:
                lowUTR = ()
            else:
                lowUTR = (tStart, cStart - 1)

            if cEnd + 1 == tStart or cEnd + 1 == tEnd + 1:
                highUTR = ()
            else:
                highUTR = (cEnd + 1, tEnd)

            # The low-coordinate side is the 5' end on strand '1' and the
            # 3' end otherwise.
            if strand == '1':
                range5, range3 = lowUTR, highUTR
            else:
                range5, range3 = highUTR, lowUTR

            # NOTE(review): an absent UTR is passed on as an empty tuple;
            # presumably subtractTwoRanges tolerates that -- confirm.
            utr5 = compareData.subtractTwoRanges([range5], intronPairs)
            utr3 = compareData.subtractTwoRanges([range3], intronPairs)

            exonPairs = compareData.subtractTwoRanges(exonPairs, [range5])
            exonPairs = compareData.subtractTwoRanges(exonPairs, [range3])

            prefix = 'C_' if codingStatus else 'NC_'
            sections = (('5UTR', utr5),
                        ('EXON', exonPairs),
                        ('INTRON', intronPairs),
                        ('3UTR', utr3))
            for label, regions in sections:
                for pair in regions:
                    # Both coordinates are shifted up by one for the tcc.
                    myTcc = bioLibCG.makeTcc(chrom, strand,
                                             pair[0] + 1, pair[1] + 1)
                    row = [str(nextID), prefix + label, myTcc]
                    fOut.write('\t'.join(row) + '\n')
                    nextID += 1