def getSplicingUnitLengths(tranFN, wigDir, chrom, strand):
    '''Print how many distinct bases are covered by coding exons and introns.

    Every line of tranFN is read as a tab-separated transcript record
    (0-based columns): 1 = chromosome, 2 = strand, 3/4 = transcript
    start/end, 5/6 = coding start/end, 8/9 = comma-terminated exon
    starts/ends, 13 = a field containing '_coding' for coding transcripts.
    End columns are decremented by one so every range is inclusive.

    No chromosome/strand filtering is applied: wigDir, chrom and strand are
    accepted for interface compatibility but are not used.  Non-coding
    transcripts are ignored.  Bases shared by several transcripts are
    counted once per chromosome/strand.  Intron coordinates are not
    collected, so the reported intron length is always 0.

    Returns None; the two totals are printed to stdout.
    '''
    exonChr_strand_coord = {}
    # Intron accumulation is disabled, so this mapping stays empty.
    intronChr_strand_coord = {}

    with open(tranFN, 'r') as f:
        for line in f:
            ls = line.strip().split('\t')
            tChrom, tStrand = ls[1], bioLibCG.switchStrandFormat(ls[2])
            tStart, tEnd = int(ls[3]), int(ls[4]) - 1
            cStart, cEnd = int(ls[5]), int(ls[6]) - 1
            # the exon columns end with a trailing comma, hence the [:-1]
            exonStarts = [int(x) for x in ls[8][:-1].split(',')]
            exonEnds = [int(x) - 1 for x in ls[9][:-1].split(',')]
            exonPairs = list(zip(exonStarts, exonEnds))

            # only coding transcripts contribute to the totals
            if '_coding' not in ls[13]:
                continue

            # Untranslated flanks on either side of the coding region; an
            # empty tuple means the coding region touches that transcript
            # boundary (or there is no coding region at all).
            if cStart == tStart or cStart == tEnd + 1:
                leftFlank = ()
            else:
                leftFlank = (tStart, cStart - 1)
            if cEnd + 1 == tStart or cEnd + 1 == tEnd + 1:
                rightFlank = ()
            else:
                rightFlank = (cEnd + 1, tEnd)

            # Which flank is the 5' UTR depends on the transcript's own
            # strand (no strand filter is applied in this function).
            if tStrand == '1':
                range5, range3 = leftFlank, rightFlank
            else:
                range5, range3 = rightFlank, leftFlank

            # strip both UTRs so only the coding part of each exon remains
            exonPairs = compareData.subtractTwoRanges(exonPairs, [range5])
            exonPairs = compareData.subtractTwoRanges(exonPairs, [range3])

            coords = exonChr_strand_coord.setdefault(
                tChrom, {}).setdefault(tStrand, set())
            for pair in exonPairs:
                coords.update(xrange(pair[0], pair[1] + 1))

    iLength = sum(len(coords)
                  for byStrand in intronChr_strand_coord.values()
                  for coords in byStrand.values())
    eLength = sum(len(coords)
                  for byStrand in exonChr_strand_coord.values()
                  for coords in byStrand.values())

    print('total Exon Length (all exons overlapped) %s' % eLength)
    print('total intron Length (all introns overlapped) %s' % iLength)
Beispiel #2
0
def makeContextDB(tranFN, chrom, strand, outFN):
    '''Write one "cID<TAB>TYPE<TAB>TCC" line per transcript segment to outFN.

    tranFN is read as tab-separated transcript records (0-based columns):
    0 = transcript ID, 1 = chromosome, 2 = strand, 3/4 = transcript
    start/end, 5/6 = coding start/end, 8/9 = comma-terminated exon
    starts/ends, 13 = a field containing '_coding' for coding transcripts.
    Only records on the requested chrom/strand are processed.

    For each record the 5' UTR, exon (UTR removed), intron and 3' UTR
    segments are written in that order.  TYPE is the segment name prefixed
    with 'C_' for coding and 'NC_' for non-coding transcripts.  cID is a
    running counter starting at 0 that is shared by all segments.

    Returns None.  Both files are closed even if a record fails to parse.
    '''
    nextID = 0
    with open(tranFN, 'r') as f, open(outFN, 'w') as fOut:
        for line in f:
            ls = line.strip().split('\t')
            tChrom, tStrand = ls[1], bioLibCG.switchStrandFormat(ls[2])
            if tChrom != chrom or tStrand != strand:
                continue
            tStart, tEnd = int(ls[3]), int(ls[4]) - 1
            cStart, cEnd = int(ls[5]), int(ls[6]) - 1
            # the exon columns end with a trailing comma, hence the [:-1]
            exonStarts = [int(x) for x in ls[8][:-1].split(',')]
            exonEnds = [int(x) - 1 for x in ls[9][:-1].split(',')]
            exonPairs = list(zip(exonStarts, exonEnds))
            prefix = 'C_' if '_coding' in ls[13] else 'NC_'

            # introns are the gaps between consecutive exons
            intronPairs = [(prev[1] + 1, nxt[0] - 1)
                           for prev, nxt in zip(exonPairs, exonPairs[1:])]

            # Untranslated flanks on either side of the coding region; an
            # empty tuple means the coding region touches that transcript
            # boundary (or there is no coding region at all).
            if cStart == tStart or cStart == tEnd + 1:
                leftFlank = ()
            else:
                leftFlank = (tStart, cStart - 1)
            if cEnd + 1 == tStart or cEnd + 1 == tEnd + 1:
                rightFlank = ()
            else:
                rightFlank = (cEnd + 1, tEnd)

            # on strand '1' the left flank is the 5' UTR, otherwise the 3'
            if strand == '1':
                range5, range3 = leftFlank, rightFlank
            else:
                range5, range3 = rightFlank, leftFlank

            # UTRs are the flanks minus any introns they span
            utr5 = compareData.subtractTwoRanges([range5], intronPairs)
            utr3 = compareData.subtractTwoRanges([range3], intronPairs)

            # exons keep only their non-UTR part
            exonPairs = compareData.subtractTwoRanges(exonPairs, [range5])
            exonPairs = compareData.subtractTwoRanges(exonPairs, [range3])

            segments = (
                ('5UTR', utr5),
                ('EXON', exonPairs),
                ('INTRON', intronPairs),
                ('3UTR', utr3),
            )
            for label, pairs in segments:
                for pair in pairs:
                    # coordinates are shifted by one for the TCC string
                    # (presumably 0-based -> 1-based; confirm with makeTcc)
                    myTcc = bioLibCG.makeTcc(chrom, strand,
                                             pair[0] + 1, pair[1] + 1)
                    fOut.write('\t'.join([str(nextID), prefix + label, myTcc])
                               + '\n')
                    nextID += 1
def getSplicingUnitOccupancy(tranFN, wigDir1, wigDir2, chrom, strand, maxCut):
    """Count coding-exon positions per wig value in two data sets.

    The wig in wigDir1 is loaded for (chrom, strand); the wig in wigDir2 is
    loaded for the opposite strand (the original note says wigDir2 has to be
    the HeLa data set because its strand is flipped).

    tranFN is read as tab-separated transcript records (0-based columns):
    1 = chromosome, 2 = strand, 3/4 = transcript start/end, 5/6 = coding
    start/end, 8/9 = comma-terminated exon starts/ends, 13 = a field
    containing '_coding' for coding transcripts.  Only coding transcripts on
    the requested chrom/strand are used, and each position inside their
    UTR-stripped exons is examined once even if several transcripts share it.

    For every cut in 1 .. maxCut - 1 one tab-separated line is printed:
    positions whose value equals cut in set 1, in set 2, in both, then
    "chrom:strand" and the cut itself.  Returns None.
    """
    maxCut = int(maxCut)

    oppStrand = bioLibCG.switchStrand(strand)
    coord_value1 = cgWig.loadSingleWig(wigDir1, chrom, strand, "ALL")
    coord_value2 = cgWig.loadSingleWig(wigDir2, chrom, oppStrand, "ALL")

    # cut -> [num1, num2, numOverlap]
    cutoff_overlap = dict((i, [0, 0, 0]) for i in range(maxCut))
    cuts = range(1, maxCut)  # built once instead of once per position
    covered = set()

    with open(tranFN, "r") as f:
        for line in f:
            ls = line.strip().split("\t")
            tChrom, tStrand = ls[1], bioLibCG.switchStrandFormat(ls[2])
            if tChrom != chrom or tStrand != strand:
                continue
            tStart, tEnd = int(ls[3]), int(ls[4]) - 1
            cStart, cEnd = int(ls[5]), int(ls[6]) - 1
            # the exon columns end with a trailing comma, hence the [:-1]
            exonStarts = [int(x) for x in ls[8][:-1].split(",")]
            exonEnds = [int(x) - 1 for x in ls[9][:-1].split(",")]
            exonPairs = list(zip(exonStarts, exonEnds))

            # only coding transcripts are counted
            if "_coding" not in ls[13]:
                continue

            # Untranslated flanks on either side of the coding region; an
            # empty tuple means the coding region touches that transcript
            # boundary (or there is no coding region at all).
            if cStart == tStart or cStart == tEnd + 1:
                leftFlank = ()
            else:
                leftFlank = (tStart, cStart - 1)
            if cEnd + 1 == tStart or cEnd + 1 == tEnd + 1:
                rightFlank = ()
            else:
                rightFlank = (cEnd + 1, tEnd)

            # on strand "1" the left flank is the 5' UTR, otherwise the 3'
            if strand == "1":
                range5, range3 = leftFlank, rightFlank
            else:
                range5, range3 = rightFlank, leftFlank

            # strip both UTRs so only the coding part of each exon remains
            exonPairs = compareData.subtractTwoRanges(exonPairs, [range5])
            exonPairs = compareData.subtractTwoRanges(exonPairs, [range3])

            for pair in exonPairs:
                for i in xrange(pair[0], pair[1] + 1):
                    if i in covered:
                        continue  # multiple transcripts will have same exons
                    covered.add(i)
                    val1 = coord_value1.get(i, 0)
                    val2 = coord_value2.get(i, 0)

                    for cut in cuts:
                        # exact match on the value, not a ">= cut" threshold
                        in1 = val1 == cut
                        in2 = val2 == cut
                        counts = cutoff_overlap[cut]
                        if in1:
                            counts[0] += 1
                        if in2:
                            counts[1] += 1
                        if in1 and in2:
                            counts[2] += 1

    for cut in cuts:
        cutoff_overlap[cut].extend(["%s:%s" % (chrom, strand), cut])
        print("\t".join([str(x) for x in cutoff_overlap[cut]]))
def makeContextDB(tranFN, chrom, strand, outFN):
    '''Output "cID TYPE TCC" lines (tab-separated), used for making wigs.

    tranFN is read as tab-separated transcript records (0-based columns):
    0 = transcript ID, 1 = chromosome, 2 = strand, 3/4 = transcript
    start/end, 5/6 = coding start/end, 8/9 = comma-terminated exon
    starts/ends, 13 = a field containing '_coding' for coding transcripts.
    Records that are not on the requested chrom/strand are skipped.

    Per record, segments are emitted in the order 5' UTR, exons (with the
    UTR parts removed), introns, 3' UTR.  TYPE is 'C_' + name for coding
    transcripts and 'NC_' + name otherwise.  cID counts up from 0 across
    the whole output file.

    Returns None.  Input and output files are closed on every exit path.
    '''
    segmentID = 0
    with open(tranFN, 'r') as f, open(outFN, 'w') as fOut:
        for line in f:
            ls = line.strip().split('\t')
            tChrom, tStrand = ls[1], bioLibCG.switchStrandFormat(ls[2])
            if tChrom != chrom or tStrand != strand:
                continue
            tStart, tEnd = int(ls[3]), int(ls[4]) - 1
            cStart, cEnd = int(ls[5]), int(ls[6]) - 1
            # the exon columns end with a trailing comma, hence the [:-1]
            exonStarts = [int(x) for x in ls[8][:-1].split(',')]
            exonEnds = [int(x) - 1 for x in ls[9][:-1].split(',')]
            exonPairs = list(zip(exonStarts, exonEnds))
            codingStatus = '_coding' in ls[13]

            # an intron runs from just after one exon to just before the next
            intronPairs = []
            for n in range(1, len(exonPairs)):
                intronPairs.append((exonPairs[n - 1][1] + 1,
                                    exonPairs[n][0] - 1))

            # Flanks outside the coding region; () when the coding region
            # reaches the transcript boundary or is empty.
            noLeft = cStart == tStart or cStart == tEnd + 1
            noRight = cEnd + 1 == tStart or cEnd + 1 == tEnd + 1
            leftFlank = () if noLeft else (tStart, cStart - 1)
            rightFlank = () if noRight else (cEnd + 1, tEnd)

            # take care of messy UTRs and assign utr ranges by strand
            if strand == '1':
                range5, range3 = leftFlank, rightFlank
            else:
                range5, range3 = rightFlank, leftFlank

            utr5 = compareData.subtractTwoRanges([range5], intronPairs)
            utr3 = compareData.subtractTwoRanges([range3], intronPairs)

            exonPairs = compareData.subtractTwoRanges(exonPairs, [range5])
            exonPairs = compareData.subtractTwoRanges(exonPairs, [range3])

            typePrefix = 'C_' if codingStatus else 'NC_'
            for typeName, pairs in (('5UTR', utr5),
                                    ('EXON', exonPairs),
                                    ('INTRON', intronPairs),
                                    ('3UTR', utr3)):
                for pair in pairs:
                    # both ends are shifted by one before building the TCC
                    # (presumably 0-based -> 1-based; confirm with makeTcc)
                    myTcc = bioLibCG.makeTcc(chrom, strand,
                                             pair[0] + 1, pair[1] + 1)
                    pString = [str(segmentID), typePrefix + typeName, myTcc]
                    fOut.write('\t'.join(pString) + '\n')
                    segmentID += 1
Beispiel #5
0
def getSplicingUnitLengths(tranFN, wigDir, chrom, strand):
    '''Print the total coding-exon and intron footprint of all transcripts.

    tranFN is read as tab-separated transcript records (0-based columns):
    1 = chromosome, 2 = strand, 3/4 = transcript start/end, 5/6 = coding
    start/end, 8/9 = comma-terminated exon starts/ends, 13 = a field
    containing '_coding' for coding transcripts.  End columns are
    decremented by one so every range is inclusive.

    All records are used regardless of chromosome and strand; the wigDir,
    chrom and strand arguments are kept for interface compatibility only.
    Non-coding transcripts are skipped.  Overlapping exons are collapsed:
    each base is counted once per chromosome/strand.  Intron coordinates
    are not collected, so the intron total printed is always 0.

    Returns None; the two totals are printed to stdout.
    '''
    exonChr_strand_coord = {}
    # Never filled: intron accumulation is disabled.
    intronChr_strand_coord = {}

    with open(tranFN, 'r') as f:
        for line in f:
            ls = line.strip().split('\t')
            tChrom, tStrand = ls[1], bioLibCG.switchStrandFormat(ls[2])
            tStart, tEnd = int(ls[3]), int(ls[4]) - 1
            cStart, cEnd = int(ls[5]), int(ls[6]) - 1
            # the exon columns end with a trailing comma, hence the [:-1]
            exonStarts = [int(x) for x in ls[8][:-1].split(',')]
            exonEnds = [int(x) - 1 for x in ls[9][:-1].split(',')]
            exonPairs = list(zip(exonStarts, exonEnds))

            if '_coding' not in ls[13]:
                continue  # only coding transcripts are measured

            # Flanks outside the coding region; () when the coding region
            # reaches the transcript boundary or is empty.
            noLeft = cStart == tStart or cStart == tEnd + 1
            noRight = cEnd + 1 == tStart or cEnd + 1 == tEnd + 1
            leftFlank = () if noLeft else (tStart, cStart - 1)
            rightFlank = () if noRight else (cEnd + 1, tEnd)

            # 5'/3' assignment follows the transcript's own strand, because
            # records of both strands pass through this function.
            if tStrand == '1':
                range5, range3 = leftFlank, rightFlank
            else:
                range5, range3 = rightFlank, leftFlank

            # remove the UTR portions from the exons
            exonPairs = compareData.subtractTwoRanges(exonPairs, [range5])
            exonPairs = compareData.subtractTwoRanges(exonPairs, [range3])

            bases = exonChr_strand_coord.setdefault(
                tChrom, {}).setdefault(tStrand, set())
            for pair in exonPairs:
                bases.update(xrange(pair[0], pair[1] + 1))

    iLength = 0
    for byStrand in intronChr_strand_coord.values():
        for bases in byStrand.values():
            iLength += len(bases)

    eLength = 0
    for byStrand in exonChr_strand_coord.values():
        for bases in byStrand.values():
            eLength += len(bases)

    print('total Exon Length (all exons overlapped) %s' % eLength)
    print('total intron Length (all introns overlapped) %s' % iLength)
def makeContextWig(tranFN, wigDir, chrom, strand, species = 'hg19'):
    '''Annotate every transcribed position on chrom/strand and write a wig.

    tranFN is read as tab-separated transcript records (0-based columns):
    0 = transcript ID, 1 = chromosome, 2 = strand, 3/4 = transcript
    start/end, 5/6 = coding start/end, 8/9 = comma-terminated exon
    starts/ends, 13 = a field containing '_coding' for coding transcripts.
    Only records on the requested chrom/strand are used.

    Each position inside a 5' UTR, exon (UTR removed), intron or 3' UTR is
    tagged with the segment type, prefixed 'C_' for coding and 'NC_' for
    non-coding transcripts.  Tags from all transcripts covering a position
    are de-duplicated and comma-joined, then the coordinate -> tags mapping
    is handed to writeWigDictToWig with the name 'context'.

    Returns None.
    '''
    p = bioLibCG.cgPrint()
    coord_id = {}
    # position for which extra debug messages are emitted through p
    debugSpot = 23631989

    with open(tranFN, 'r') as f:
        for line in f:
            ls = line.strip().split('\t')
            tChrom, tStrand = ls[1], bioLibCG.switchStrandFormat(ls[2])
            if tChrom != chrom or tStrand != strand:
                continue
            tStart, tEnd = int(ls[3]), int(ls[4]) - 1
            cStart, cEnd = int(ls[5]), int(ls[6]) - 1
            # the exon columns end with a trailing comma, hence the [:-1]
            exonStarts = [int(x) for x in ls[8][:-1].split(',')]
            exonEnds = [int(x) - 1 for x in ls[9][:-1].split(',')]
            exonPairs = list(zip(exonStarts, exonEnds))
            codingStatus = '_coding' in ls[13]
            tID = ls[0]

            #debug
            p.show = False

            # introns are the gaps between consecutive exons
            intronPairs = [(prev[1] + 1, nxt[0] - 1)
                           for prev, nxt in zip(exonPairs, exonPairs[1:])]

            # Untranslated flanks on either side of the coding region; an
            # empty tuple means the coding region touches that transcript
            # boundary (or there is no coding region at all).
            if cStart == tStart or cStart == tEnd + 1:
                leftFlank = ()
            else:
                leftFlank = (tStart, cStart - 1)
            if cEnd + 1 == tStart or cEnd + 1 == tEnd + 1:
                rightFlank = ()
            else:
                rightFlank = (cEnd + 1, tEnd)

            # on strand '1' the left flank is the 5' UTR, otherwise the 3'
            if strand == '1':
                range5, range3 = leftFlank, rightFlank
            else:
                range5, range3 = rightFlank, leftFlank

            if not range5:
                p.tell('5 is none')
            if not range3:
                p.tell('3 is none')

            p.tell('ranges', range5, range3)
            p.tell('intronRange', intronPairs)
            # UTRs are the flanks minus any introns they span
            utr5 = compareData.subtractTwoRanges([range5], intronPairs)
            utr3 = compareData.subtractTwoRanges([range3], intronPairs)

            p.tell('utr', utr5, utr3)

            p.tell('exon before', exonPairs)
            exonPairs = compareData.subtractTwoRanges(exonPairs, [range5])
            exonPairs = compareData.subtractTwoRanges(exonPairs, [range3])
            p.tell('exon after', exonPairs)

            prefix = 'C_' if codingStatus else 'NC_'
            # (ranges, fill message, debug message, tag incl. separator)
            fills = (
                (utr5, 'filling utr5', '*** 5UTR', '5UTR '),
                (exonPairs, 'filling exons', '*** exon', 'EXON '),
                (intronPairs, 'filling introns', '*** INTRON', 'INTRON '),
                (utr3, 'filling utr3', ' *** 3UTR', '3UTR '),
            )
            for pairs, fillMsg, debugMsg, tag in fills:
                label = prefix + tag
                for pair in pairs:
                    p.tell(fillMsg, pair[0], pair[1])
                    for i in xrange(pair[0], pair[1] + 1):
                        if i == debugSpot:
                            p.tell(debugMsg, codingStatus, tID)
                        # tags are appended space-separated; duplicates
                        # are removed after all records have been read
                        coord_id[i] = coord_id.get(i, '') + label

            p.show = False

    #uniqify, stringify
    for i in coord_id:
        coord_id[i] = ','.join(set(coord_id[i].strip().split(' ')))

    #write wig to file
    writeWigDictToWig(coord_id, chrom, strand, species, 'context', wigDir, 'INTER')
Beispiel #7
0
def getSplicingUnitOccupancy(tranFN, wigDir1, wigDir2, chrom, strand, maxCut):
    '''Get the number of spots in each data set, and the number that overlap.

    The wig in wigDir1 is loaded for (chrom, strand) and the wig in wigDir2
    for the opposite strand (the original note says wigDir2 has to be the
    HeLa data set because its strand is flipped).

    tranFN is read as tab-separated transcript records (0-based columns):
    1 = chromosome, 2 = strand, 3/4 = transcript start/end, 5/6 = coding
    start/end, 8/9 = comma-terminated exon starts/ends, 13 = a field
    containing '_coding' for coding transcripts.  Only coding transcripts on
    the requested chrom/strand are used; every position of their
    UTR-stripped exons is examined once, even when transcripts overlap.

    For each cut in 1 .. maxCut - 1 a tab-separated line is printed with:
    the number of positions whose value equals cut in set 1, in set 2, in
    both sets, then "chrom:strand" and the cut.  Returns None.
    '''
    maxCut = int(maxCut)

    oppStrand = bioLibCG.switchStrand(strand)
    coord_value1 = cgWig.loadSingleWig(wigDir1, chrom, strand, 'ALL')
    coord_value2 = cgWig.loadSingleWig(wigDir2, chrom, oppStrand, 'ALL')

    # 0, 0, 0 = num1, num2, numOverlap
    cutoff_overlap = dict((i, [0, 0, 0]) for i in range(maxCut))
    cutValues = range(1, maxCut)  # hoisted: identical for every position
    covered = set()

    with open(tranFN, 'r') as f:
        for line in f:
            ls = line.strip().split('\t')
            tChrom, tStrand = ls[1], bioLibCG.switchStrandFormat(ls[2])
            if tChrom != chrom or tStrand != strand:
                continue
            tStart, tEnd = int(ls[3]), int(ls[4]) - 1
            cStart, cEnd = int(ls[5]), int(ls[6]) - 1
            # the exon columns end with a trailing comma, hence the [:-1]
            exonStarts = [int(x) for x in ls[8][:-1].split(',')]
            exonEnds = [int(x) - 1 for x in ls[9][:-1].split(',')]
            exonPairs = list(zip(exonStarts, exonEnds))

            if '_coding' not in ls[13]:
                continue  # non-coding transcripts are not counted

            # Flanks outside the coding region; () when the coding region
            # reaches the transcript boundary or is empty.
            noLeft = cStart == tStart or cStart == tEnd + 1
            noRight = cEnd + 1 == tStart or cEnd + 1 == tEnd + 1
            leftFlank = () if noLeft else (tStart, cStart - 1)
            rightFlank = () if noRight else (cEnd + 1, tEnd)

            # take care of messy UTRs and assign utr ranges by strand
            if strand == '1':
                range5, range3 = leftFlank, rightFlank
            else:
                range5, range3 = rightFlank, leftFlank

            # remove the UTR portions from the exons
            exonPairs = compareData.subtractTwoRanges(exonPairs, [range5])
            exonPairs = compareData.subtractTwoRanges(exonPairs, [range3])

            for pair in exonPairs:
                for i in xrange(pair[0], pair[1] + 1):
                    if i in covered:
                        continue  # multiple transcripts will have same exons
                    covered.add(i)
                    val1 = coord_value1.get(i, 0)
                    val2 = coord_value2.get(i, 0)

                    for cut in cutValues:
                        # exact match on the value, not a ">= cut" threshold
                        in1 = (val1 == cut)
                        in2 = (val2 == cut)

                        if in1 and in2:
                            cutoff_overlap[cut][2] += 1

                        if in1:
                            cutoff_overlap[cut][0] += 1

                        if in2:
                            cutoff_overlap[cut][1] += 1

    for cut in cutValues:
        cutoff_overlap[cut].extend(['%s:%s' % (chrom, strand), cut])
        print('\t'.join([str(x) for x in cutoff_overlap[cut]]))
def createCollapsedGeneSets(tranFN, outFN, acceptableTypes = 'EXON', onlyCoding = True):
    '''Collapse the regions of all transcripts of a gene into one interval set.

    Reads the tab-separated transcript table tranFN.  For every gene name
    (column 10) the regions of the requested types are added to a single
    IntervalSet, and one line per gene is written to outFN:

        geneName <tab> chrom <tab> strand <tab> starts <tab> ends

    where starts/ends are the comma-joined lower/upper bounds of the
    intervals in the gene's set.

    tranFN          -- path of the transcript table to read
    outFN           -- path of the output file (opened with mode 'w')
    acceptableTypes -- comma-separated region types to keep, chosen from
                       EXON, INTRON, 5UTR, 3UTR (whitespace around the
                       names is ignored)
    onlyCoding      -- if true, transcripts whose column 13 does not contain
                       '_coding' contribute no intervals

    A gene whose transcripts carry more than one (chrom, strand) combination
    is removed from the result; if it had collected intervals a line ending
    in FAILED is printed for it.
    '''

    acceptableTypes = [t.strip() for t in acceptableTypes.split(',')]

    geneName_intervalSet = {}
    geneName_info = {}
    with open(tranFN, 'r') as f:
        for line in f:
            ls = line.strip().split('\t')
            tChrom, tStrand = ls[1], bioLibCG.switchStrandFormat(ls[2])
            # end coordinates are shifted by -1 so every range is inclusive
            tStart, tEnd = int(ls[3]), int(ls[4]) - 1
            cStart, cEnd = int(ls[5]), int(ls[6]) - 1
            # [:-1] drops the last character of the list column
            # (presumably a trailing comma -- confirm against the table format)
            exonStarts = [int(x) for x in ls[8][:-1].split(',')]
            exonEnds = [int(x) - 1 for x in ls[9][:-1].split(',')]
            exonPairs = list(zip(exonStarts, exonEnds))
            codingStatus = '_coding' in ls[13]
            geneName = ls[10]

            # an intron is the gap between two consecutive exons
            intronPairs = [(exonPairs[k - 1][1] + 1, exonPairs[k][0] - 1)
                           for k in range(1, len(exonPairs))]

            # Untranslated stretch on each side of the coding region; it is
            # empty when the coding boundary coincides with a transcript
            # boundary (or sits just past the transcript end).
            if cStart == tStart or cStart == tEnd + 1:
                lowSide = ()
            else:
                lowSide = (tStart, cStart - 1)

            if cEnd + 1 == tStart or cEnd + 1 == tEnd + 1:
                highSide = ()
            else:
                highSide = (cEnd + 1, tEnd)

            # strand '1' has its 5' UTR on the low-coordinate side,
            # any other strand value has it on the high-coordinate side
            if tStrand == '1':
                range5, range3 = lowSide, highSide
            else:
                range5, range3 = highSide, lowSide

            utr5 = compareData.subtractTwoRanges([range5], intronPairs)
            utr3 = compareData.subtractTwoRanges([range3], intronPairs)

            # exons keep only the part outside both UTR ranges
            exonPairs = compareData.subtractTwoRanges(exonPairs, [range5])
            exonPairs = compareData.subtractTwoRanges(exonPairs, [range3])

            # recorded for every transcript, coding or not
            geneName_info.setdefault(geneName, set()).add((tChrom, tStrand))

            if onlyCoding and not codingStatus:
                continue

            pairs__type = [ (exonPairs, 'EXON'), (intronPairs, 'INTRON'), (utr5, '5UTR'), (utr3, '3UTR') ]
            for pairs, regionType in pairs__type:
                if regionType not in acceptableTypes:
                    continue

                for pair in pairs:
                    #create geneset if it does not exist
                    if geneName not in geneName_intervalSet:
                        geneName_intervalSet[geneName] = IntervalSet()

                    # inclusive end goes back to end + 1 for the Interval
                    geneName_intervalSet[geneName].add(Interval(pair[0], pair[1] + 1))

    for geneName, info in geneName_info.iteritems():
        # a gene that spans different chromosomes/strands is dropped
        if len(info) > 1 and geneName in geneName_intervalSet:
            del geneName_intervalSet[geneName]
            print('%s %s FAILED' % (geneName, info))

    with open(outFN, 'w') as fOut:
        for geneName, iSet in geneName_intervalSet.iteritems():
            bounds = [(interv.lower_bound, interv.upper_bound) for interv in iSet]
            gStarts = ','.join([str(low) for low, high in bounds])
            gEnds = ','.join([str(high) for low, high in bounds])

            # every surviving gene has exactly one (chrom, strand) entry
            chrom, strand = next(iter(geneName_info[geneName]))
            outString = [geneName, chrom, strand, gStarts, gEnds]
            fOut.write('\t'.join([str(x) for x in outString]) + '\n')
# Example #9
0
def makeContextWig(tranFN, wigDir, chrom, strand, species='hg19'):
    '''Label every transcribed coordinate of one chrom/strand and write a wig.

    Reads the tab-separated transcript table tranFN, keeps the transcripts
    located on chrom/strand and tags each covered coordinate with the region
    it falls in: 5UTR, EXON, INTRON or 3UTR, prefixed with C_ for transcripts
    whose column 13 contains '_coding' and NC_ otherwise.  A coordinate
    covered by several transcripts collects all distinct tags, joined by ','.
    The coordinate -> tags dictionary is handed to writeWigDictToWig together
    with chrom, strand, species, the name 'context', wigDir and 'INTER'
    (presumably the label for coordinates without any tag -- confirm in
    writeWigDictToWig).

    tranFN  -- path of the transcript table to read
    wigDir  -- passed through to writeWigDictToWig
    chrom   -- chromosome to process (compared with column 1)
    strand  -- strand to process, in the format returned by
               bioLibCG.switchStrandFormat
    species -- passed through to writeWigDictToWig
    '''

    p = bioLibCG.cgPrint()
    coord_id = {}

    # coordinate that is reported through p.tell whenever it gets tagged
    debugSpot = 23631989

    with open(tranFN, 'r') as f:
        for line in f:
            ls = line.strip().split('\t')
            tChrom, tStrand = ls[1], bioLibCG.switchStrandFormat(ls[2])
            if tChrom != chrom or tStrand != strand:
                continue
            # end coordinates are shifted by -1 so every range is inclusive
            tStart, tEnd = int(ls[3]), int(ls[4]) - 1
            cStart, cEnd = int(ls[5]), int(ls[6]) - 1
            # [:-1] drops the last character of the list column
            # (presumably a trailing comma -- confirm against the table format)
            exonStarts = [int(x) for x in ls[8][:-1].split(',')]
            exonEnds = [int(x) - 1 for x in ls[9][:-1].split(',')]
            exonPairs = list(zip(exonStarts, exonEnds))
            codingStatus = '_coding' in ls[13]
            tID = ls[0]

            #debug
            p.show = False

            # an intron is the gap between two consecutive exons
            intronPairs = [(exonPairs[k - 1][1] + 1, exonPairs[k][0] - 1)
                           for k in range(1, len(exonPairs))]

            #p.tell(tStart, tEnd, cStart, cEnd, exonPairs, intronPairs)

            # Untranslated stretch on each side of the coding region; it is
            # empty when the coding boundary coincides with a transcript
            # boundary (or sits just past the transcript end).
            if cStart == tStart or cStart == tEnd + 1:
                lowSide = ()
            else:
                lowSide = (tStart, cStart - 1)

            if cEnd + 1 == tStart or cEnd + 1 == tEnd + 1:
                highSide = ()
            else:
                highSide = (cEnd + 1, tEnd)

            # strand '1' has its 5' UTR on the low-coordinate side,
            # any other strand value has it on the high-coordinate side
            if strand == '1':
                range5, range3 = lowSide, highSide
            else:
                range5, range3 = highSide, lowSide

            if not range5:
                p.tell('5 is none')
            if not range3:
                p.tell('3 is none')

            p.tell('ranges', range5, range3)
            p.tell('intronRange', intronPairs)
            utr5 = compareData.subtractTwoRanges([range5], intronPairs)
            utr3 = compareData.subtractTwoRanges([range3], intronPairs)

            p.tell('utr', utr5, utr3)

            # exons keep only the part outside both UTR ranges
            p.tell('exon before', exonPairs)
            exonPairs = compareData.subtractTwoRanges(exonPairs, [range5])
            exonPairs = compareData.subtractTwoRanges(exonPairs, [range3])
            p.tell('exon after', exonPairs)

            prefix = 'C_' if codingStatus else 'NC_'

            # (ranges, message when filling, message at debugSpot, tag name);
            # the order fixes the order in which tags are appended
            regions = [
                (utr5, 'filling utr5', '*** 5UTR', '5UTR'),
                (exonPairs, 'filling exons', '*** exon', 'EXON'),
                (intronPairs, 'filling introns', '*** INTRON', 'INTRON'),
                (utr3, 'filling utr3', ' *** 3UTR', '3UTR'),
            ]
            for pairs, fillMsg, debugMsg, regionName in regions:
                # tags are space-terminated so they can be split apart later
                label = prefix + regionName + ' '
                for pair in pairs:
                    p.tell(fillMsg, pair[0], pair[1])
                    for i in xrange(pair[0], pair[1] + 1):
                        if i == debugSpot: p.tell(debugMsg, codingStatus, tID)
                        coord_id[i] = coord_id.get(i, '') + label

            p.show = False

    #uniqify, stringify (only values change, so iterating the dict is safe)
    for i, ids in coord_id.iteritems():
        coord_id[i] = ','.join(set(ids.strip().split(' ')))

    #p.tell('finalInfo', utr5, exonPairs, utr3)
    #write wig to file
    writeWigDictToWig(coord_id, chrom, strand, species, 'context', wigDir,
                      'INTER')