예제 #1
0
def byGene(geneSpanFN, wigDir1, wigDir2, chrom, strand, outFN, simulation = False):
    '''Correlate sliding-window signal between two wig data sets, gene by gene.

    Each line of geneSpanFN is tab-separated: gene name, chrom, strand,
    comma-separated span starts, comma-separated span ends.  Lines that are
    not on (chrom, strand) are skipped.  For every remaining gene the span
    returned by fullSpanFromPairs() is walked with a 10-element window moved
    2 elements at a time, and binAvg() is recorded per window for each data
    set.

    One tab-separated line per gene is written to outFN:
    gene name, first element of pStats.pearsonr's result, bin values of data
    set 1 (comma-joined), bin values of data set 2 (comma-joined),
    'chrom:strand:<first span element>', mean of the two bin sums, and the
    first value returned by pStats.spearmanr.

    wigDir2 is loaded on the opposite strand, so the strand-flipped data set
    (HeLa, per the original note) must be passed second.  When simulation is
    true, the span is run through mixSpanByBin() separately for each data set
    before binning.
    '''
    strand = str(strand) #undo autocast

    print('loading wigs')
    oppStrand = bioLibCG.switchStrand(strand)
    coord_value1 = cgWig.loadSingleWig(wigDir1, chrom, strand, 'ALL')
    coord_value2 = cgWig.loadSingleWig(wigDir2, chrom, oppStrand, 'ALL')

    # window size and step are the same for every gene
    frameLength = 10
    skipAmount = 2

    print('calculating bin values')
    # 'with' guarantees both handles are closed even if a line fails to parse
    with open(geneSpanFN, 'r') as f:
        with open(outFN, 'w') as fOut:
            for line in f:
                ls = line.strip().split('\t')
                sChrom, sStrand = ls[1], ls[2]
                if sChrom != chrom or sStrand != strand:
                    continue
                geneName = ls[0]
                geneStarts = [int(x) for x in ls[3].split(',')]
                geneEnds = [int(x) for x in ls[4].split(',')]
                spanPairs = zip(geneStarts, geneEnds)

                theSpan = fullSpanFromPairs(spanPairs)
                spanLength = len(theSpan)
                # every window start i such that i + frameLength <= spanLength
                windowStarts = range(0, spanLength - frameLength + 1, skipAmount)

                binAvgs1 = []
                binAvgs2 = []
                for theBinAvg, theCoord_Val in [(binAvgs1, coord_value1), (binAvgs2, coord_value2)]:
                    #mix up bins if simulation (done independently per data set)
                    if simulation:
                        newSpan = mixSpanByBin(theSpan, frameLength)
                    else:
                        newSpan = theSpan

                    for i in windowStarts:
                        binNums = newSpan[i:(i + frameLength)]
                        theBinAvg.append(binAvg(theCoord_Val, binNums))

                # NOTE(review): the previous revision built a copy of the bin
                # pairs with the (0, 0) pairs removed "for correlation" but
                # never used it; both correlations are (and were) computed
                # over ALL bins. Confirm that this is the intended input.
                dataLoad = float(sum(binAvgs1) + sum(binAvgs2))/2
                pcc = pStats.pearsonr(binAvgs1, binAvgs2)
                scc, pVal = pStats.spearmanr(binAvgs1, binAvgs2)
                outString = [geneName, pcc[0], ','.join([str(x) for x in binAvgs1]), ','.join([str(x) for x in binAvgs2]), '%s:%s:%s' % (sChrom, sStrand, theSpan[0]), dataLoad, scc]
                fOut.write('\t'.join([str(x) for x in outString]) + '\n')
예제 #2
0
def getPlotData(aSites, wigDir, outFN):
    '''Collect wig values at offsets -200..+200 around each site end.

    aSites is a tab-separated file whose first column is a tcc string that
    bioLibCG.tccSplit() turns into (chrom, strand, start, end).  For every
    chromosome in bioLibCG.humanChromosomes and both strands ('1', '-1') the
    'ALL' wig is loaded from wigDir, and for each site on that chrom/strand
    the value at end + offset (0 when absent) is recorded for every offset.

    outFN receives 401 lines, one per offset starting at -200; each line is
    the tab-joined list of values seen at that offset, ordered by
    chromosome, then strand, then position of the site in aSites.
    '''
    #load and init
    spreadRange = range(-200, 201) #200 +/- ... might want to check distance each AAUAAA is from each other
    relCoord_degVals = dict( (i, []) for i in spreadRange )

    # Parse the site file once and group the site ends by (chrom, strand),
    # instead of re-reading the whole file for every chrom/strand pair.
    # File order is kept inside each group so the output order is unchanged.
    chromStrand_ends = {}
    with open(aSites, 'r') as f:
        for line in f:
            ls = line.strip().split('\t')
            ichrom, istrand, _start, end = bioLibCG.tccSplit(ls[0])
            chromStrand_ends.setdefault((ichrom, istrand), []).append(end)

    for chrom in bioLibCG.humanChromosomes:
        for strand in ('1', '-1'):
            print('%s %s' % (chrom, strand))
            coord_value = cgWig.loadSingleWig(wigDir, chrom, strand, 'ALL')
            for end in chromStrand_ends.get((chrom, strand), ()):
                for i in spreadRange:
                    relCoord_degVals[i].append(coord_value.get(end + i, 0))

    #output box data
    #each row is a histogram of spread position (e.g., first row is -200)
    outLines = ['\t'.join([str(x) for x in relCoord_degVals[i]]) + '\n' for i in spreadRange]
    with open(outFN, 'w') as f:
        f.writelines(outLines)
예제 #3
0
def test(tcc, wigDir):
    '''Print the 'ALL' wig value at every position covered by tcc.

    tcc is split with bioLibCG.tccSplit() into chrom, strand, start, end;
    one "position value" line is printed for each position from start to
    end inclusive, using 0 where the wig has no entry.
    '''
    chrom, strand, start, end = bioLibCG.tccSplit(tcc)
    print('%s %s' % (chrom, strand))
    coord_eLevel = cgWig.loadSingleWig(wigDir, chrom, strand, 'ALL')

    # (a sorted copy of every key used to be built here and never read)
    for i in range(start, end + 1):
        print('%s %s' % (i, coord_eLevel.get(i, 0)))
def updateRepeatStatus(fN, fF, wigDir, chrom, strand):
    '''Set the repeat flag of every record located on (chrom, strand).

    Records are loaded through Nexus(fN, fF) with the 'repeat' and 'tcc'
    attributes.  A record is flagged True when at least one position of its
    tcc span (start..end inclusive) is a key of the 'REPEAT' wig loaded from
    wigDir, and False otherwise.  Records on other chromosomes/strands are
    left untouched.  The Nexus is saved at the end.
    '''
    NX = Nexus(fN, fF)
    NX.load(['repeat', 'tcc'])

    # positions present in this mapping count as repeat positions
    repeatCoords = cgWig.loadSingleWig(wigDir, chrom, strand, 'REPEAT')

    while NX.nextID():
        recChrom, recStrand, start, end = bioLibCG.tccSplit(NX.tcc)
        if recChrom == chrom and recStrand == strand:
            # any() stops at the first repeat position, like the old break
            NX.repeat = any(pos in repeatCoords for pos in range(start, end + 1))

    NX.save()
예제 #5
0
def updateRepeatStatus(oFN, wigDir, chrom, strand, rn=None, tn=None):
    '''Recompute repeatStatus for every peak located on (chrom, strand).

    Peaks are loaded from oFN via cgNexusFlat.Nexus with the degPeak.degPeak
    record type.  A peak gets repeatStatus True when any position of its tcc
    span (start..end inclusive) is a key of the 'REPEAT' wig loaded from
    wigDir, False otherwise; peaks elsewhere are not modified.  The result
    is saved back through the Nexus.  rn and tn are accepted but not used
    inside this function.
    '''
    oNX = cgNexusFlat.Nexus(oFN, degPeak.degPeak)
    oNX.load(['repeatStatus', 'tcc'])

    repeatCoords = cgWig.loadSingleWig(wigDir, chrom, strand, 'REPEAT')

    for oID in oNX.repeatStatus:
        peakChrom, peakStrand, start, end = bioLibCG.tccSplit(oNX.tcc[oID])
        onTarget = (peakChrom == chrom and peakStrand == strand)
        if not onTarget:
            continue

        # short-circuits on the first position found in the repeat wig
        oNX.repeatStatus[oID] = any(pos in repeatCoords for pos in range(start, end + 1))

    oNX.save()
예제 #6
0
def updateRepeatStatus(oFN, wigDir, chrom, strand):
    '''Mark which peaks on (chrom, strand) touch a repeat position.

    Loads 'repeatStatus' and 'tcc' for the cgDegPeak.Peak records stored in
    oFN, loads the 'REPEAT' wig for the requested chrom/strand from wigDir,
    and sets repeatStatus to True for each matching peak whose span
    (start..end inclusive) contains at least one key of that wig, False
    otherwise.  Peaks on other chromosomes/strands keep their value.  The
    Nexus is saved when done.
    '''
    oNX = cgNexusFlat.Nexus(oFN, cgDegPeak.Peak)
    oNX.load(['repeatStatus', 'tcc'])

    repeatPositions = cgWig.loadSingleWig(wigDir, chrom, strand, 'REPEAT')

    for oID in oNX.repeatStatus:
        pChrom, pStrand, start, end = bioLibCG.tccSplit(oNX.tcc[oID])
        if pChrom == chrom and pStrand == strand:
            # generator stops scanning as soon as one repeat position is hit
            touchesRepeat = any(pos in repeatPositions for pos in range(start, end + 1))
            oNX.repeatStatus[oID] = touchesRepeat

    oNX.save()
def getSplicingUnitOccupancy(tranFN, wigDir1, wigDir2, chrom, strand, maxCut):
    """Tally coding-exon positions by exact wig value in two data sets.

    Get the number of spots in each data set, and the number that overlap.
    wigDir2 is loaded on the opposite strand, so it has to be the
    strand-flipped (HeLa, per the original note) data set.

    tranFN is tab-separated.  Columns read here: 1 = chrom, 2 = strand (run
    through bioLibCG.switchStrandFormat), 3/4 = transcript start/end,
    5/6 = coding start/end, 8/9 = comma-terminated exon starts/ends,
    13 = a field that contains '_coding' for coding transcripts.  End
    coordinates are decremented by one before use.

    Only coding transcripts on (chrom, strand) contribute.  Their exons,
    with the 5' and 3' UTR ranges subtracted, are walked base by base; each
    position is counted once even if several transcripts share it.  For
    every level 1..maxCut-1 a line is printed with, tab-separated: number of
    positions whose value equals the level in data set 1, in data set 2, in
    both, then 'chrom:strand' and the level.  Intron and UTR positions are
    not tallied.
    """
    maxCut = int(maxCut)

    oppStrand = bioLibCG.switchStrand(strand)
    coord_value1 = cgWig.loadSingleWig(wigDir1, chrom, strand, "ALL")
    coord_value2 = cgWig.loadSingleWig(wigDir2, chrom, oppStrand, "ALL")

    # 0, 0, 0 = num1, num2, numOverlap
    covered = set()
    cutoff_overlap = dict((i, [0, 0, 0]) for i in range(maxCut))
    cuts = range(1, maxCut)

    # 'with' closes the transcript file, which was previously left open
    with open(tranFN, "r") as f:
        for line in f:
            ls = line.strip().split("\t")
            tChrom, tStrand = ls[1], bioLibCG.switchStrandFormat(ls[2])
            if tChrom != chrom or tStrand != strand:
                continue
            # non-coding transcripts never contributed to the tallies, so
            # skip them before doing any per-base work
            if "_coding" not in ls[13]:
                continue

            tStart, tEnd = int(ls[3]), int(ls[4]) - 1
            cStart, cEnd = int(ls[5]), int(ls[6]) - 1
            exonStarts = [int(x) for x in ls[8][:-1].split(",")]
            exonEnds = [int(x) - 1 for x in ls[9][:-1].split(",")]
            exonPairs = zip(exonStarts, exonEnds)

            # take care of messy UTRs: the untranslated range on either side
            # of the CDS is empty when the CDS boundary sits on a transcript
            # boundary
            if cStart == tStart or cStart == tEnd + 1:
                lowSide = ()
            else:
                lowSide = (tStart, cStart - 1)
            if cEnd + 1 == tStart or cEnd + 1 == tEnd + 1:
                highSide = ()
            else:
                highSide = (cEnd + 1, tEnd)

            # on strand "1" the 5'UTR is the low-coordinate side; otherwise
            # the roles are swapped
            if strand == "1":
                range5, range3 = lowSide, highSide
            else:
                range5, range3 = highSide, lowSide

            exonPairs = compareData.subtractTwoRanges(exonPairs, [range5])
            exonPairs = compareData.subtractTwoRanges(exonPairs, [range3])

            for pair in exonPairs:
                for i in xrange(pair[0], pair[1] + 1):
                    if i in covered:
                        continue  # multiple transcripts will have same exons
                    covered.add(i)
                    val1 = coord_value1.get(i, 0)
                    val2 = coord_value2.get(i, 0)

                    # a position counts for a level only when its value is
                    # exactly that level (not "at least" that level)
                    for cut in cuts:
                        in1 = val1 == cut
                        in2 = val2 == cut

                        if in1 and in2:
                            cutoff_overlap[cut][2] += 1

                        if in1:
                            cutoff_overlap[cut][0] += 1

                        if in2:
                            cutoff_overlap[cut][1] += 1

    for i in cuts:
        cutoff_overlap[i].extend(["%s:%s" % (chrom, strand), i])
        pString = "\t".join([str(x) for x in cutoff_overlap[i]])
        print(pString)
예제 #8
0
def getSplicingUnitOccupancy(tranFN, wigDir1, wigDir2, chrom, strand, maxCut):
    '''Tally coding-exon positions by exact wig value in two data sets.

    Get the number of spots in each data set, and the number that overlap.
    wigDir2 is loaded on the opposite strand, so it has to be the
    strand-flipped (HeLa, per the original note) data set.

    tranFN is tab-separated.  Columns read here: 1 = chrom, 2 = strand (run
    through bioLibCG.switchStrandFormat), 3/4 = transcript start/end,
    5/6 = coding start/end, 8/9 = comma-terminated exon starts/ends,
    13 = a field that contains '_coding' for coding transcripts.  End
    coordinates are decremented by one before use.

    Only coding transcripts on (chrom, strand) contribute.  Their exons,
    with the 5' and 3' UTR ranges subtracted, are walked base by base; each
    position is counted once even if several transcripts share it.  For
    every level 1..maxCut-1 a line is printed with, tab-separated: number of
    positions whose value equals the level in data set 1, in data set 2, in
    both, then 'chrom:strand' and the level.  Intron and UTR positions are
    not tallied.
    '''
    maxCut = int(maxCut)

    oppStrand = bioLibCG.switchStrand(strand)
    coord_value1 = cgWig.loadSingleWig(wigDir1, chrom, strand, 'ALL')
    coord_value2 = cgWig.loadSingleWig(wigDir2, chrom, oppStrand, 'ALL')

    # 0, 0, 0 = num1, num2, numOverlap
    covered = set()
    cutoff_overlap = dict( (i, [0, 0, 0]) for i in range(maxCut))
    cuts = range(1, maxCut)

    # 'with' closes the transcript file, which was previously left open
    with open(tranFN, 'r') as f:
        for line in f:
            ls = line.strip().split('\t')
            tChrom, tStrand = ls[1], bioLibCG.switchStrandFormat(ls[2])
            if tChrom != chrom or tStrand != strand:
                continue
            # non-coding transcripts never contributed to the tallies, so
            # skip them before doing any per-base work
            if '_coding' not in ls[13]:
                continue

            tStart, tEnd = int(ls[3]), int(ls[4]) - 1
            cStart, cEnd = int(ls[5]), int(ls[6]) - 1
            exonStarts = [int(x) for x in ls[8][:-1].split(',')]
            exonEnds = [int(x) - 1 for x in ls[9][:-1].split(',')]
            exonPairs = zip(exonStarts, exonEnds)

            #take care of messy UTRs: the untranslated range on either side
            #of the CDS is empty when the CDS boundary sits on a transcript
            #boundary
            if cStart == tStart or cStart == tEnd + 1:
                lowSide = ()
            else:
                lowSide = (tStart, cStart - 1)
            if cEnd + 1 == tStart or cEnd + 1 == tEnd + 1:
                highSide = ()
            else:
                highSide = (cEnd + 1, tEnd)

            #on strand '1' the 5'UTR is the low-coordinate side; otherwise
            #the roles are swapped
            if strand == '1':
                range5, range3 = lowSide, highSide
            else:
                range5, range3 = highSide, lowSide

            exonPairs = compareData.subtractTwoRanges(exonPairs, [range5])
            exonPairs = compareData.subtractTwoRanges(exonPairs, [range3])

            for pair in exonPairs:
                for i in xrange(pair[0], pair[1] + 1):
                    if i in covered:
                        continue #multiple transcripts will have same exons
                    covered.add(i)
                    val1 = coord_value1.get(i, 0)
                    val2 = coord_value2.get(i, 0)

                    #a position counts for a level only when its value is
                    #exactly that level (not "at least" that level)
                    for cut in cuts:
                        in1 = (val1 == cut)
                        in2 = (val2 == cut)

                        if in1 and in2:
                            cutoff_overlap[cut][2] += 1

                        if in1:
                            cutoff_overlap[cut][0] += 1

                        if in2:
                            cutoff_overlap[cut][1] += 1

    for i in cuts:
        cutoff_overlap[i].extend(['%s:%s' % (chrom, strand), i])
        pString = '\t'.join([ str(x) for x in cutoff_overlap[i] ])
        print(pString)
예제 #9
0
def plotAllDegOverlap(inFile, chrom, strand, wigDir1, wigDir2, outDir, withIntrons = False, flipStrand = True):
    '''Plot two wig tracks against each gene model and save one PNG per gene.

    inFile is tab-separated with exactly five columns: gene name, chrom,
    strand, comma-separated exon starts, comma-separated exon ends.  Only
    rows on (chrom, strand) are plotted.  wigDir1 is loaded on strand;
    wigDir2 is loaded on the opposite strand unless flipStrand is false
    (original note: HeLa must be wigDir2 because of the strand flip).

    For each gene the wig values inside the exons (and the introns too when
    withIntrons is true) are collected.  When introns are left out, the
    positions of every exon after the first are shifted left by the running
    sum of iLengths so the exons sit next to each other.  The two tracks are
    drawn with plotGrassTrack(), the gene model with plotGeneTrack(), and
    the figure is written to '<outDir>/<gene>.degOverlapPlot.png'.

    A gene with no wig value in either data set is skipped with a message
    instead of aborting the whole run.
    '''
    oppStrand = strand
    if flipStrand:
        oppStrand = bioLibCG.switchStrand(strand)

    print('loading Wigs %s %s' % (chrom, strand))
    coord_value1 = cgWig.loadSingleWig(wigDir1, chrom, strand, 'ALL')
    coord_value2 = cgWig.loadSingleWig(wigDir2, chrom, oppStrand, 'ALL')

    # 'with' closes the gene file even when a plot raises part-way through
    with open(inFile, 'r') as f:
        for line in f:
            gName, dChrom, dStrand, exonStarts, exonEnds = line.strip().split('\t')
            if dChrom != chrom or dStrand != strand:
                continue
            exonStarts = [int(x) for x in exonStarts.split(',')]
            exonEnds = [int(x) for x in exonEnds.split(',')]
            print('Plotting %s' % gName)

            #create the span info for boxplots (JUST EXONS!!!)
            exons = zip(exonStarts, exonEnds)
            introns = [(prevEnd + 1, nextStart - 1) for prevEnd, nextStart in zip(exonEnds[:-1], exonStarts[1:])]
            # NOTE(review): each value is 2 larger than the inclusive length
            # of the matching entry in introns -- confirm this is what the
            # plotGeneTrack layout expects before changing it
            iLengths = [nextStart - prevEnd + 1 for nextStart, prevEnd in zip(exonStarts[1:], exonEnds[:-1])]

            segments = exons[:]
            if withIntrons:
                segments.extend(introns)
            segments.sort()
            exonSet = set(exons)
            tSpan = [('exon', seg) if seg in exonSet else ('intron', seg) for seg in segments]

            #gather expression data
            c_v = {}
            c_v2 = {}
            for _label, (segStart, segEnd) in tSpan:
                for pos in range(segStart, segEnd + 1):
                    if pos in coord_value1:
                        c_v[pos] = coord_value1[pos]
                    if pos in coord_value2:
                        c_v2[pos] = coord_value2[pos]

            #intron displacement for ONLY EXONS
            if not withIntrons:
                shift = 0
                # the first exon stays in place; exon k moves left by the
                # sum of the first k iLengths
                for (eStart, eEnd), gap in zip(exons[1:], iLengths):
                    shift += gap
                    for j in range(eStart, eEnd + 1):
                        for track in (c_v, c_v2):
                            if j in track:
                                # pop-then-assign also keeps the value when
                                # the shift happens to be zero
                                track[j - shift] = track.pop(j)

            #get overall max
            allValues = c_v.values() + c_v2.values()
            if not allValues:
                # max() of an empty list would raise and end the whole run
                print('Skipping %s (no values in either data set)' % gName)
                continue
            overMax = max(allValues)
            a, b = set(c_v), set(c_v2)
            overlap = a.intersection(b)
            colors_a = ['r' if x in overlap else 'k' for x in sorted(a)]
            colors_b = ['r' if x in overlap else 'k' for x in sorted(b)]

            plotGrassTrack(c_v, [9, 15], manualMax = overMax, flip = False, colors = colors_a)
            plotGrassTrack(c_v2, [-3, 3], manualMax = overMax,flip = True, colors = colors_b)
            xStart = plotGeneTrack(tSpan, 0)

            #labels and axes
            plt.figtext(.05, .5, gName)
            plt.figtext(.05, .62, '0 -')
            plt.figtext(.05, .89, '%s -' % overMax)
            plt.figtext(.05, 1 - .62, '0 -')
            plt.figtext(.05, 1 - .89, '%s -' % overMax)
            plt.ylim(-3,15)
            frame1 = plt.gca()
            frame1.axes.get_yaxis().set_visible(False)
            if dStrand == '1':
                plt.title('Degradome Comparison (5-->3)')
            else:
                plt.title('Degradome Comparison (3-->5)')
            imgName = outDir + '/' + gName + '.degOverlapPlot.png'
            plt.savefig(imgName, bbox_inches='tight', pad_inches=1)
            #plt.show()
            plt.close('all')