def byGene(geneSpanFN, wigDir1, wigDir2, chrom, strand, outFN, simulation = False):
    '''Write sliding-window bin averages and their correlations for each gene.

    Each line of geneSpanFN is tab-separated: gene name, chromosome, strand,
    comma-separated span starts, comma-separated span ends.  Lines whose
    chromosome/strand differ from chrom/strand are skipped.

    wigDir1 is loaded on `strand`, wigDir2 on the opposite strand
    (original note: hela must be 2nd wigDir2 cuz strand flip).

    For every gene, windows of 10 span positions, stepped by 2, are averaged
    with binAvg() for each data set.  One tab-separated line is written to
    outFN per gene:
        geneName, pearson r, bin averages 1 (comma-joined),
        bin averages 2 (comma-joined), chrom:strand:firstSpanCoord,
        mean of the two bin-average sums, spearman coefficient

    If simulation is true the span is passed through mixSpanByBin() before
    windowing, once per data set.
    '''
    strand = str(strand) #undo autocast

    # window geometry is the same for every gene
    frameLength = 10
    skipAmount = 2

    print('loading wigs')
    oppStrand = bioLibCG.switchStrand(strand)
    coord_value1 = cgWig.loadSingleWig(wigDir1, chrom, strand, 'ALL')
    coord_value2 = cgWig.loadSingleWig(wigDir2, chrom, oppStrand, 'ALL')

    print('calculating bin values')
    # context managers guarantee both files are closed even if a line fails
    with open(geneSpanFN, 'r') as f:
        with open(outFN, 'w') as fOut:
            for line in f:
                ls = line.strip().split('\t')
                sChrom, sStrand = ls[1], ls[2]
                if sChrom != chrom or sStrand != strand:
                    continue
                geneName = ls[0]
                geneStarts = [int(x) for x in ls[3].split(',')]
                geneEnds = [int(x) for x in ls[4].split(',')]
                spanPairs = zip(geneStarts, geneEnds)

                theSpan = fullSpanFromPairs(spanPairs)
                spanLength = len(theSpan)

                binAvgs1 = []
                binAvgs2 = []

                for theBinAvg, theCoord_Val in [(binAvgs1, coord_value1), (binAvgs2, coord_value2)]:
                    #mix up bins if simulation
                    if simulation:
                        newSpan = mixSpanByBin(theSpan, frameLength)
                    else:
                        newSpan = theSpan

                    # every window start whose full frame fits inside the span
                    for i in range(0, spanLength - frameLength + 1, skipAmount):
                        binNums = newSpan[i:(i + frameLength)]
                        theBinAvg.append(binAvg(theCoord_Val, binNums))

                # NOTE(review): an earlier revision built lists with the (0, 0)
                # bin pairs removed "for correlation" but never passed them to
                # the statistics below; that dead code was dropped.  Confirm
                # whether the correlations were meant to exclude (0, 0) pairs.
                dataLoad = float(sum(binAvgs1) + sum(binAvgs2))/2
                pcc = pStats.pearsonr(binAvgs1, binAvgs2)
                scc, pVal = pStats.spearmanr(binAvgs1, binAvgs2)
                outString = [geneName,
                             pcc[0],
                             ','.join([str(x) for x in binAvgs1]),
                             ','.join([str(x) for x in binAvgs2]),
                             '%s:%s:%s' % (sChrom, sStrand, theSpan[0]),
                             dataLoad,
                             scc]
                fOut.write('\t'.join([str(x) for x in outString]) + '\n')
예제 #2
0
def makeHitMapDegPeak(dFN, switchStrand = False):
    '''Return {chrom: {strand: set(coords)}} for every coordinate covered by
    an entry of dFN.

    dFN is loaded through cgNexusFlat.Nexus with the cgDegPeak.Peak layout;
    each entry's tcc is split into chrom/strand/start/end and every position
    from start to end (inclusive) is recorded.  With switchStrand set, the
    positions are filed under the opposite strand.
    '''
    peaks = cgNexusFlat.Nexus(dFN, cgDegPeak.Peak)
    peaks.load(['tcc', 'eLevel'])

    hitMap = {}
    for peakID in peaks.ids:
        chrom, strand, start, end = bioLibCG.tccSplit(peaks.tcc[peakID])
        if switchStrand:
            strand = bioLibCG.switchStrand(strand)

        positions = range(start, end + 1)
        # only touch the map when there is something to add, so an empty
        # range never creates empty chrom/strand entries
        if positions:
            strandMap = hitMap.setdefault(chrom, {})
            strandMap.setdefault(strand, set()).update(positions)

    return hitMap
예제 #3
0
def getBiasedSeqs(fN, assembly, switchStrand = True):
    '''Print the flanked sequence of every coordinate in fN, then the
    enrichment table computed from those sequences.

    fN is tab-separated; the first column of each line is a tcc coordinate
    string.  Each coordinate is optionally moved to the opposite strand
    (switchStrand) and widened by 10 on both sides before its sequence is
    fetched with gf.GenomeFetch(assembly).

    Output goes to stdout: a '>blah_<index>' header and the sequence for each
    coordinate, followed by one 'key count' line per item returned by
    getSeqEnrichment().  Nothing is returned.
    '''
    flank = 10

    # collect the widened coordinates first; the with-block closes the file
    # even when a malformed line makes tccSplit/makeTcc raise
    tccs = []
    with open(fN, 'r') as f:
        for line in f:
            ls = line.strip().split('\t')
            chrom, strand, start, end = bioLibCG.tccSplit(ls[0])
            if switchStrand:
                strand = bioLibCG.switchStrand(strand)
            tccs.append(bioLibCG.makeTcc(chrom, strand, start - flank, end + flank))

    myG = gf.GenomeFetch(assembly)
    sequences = []
    for i, tcc in enumerate(tccs):
        sequence = myG.getSequence(tcc)
        sequences.append(sequence)
        print('>blah_%s' % i)
        print(sequence)

    for let, count in getSeqEnrichment(sequences).items():
        print('%s %s' % (let, count))
예제 #4
0
def getBiasedSeqs(fN, assembly, switchStrand=True):
    """Print flanked sequences for the coordinates in fN and their enrichment.

    Reads fN line by line (tab-separated, tcc coordinate string in the first
    column), optionally flips each coordinate to the opposite strand, extends
    it by 10 on each side, and fetches the sequence via
    gf.GenomeFetch(assembly).

    Prints a '>blah_<index>' header plus the sequence for every coordinate,
    then one 'key count' line for each item of getSeqEnrichment(sequences).
    Returns None.
    """
    # NOTE(review): a function with this same name and body is defined
    # earlier in this file; this later definition is the one left bound.
    padding = 10

    expandedTccs = []
    # with-block: the input file is closed even if a line fails to parse
    with open(fN, "r") as f:
        for line in f:
            ls = line.strip().split("\t")
            chrom, strand, start, end = bioLibCG.tccSplit(ls[0])
            if switchStrand:
                strand = bioLibCG.switchStrand(strand)
            start -= padding
            end += padding
            expandedTccs.append(bioLibCG.makeTcc(chrom, strand, start, end))

    myG = gf.GenomeFetch(assembly)
    sequences = []
    for i, tcc in enumerate(expandedTccs):
        sequences.append(myG.getSequence(tcc))
        print(">blah_%s" % i)
        print(sequences[-1])

    for let, count in getSeqEnrichment(sequences).items():
        print("%s %s" % (let, count))
def getSplicingUnitOccupancy(tranFN, wigDir1, wigDir2, chrom, strand, maxCut):
    """Tally coding-exon positions by exact wig value in two data sets.

    wigDir1 is loaded on `strand`, wigDir2 on the opposite strand
    (original note: wigDir2 has to be hela cuz strand flip).

    tranFN is a tab-separated transcript table.  Columns read here:
    0-based index 1 chrom, 2 strand, 3/4 transcript start/end, 5/6 coding
    start/end, 8/9 comma-terminated exon starts/ends, 13 a type string that
    contains "_coding" for coding transcripts.  End columns are converted to
    inclusive coordinates by subtracting 1.

    For each transcript on chrom/strand whose type contains "_coding", the
    UTR ranges are subtracted from the exons and every remaining position is
    visited once (positions shared between transcripts are counted a single
    time).  For each cut in 1..maxCut-1 the function counts positions whose
    value equals cut in data set 1, in data set 2, and in both.

    Prints one tab-separated line per cut:
        num1, num2, numOverlap, "chrom:strand", cut
    Returns None.
    """
    maxCut = int(maxCut)
    # strand is compared against string values below; tolerate an int that a
    # caller may have autocast
    strand = str(strand)

    oppStrand = bioLibCG.switchStrand(strand)
    coord_value1 = cgWig.loadSingleWig(wigDir1, chrom, strand, "ALL")
    coord_value2 = cgWig.loadSingleWig(wigDir2, chrom, oppStrand, "ALL")

    # 0, 0, 0 = num1, num2, numOverlap
    covered = set()
    cutoff_overlap = dict((i, [0, 0, 0]) for i in range(maxCut))

    # the with-block closes the transcript file, which was previously left open
    with open(tranFN, "r") as f:
        for line in f:
            ls = line.strip().split("\t")
            tChrom, tStrand = ls[1], bioLibCG.switchStrandFormat(ls[2])
            if tChrom != chrom or tStrand != strand:
                continue
            tStart, tEnd = int(ls[3]), int(ls[4]) - 1
            cStart, cEnd = int(ls[5]), int(ls[6]) - 1
            # [:-1] drops the trailing comma of the exon lists
            exonStarts = [int(x) for x in ls[8][:-1].split(",")]
            exonEnds = [int(x) - 1 for x in ls[9][:-1].split(",")]
            exonPairs = list(zip(exonStarts, exonEnds))
            codingStatus = "_coding" in ls[13]

            # UTR on the low-coordinate side of the coding region; empty when
            # the coding start sits on a transcript boundary
            if cStart == tStart or cStart == tEnd + 1:
                lowUTR = ()
            else:
                lowUTR = (tStart, cStart - 1)

            # UTR on the high-coordinate side of the coding region
            if cEnd + 1 == tStart or cEnd + 1 == tEnd + 1:
                highUTR = ()
            else:
                highUTR = (cEnd + 1, tEnd)

            # which side is 5' and which is 3' depends on the strand
            if strand == "1":
                range5, range3 = lowUTR, highUTR
            else:
                range5, range3 = highUTR, lowUTR

            # strip the UTRs so only coding exon sequence remains
            exonPairs = compareData.subtractTwoRanges(exonPairs, [range5])
            exonPairs = compareData.subtractTwoRanges(exonPairs, [range3])

            # only coding exons contribute; introns and non-coding transcripts
            # were walked position by position before without any effect
            if not codingStatus:
                continue

            for pair in exonPairs:
                for i in xrange(pair[0], pair[1] + 1):
                    if i in covered:
                        continue  # multiple transcripts will have same exons
                    covered.add(i)
                    val1 = coord_value1.get(i, 0)
                    val2 = coord_value2.get(i, 0)

                    for cut in range(1, maxCut):
                        # exact-level match (not "at least cut")
                        in1 = val1 == cut
                        in2 = val2 == cut

                        if in1 and in2:
                            cutoff_overlap[cut][2] += 1
                        if in1:
                            cutoff_overlap[cut][0] += 1
                        if in2:
                            cutoff_overlap[cut][1] += 1

    for i in range(1, maxCut):
        cutoff_overlap[i].extend(["%s:%s" % (chrom, strand), i])
        pString = "\t".join([str(x) for x in cutoff_overlap[i]])
        print(pString)
예제 #6
0
def plotAllDegOverlap(inFile, chrom, strand, wigDir1, wigDir2, outDir, withIntrons = False, flipStrand = True):
    '''Save a two-track degradome overlap plot for every gene on chrom/strand.

    hela must be 2nd wigDir2 cuz strand flip: wigDir1 is loaded on `strand`,
    wigDir2 on the opposite strand unless flipStrand is False.

    inFile is tab-separated with exactly five columns per line: gene name,
    chromosome, strand, comma-separated exon starts, comma-separated exon
    ends.  Lines for other chromosomes/strands are skipped.

    For each gene the wig values inside its exons (and introns when
    withIntrons is set) are collected for both data sets.  Without introns,
    the exon coordinates after the first exon are shifted down by the
    cumulative inter-exon distance so the exons sit next to each other.
    Positions present in both data sets are drawn red, the others black.
    The figure is written to <outDir>/<gene>.degOverlapPlot.png.

    Genes with no value in either data set are skipped with a message
    (taking max() of the empty value lists used to raise ValueError).
    '''
    oppStrand = strand
    if flipStrand:
        oppStrand = bioLibCG.switchStrand(strand)

    print('loading Wigs %s %s' % (chrom, strand))
    coord_value1 = cgWig.loadSingleWig(wigDir1, chrom, strand, 'ALL')
    coord_value2 = cgWig.loadSingleWig(wigDir2, chrom, oppStrand, 'ALL')

    # with-block: the gene file is closed even if plotting a gene fails
    with open(inFile, 'r') as f:
        for line in f:
            gName, dChrom, dStrand, exonStarts, exonEnds = line.strip().split('\t')
            if dChrom != chrom or dStrand != strand:
                continue
            exonStarts = [int(x) for x in exonStarts.split(',')]
            exonEnds = [int(x) for x in exonEnds.split(',')]
            print('Plotting %s' % gName)

            #create the span info for boxplots (JUST EXONS!!!)
            exons = list(zip(exonStarts, exonEnds))
            junctions = list(zip(exonEnds[:-1], exonStarts[1:]))
            introns = [(prevEnd + 1, nextStart - 1) for prevEnd, nextStart in junctions]
            # NOTE(review): this is nextStart - prevEnd + 1, i.e. two more
            # than the length of the intron tuples above, so a shifted exon
            # starts two positions before the end of the previous one.
            # Kept as-is; confirm this matches plotGeneTrack's layout.
            iLengths = [nextStart - prevEnd + 1 for prevEnd, nextStart in junctions]

            spans = exons[:]
            if withIntrons:
                spans.extend(introns)
            spans.sort()
            exonSet = set(exons)
            tSpan = [('exon', x) if x in exonSet else ('intron', x) for x in spans]

            #gather expression data
            c_v = {}
            c_v2 = {}
            for _, (sStart, sEnd) in tSpan:
                for i in range(sStart, sEnd + 1):
                    if i in coord_value1:
                        c_v[i] = coord_value1[i]
                    if i in coord_value2:
                        c_v2[i] = coord_value2[i]

            #intron displacement for ONLY EXONS
            if not withIntrons:
                dAmount = 0
                for exonIdx in range(1, len(exons)):
                    # running total of everything skipped before this exon
                    dAmount += iLengths[exonIdx - 1]
                    eStart, eEnd = exons[exonIdx]
                    for j in range(eStart, eEnd + 1):
                        if j in c_v:
                            c_v[j - dAmount] = c_v[j]
                            del c_v[j]
                        if j in c_v2:
                            c_v2[j - dAmount] = c_v2[j]
                            del c_v2[j]

            #get overall max
            allValues = list(c_v.values()) + list(c_v2.values())
            if not allValues:
                # nothing to scale or draw for this gene
                print('Skipping %s: no values in either data set' % gName)
                continue
            overMax = max(allValues)

            a, b = set(c_v), set(c_v2)
            overlap = a.intersection(b)
            colors_a = ['r' if x in overlap else 'k' for x in sorted(a)]
            colors_b = ['r' if x in overlap else 'k' for x in sorted(b)]

            # NOTE(review): one of the two tracks may now be empty here;
            # presumably plotGrassTrack copes with that - verify.
            plotGrassTrack(c_v, [9, 15], manualMax = overMax, flip = False, colors = colors_a)
            plotGrassTrack(c_v2, [-3, 3], manualMax = overMax,flip = True, colors = colors_b)
            plotGeneTrack(tSpan, 0)

            #labels and axes
            plt.figtext(.05, .5, gName)
            plt.figtext(.05, .62, '0 -')
            plt.figtext(.05, .89, '%s -' % overMax)
            plt.figtext(.05, 1 - .62, '0 -')
            plt.figtext(.05, 1 - .89, '%s -' % overMax)
            plt.ylim(-3,15)
            frame1 = plt.gca()
            frame1.axes.get_yaxis().set_visible(False)
            if dStrand == '1':
                plt.title('Degradome Comparison (5-->3)')
            else:
                plt.title('Degradome Comparison (3-->5)')
            imgName = outDir + '/' + gName + '.degOverlapPlot.png'
            plt.savefig(imgName, bbox_inches='tight', pad_inches=1)
            #plt.show()
            plt.close('all')