def byGene(geneSpanFN, wigDir1, wigDir2, chrom, strand, outFN, simulation = False):
    '''Correlate sliding-window bin values of two wig datasets, gene by gene.

    hela must be 2nd wigDir2 cuz strand flip: wigDir1 is loaded for
    (chrom, strand) and wigDir2 for the strand returned by
    bioLibCG.switchStrand(strand).

    geneSpanFN -- tab-separated input; columns used are
                  0: gene name, 1: chrom, 2: strand,
                  3: comma-separated starts, 4: comma-separated ends
    wigDir1    -- wig directory read on the given strand
    wigDir2    -- wig directory read on the opposite strand
    chrom      -- only rows whose chrom column equals this are processed
    strand     -- only rows whose strand column equals str(strand) are processed
    outFN      -- output file; one tab-separated row per processed gene:
                  name, pStats.pearsonr(...)[0], bins of dataset 1
                  (comma-joined), bins of dataset 2 (comma-joined),
                  'chrom:strand:<first element of the span>',
                  (sum(bins1) + sum(bins2)) / 2,
                  first value returned by pStats.spearmanr
    simulation -- when true, the span is passed through mixSpanByBin
                  (once per dataset) before binning
    '''
    strand = str(strand) #undo autocast

    print('loading wigs')
    oppStrand = bioLibCG.switchStrand(strand)
    coord_value1 = cgWig.loadSingleWig(wigDir1, chrom, strand, 'ALL')
    coord_value2 = cgWig.loadSingleWig(wigDir2, chrom, oppStrand, 'ALL')

    #sliding window: frameLength span elements wide, advanced skipAmount at a time
    frameLength = 10
    skipAmount = 2

    print('calculating bin values')
    #context managers close both files even if a row fails to parse
    with open(geneSpanFN, 'r') as f:
        with open(outFN, 'w') as fOut:
            for line in f:
                ls = line.strip().split('\t')
                sChrom, sStrand = ls[1], ls[2]
                if sChrom != chrom or sStrand != strand:
                    continue

                geneName = ls[0]
                geneStarts = [int(x) for x in ls[3].split(',')]
                geneEnds = [int(x) for x in ls[4].split(',')]
                spanPairs = list(zip(geneStarts, geneEnds))

                theSpan = fullSpanFromPairs(spanPairs)
                spanLength = len(theSpan)

                binAvgs1 = []
                binAvgs2 = []
                for theBinAvg, theCoord_Val in [(binAvgs1, coord_value1), (binAvgs2, coord_value2)]:
                    #mix up bins if simulation
                    if simulation:
                        newSpan = mixSpanByBin(theSpan, frameLength)
                    else:
                        newSpan = theSpan

                    #every window start i with i + frameLength <= spanLength
                    for i in range(0, spanLength - frameLength + 1, skipAmount):
                        binNums = newSpan[i:(i + frameLength)]
                        theBinAvg.append(binAvg(theCoord_Val, binNums))

                #NOTE(review): the original built lists with the (0, 0) bin
                #pairs removed "for correlation" but never used them; the
                #correlations below still run on the unfiltered bins, exactly
                #as before. Confirm which behaviour is intended.
                dataLoad = float(sum(binAvgs1) + sum(binAvgs2))/2

                pcc = pStats.pearsonr(binAvgs1, binAvgs2)
                scc, _pVal = pStats.spearmanr(binAvgs1, binAvgs2)

                outString = [geneName,
                             pcc[0],
                             ','.join([str(x) for x in binAvgs1]),
                             ','.join([str(x) for x in binAvgs2]),
                             '%s:%s:%s' % (sChrom, sStrand, theSpan[0]),
                             dataLoad,
                             scc]
                fOut.write('\t'.join([str(x) for x in outString]) + '\n')
def getPlotData(aSites, wigDir, outFN):
    '''get box plot data from sites in degradome

    For every site in aSites, the wig value at each offset in -200..200
    relative to the site's end coordinate is collected (0 where the wig has
    no entry).

    aSites -- tab-separated file; column 0 is a tcc string understood by
              bioLibCG.tccSplit
    wigDir -- wig directory passed to cgWig.loadSingleWig (type 'ALL')
    outFN  -- output file; one row per offset (first row is -200), each row
              holding the tab-separated values collected at that offset
    '''
    #load and init
    spreadRange = range(-200, 201) #200 +/- ... might want to check distance each AAUAAA is from each other
    relCoord_degVals = dict( (i, []) for i in spreadRange )

    #parse the site file once instead of once per chromosome/strand;
    #file order is preserved inside each (chrom, strand) group, so the
    #collected values come out in the same order as before
    siteEnds = {}
    with open(aSites, 'r') as f:
        for line in f:
            ls = line.strip().split('\t')
            ichrom, istrand, start, end = bioLibCG.tccSplit(ls[0])
            siteEnds.setdefault((ichrom, istrand), []).append(end)

    for chrom in bioLibCG.humanChromosomes:
        for strand in ('1', '-1'):
            print('%s %s' % (chrom, strand))
            #the wig is still loaded for every chrom/strand, as before
            coord_value = cgWig.loadSingleWig(wigDir, chrom, strand, 'ALL')

            for end in siteEnds.get((chrom, strand), ()):
                for i in spreadRange:
                    relCoord_degVals[i].append(coord_value.get(end + i, 0))

    #output box data
    #each row is a histogram of spread position (e.g., first row is -200)
    outLines = ['\t'.join([str(x) for x in relCoord_degVals[i]]) + '\n'
                for i in spreadRange]
    with open(outFN, 'w') as fOut:
        fOut.writelines(outLines)
def test(tcc, wigDir):
    '''Print the wig value at every coordinate of a tcc region.

    tcc    -- region string understood by bioLibCG.tccSplit
    wigDir -- wig directory passed to cgWig.loadSingleWig (type 'ALL')

    Prints "chrom strand" first, then one "coordinate value" line for each
    coordinate from start to end inclusive; coordinates missing from the
    wig print 0.
    '''
    chrom, strand, start, end = bioLibCG.tccSplit(tcc)
    print('%s %s' % (chrom, strand))

    coord_eLevel = cgWig.loadSingleWig(wigDir, chrom, strand, 'ALL')

    #the original also sorted every wig key into a list it never used;
    #that work is dropped
    for i in range(start, end + 1):
        print('%s %s' % (i, coord_eLevel.get(i, 0)))
def updateRepeatStatus(fN, fF, wigDir, chrom, strand):
    '''Flag each record on (chrom, strand) that touches a REPEAT wig coordinate.

    fN, fF -- arguments handed straight to Nexus(fN, fF)
    wigDir -- wig directory passed to cgWig.loadSingleWig (type 'REPEAT')
    chrom, strand -- only records whose tcc splits to this pair are updated

    For every matching record, `repeat` is set to False and then to True if
    any coordinate in start..end (inclusive) is a key of the loaded wig.
    The Nexus object is saved at the end.

    NOTE(review): other functions named updateRepeatStatus appear later in
    this file; if they live in the same module the last definition wins.
    '''
    #load oRNAs
    nexus = Nexus(fN, fF)
    nexus.load(['repeat', 'tcc'])

    #load wig file for chrom, strand
    repeatCoords = cgWig.loadSingleWig(wigDir, chrom, strand, 'REPEAT')

    while nexus.nextID():
        oChrom, oStrand, start, end = bioLibCG.tccSplit(nexus.tcc)
        if oChrom == chrom and oStrand == strand:
            nexus.repeat = False
            #any() stops at the first coordinate found in the wig
            if any(pos in repeatCoords for pos in range(start, end + 1)):
                nexus.repeat = True

    nexus.save()
def updateRepeatStatus(oFN, wigDir, chrom, strand, rn=None, tn=None):
    '''Set repeatStatus for every peak on (chrom, strand) and save.

    oFN    -- file handed to cgNexusFlat.Nexus together with degPeak.degPeak
    wigDir -- wig directory passed to cgWig.loadSingleWig (type 'REPEAT')
    chrom, strand -- only entries whose tcc splits to this pair are updated
    rn, tn -- accepted but not used in this function

    A matching entry gets repeatStatus False, then True if any coordinate
    in start..end (inclusive) is a key of the loaded wig.

    NOTE(review): this file contains more than one updateRepeatStatus; if
    they share a module, only the last definition is reachable by name.
    '''
    #load oRNAs
    oNX = cgNexusFlat.Nexus(oFN, degPeak.degPeak)
    oNX.load(['repeatStatus', 'tcc'])

    #load wig file for chrom, strand
    repeatCoords = cgWig.loadSingleWig(wigDir, chrom, strand, 'REPEAT')

    for oID in oNX.repeatStatus:
        peakChrom, peakStrand, first, last = bioLibCG.tccSplit(oNX.tcc[oID])
        if peakChrom != chrom:
            continue
        if peakStrand != strand:
            continue

        oNX.repeatStatus[oID] = False
        hitsRepeat = any(pos in repeatCoords for pos in range(first, last + 1))
        if hitsRepeat:
            oNX.repeatStatus[oID] = True

    oNX.save()
def updateRepeatStatus(oFN, wigDir, chrom, strand):
    '''Mark peaks on (chrom, strand) that touch a REPEAT wig coordinate.

    oFN    -- file handed to cgNexusFlat.Nexus together with cgDegPeak.Peak
    wigDir -- wig directory passed to cgWig.loadSingleWig (type 'REPEAT')
    chrom, strand -- only entries whose tcc splits to this pair are updated

    Each matching entry has repeatStatus reset to False and switched to
    True at the first coordinate in start..end (inclusive) that is a key of
    the loaded wig. The Nexus object is saved afterwards.

    NOTE(review): earlier functions in this file use the same name; if they
    share a module, those earlier definitions are shadowed.
    '''
    #load oRNAs
    peaks = cgNexusFlat.Nexus(oFN, cgDegPeak.Peak)
    peaks.load(['repeatStatus', 'tcc'])

    #load wig file for chrom, strand
    repeatCoords = cgWig.loadSingleWig(wigDir, chrom, strand, 'REPEAT')

    for peakID in peaks.repeatStatus:
        oChrom, oStrand, start, end = bioLibCG.tccSplit(peaks.tcc[peakID])
        if oChrom != chrom or oStrand != strand:
            continue

        peaks.repeatStatus[peakID] = False
        for coord in range(start, end + 1):
            if coord not in repeatCoords:
                continue
            #one repeat coordinate is enough
            peaks.repeatStatus[peakID] = True
            break

    peaks.save()
def getSplicingUnitOccupancy(tranFN, wigDir1, wigDir2, chrom, strand, maxCut):
    """get the number of spots in each data set, and the number that overlap

    wigDir2 has to be hela cuz strand flip: wigDir1 is loaded for
    (chrom, strand), wigDir2 for bioLibCG.switchStrand(strand).

    tranFN -- tab-separated transcript file; columns used are
              1: chrom, 2: strand (converted with
              bioLibCG.switchStrandFormat), 3/4: transcript start/end,
              5/6: coding start/end, 8/9: comma-separated exon starts/ends
              (last character of the field dropped), 13: the transcript is
              treated as coding when this field contains '_coding'.
              Every end value is decremented by 1 on read.
    maxCut -- values 1 .. maxCut-1 are tallied (converted with int())

    Only exon positions of coding transcripts are counted, after removing
    the 5' and 3' UTR ranges, and each coordinate is counted once even when
    several transcripts share it.

    Prints one tab-separated row per value v in 1 .. maxCut-1:
    positions where dataset 1 equals v, positions where dataset 2 equals v,
    positions where both equal v, 'chrom:strand', v.
    """
    maxCut = int(maxCut)
    oppStrand = bioLibCG.switchStrand(strand)
    coord_value1 = cgWig.loadSingleWig(wigDir1, chrom, strand, "ALL")
    coord_value2 = cgWig.loadSingleWig(wigDir2, chrom, oppStrand, "ALL")

    # 0, 0, 0 = num1, num2, numOverlap
    covered = set()
    cutoff_overlap = dict((i, [0, 0, 0]) for i in range(maxCut))
    cuts = range(1, maxCut)

    # the original never closed this file; the context manager does
    with open(tranFN, "r") as f:
        for line in f:
            ls = line.strip().split("\t")
            tChrom, tStrand = ls[1], bioLibCG.switchStrandFormat(ls[2])
            if tChrom != chrom or tStrand != strand:
                continue

            tStart, tEnd = int(ls[3]), int(ls[4]) - 1
            cStart, cEnd = int(ls[5]), int(ls[6]) - 1
            exonStarts = [int(x) for x in ls[8][:-1].split(",")]
            exonEnds = [int(x) - 1 for x in ls[9][:-1].split(",")]
            exonPairs = list(zip(exonStarts, exonEnds))
            codingStatus = "_coding" in ls[13]

            # calculate intron pairs: the gap between consecutive exons
            intronPairs = [(prevExon[1] + 1, nextExon[0] - 1)
                           for prevExon, nextExon in zip(exonPairs, exonPairs[1:])]

            # take care of messy UTRs and assign utr ranges
            # a side has no UTR when the coding boundary sits on either
            # transcript boundary
            if cStart == tStart or cStart == tEnd + 1:
                lowSide = ()
            else:
                lowSide = (tStart, cStart - 1)
            if cEnd + 1 == tStart or cEnd + 1 == tEnd + 1:
                highSide = ()
            else:
                highSide = (cEnd + 1, tEnd)

            # on strand "1" the 5'UTR is the low-coordinate side, otherwise
            # the sides swap
            if strand == "1":
                range5, range3 = lowSide, highSide
            else:
                range5, range3 = highSide, lowSide

            # NOTE(review): these two results were never used by the
            # original either; the calls are kept because the behaviour of
            # compareData.subtractTwoRanges is not visible from here.
            compareData.subtractTwoRanges([range5], intronPairs)
            compareData.subtractTwoRanges([range3], intronPairs)

            exonPairs = compareData.subtractTwoRanges(exonPairs, [range5])
            exonPairs = compareData.subtractTwoRanges(exonPairs, [range3])

            # only coding exons contribute; the original also walked every
            # intron base and every non-coding base without doing anything
            if not codingStatus:
                continue

            for pair in exonPairs:
                for i in xrange(pair[0], pair[1] + 1):
                    if i in covered:
                        continue  # multiple transcripts will have same exons
                    covered.add(i)

                    val1 = coord_value1.get(i, 0)
                    val2 = coord_value2.get(i, 0)
                    for cut in cuts:
                        in1 = val1 == cut
                        in2 = val2 == cut
                        if in1 and in2:
                            cutoff_overlap[cut][2] += 1
                        if in1:
                            cutoff_overlap[cut][0] += 1
                        if in2:
                            cutoff_overlap[cut][1] += 1

    for i in cuts:
        cutoff_overlap[i].extend(["%s:%s" % (chrom, strand), i])
        pString = "\t".join([str(x) for x in cutoff_overlap[i]])
        print(pString)
def getSplicingUnitOccupancy(tranFN, wigDir1, wigDir2, chrom, strand, maxCut):
    '''get the number of spots in each data set, and the number that overlap

    wigDir2 has to be hela cuz strand flip: wigDir1 is loaded for
    (chrom, strand), wigDir2 for bioLibCG.switchStrand(strand).

    tranFN -- tab-separated transcript file; columns used are
              1: chrom, 2: strand (converted with
              bioLibCG.switchStrandFormat), 3/4: transcript start/end,
              5/6: coding start/end, 8/9: comma-separated exon starts/ends
              (last character of the field dropped), 13: the transcript is
              treated as coding when this field contains '_coding'.
              Every end value is decremented by 1 on read.
    maxCut -- values 1 .. maxCut-1 are tallied (converted with int())

    Only exon positions of coding transcripts are counted, after removing
    the 5' and 3' UTR ranges, and each coordinate is counted once even when
    several transcripts share it.

    Prints one tab-separated row per value v in 1 .. maxCut-1:
    positions where dataset 1 equals v, positions where dataset 2 equals v,
    positions where both equal v, 'chrom:strand', v.
    '''
    maxCut = int(maxCut)
    oppStrand = bioLibCG.switchStrand(strand)
    coord_value1 = cgWig.loadSingleWig(wigDir1, chrom, strand, 'ALL')
    coord_value2 = cgWig.loadSingleWig(wigDir2, chrom, oppStrand, 'ALL')

    # 0, 0, 0 = num1, num2, numOverlap
    covered = set()
    cutoff_overlap = dict( (i, [0, 0, 0]) for i in range(maxCut))
    cuts = range(1, maxCut)

    #the original never closed this file; the context manager does
    with open(tranFN, 'r') as f:
        for line in f:
            ls = line.strip().split('\t')
            tChrom, tStrand = ls[1], bioLibCG.switchStrandFormat(ls[2])
            if tChrom != chrom or tStrand != strand:
                continue

            tStart, tEnd = int(ls[3]), int(ls[4]) - 1
            cStart, cEnd = int(ls[5]), int(ls[6]) - 1
            exonStarts = [int(x) for x in ls[8][:-1].split(',')]
            exonEnds = [int(x) - 1 for x in ls[9][:-1].split(',')]
            exonPairs = list(zip(exonStarts, exonEnds))
            codingStatus = '_coding' in ls[13]

            #calculate intron pairs: the gap between consecutive exons
            intronPairs = [(prevExon[1] + 1, nextExon[0] - 1)
                           for prevExon, nextExon in zip(exonPairs, exonPairs[1:])]

            #take care of messy UTRs and assign utr ranges
            #a side has no UTR when the coding boundary sits on either
            #transcript boundary
            if cStart == tStart or cStart == tEnd + 1:
                lowSide = ()
            else:
                lowSide = (tStart, cStart - 1)
            if cEnd + 1 == tStart or cEnd + 1 == tEnd + 1:
                highSide = ()
            else:
                highSide = (cEnd + 1, tEnd)

            #on strand '1' the 5'UTR is the low-coordinate side, otherwise
            #the sides swap
            if strand == '1':
                range5, range3 = lowSide, highSide
            else:
                range5, range3 = highSide, lowSide

            #NOTE(review): these two results were never used by the original
            #either; the calls are kept because the behaviour of
            #compareData.subtractTwoRanges is not visible from here.
            compareData.subtractTwoRanges([range5], intronPairs)
            compareData.subtractTwoRanges([range3], intronPairs)

            exonPairs = compareData.subtractTwoRanges(exonPairs, [range5])
            exonPairs = compareData.subtractTwoRanges(exonPairs, [range3])

            #only coding exons contribute; the original also walked every
            #intron base and every non-coding base without doing anything
            if not codingStatus:
                continue

            for pair in exonPairs:
                for i in xrange(pair[0], pair[1] + 1):
                    if i in covered:
                        continue #multiple transcripts will have same exons
                    covered.add(i)

                    val1 = coord_value1.get(i, 0)
                    val2 = coord_value2.get(i, 0)
                    for cut in cuts:
                        in1 = (val1 == cut)
                        in2 = (val2 == cut)
                        if in1 and in2:
                            cutoff_overlap[cut][2] += 1
                        if in1:
                            cutoff_overlap[cut][0] += 1
                        if in2:
                            cutoff_overlap[cut][1] += 1

    for i in cuts:
        cutoff_overlap[i].extend(['%s:%s' % (chrom, strand), i])
        pString = '\t'.join([ str(x) for x in cutoff_overlap[i] ])
        print(pString)
def plotAllDegOverlap(inFile, chrom, strand, wigDir1, wigDir2, outDir, withIntrons = False, flipStrand = True):
    '''Save one degradome-overlap plot per gene on (chrom, strand).

    hela must be 2nd wigDir2 cuz strand flip: wigDir1 is loaded for
    (chrom, strand); wigDir2 is loaded for bioLibCG.switchStrand(strand)
    when flipStrand is true, otherwise for the same strand.

    inFile      -- tab-separated file with exactly five columns: gene name,
                   chrom, strand, comma-separated exon starts,
                   comma-separated exon ends
    outDir      -- images are saved as
                   <outDir>/<gene name>.degOverlapPlot.png
    withIntrons -- when true, introns (the gaps between consecutive exons)
                   are included in the span; when false, the values of every
                   exon after the first are shifted down by the cumulative
                   iLengths so the exons sit next to each other
    flipStrand  -- see above

    Genes with no wig value in either dataset are skipped with a message
    (the original raised ValueError from max() on an empty sequence).
    '''
    oppStrand = bioLibCG.switchStrand(strand) if flipStrand else strand

    print('loading Wigs %s %s' % (chrom, strand))
    coord_value1 = cgWig.loadSingleWig(wigDir1, chrom, strand, 'ALL')
    coord_value2 = cgWig.loadSingleWig(wigDir2, chrom, oppStrand, 'ALL')

    #context manager closes the input even if plotting a gene fails
    with open(inFile, 'r') as f:
        for line in f:
            gName, dChrom, dStrand, exonStarts, exonEnds = line.strip().split('\t')
            if dChrom != chrom or dStrand != strand:
                continue
            exonStarts = [int(x) for x in exonStarts.split(',')]
            exonEnds = [int(x) for x in exonEnds.split(',')]
            print('Plotting %s' % (gName,))

            #create the span info for boxplots (JUST EXONS!!!)
            exons = list(zip(exonStarts, exonEnds))
            introns = [(prevEnd + 1, nextStart - 1)
                       for prevEnd, nextStart in zip(exonEnds[:-1], exonStarts[1:])]
            #NOTE(review): this is nextStart - prevEnd + 1, which is 2 more
            #than the length of the matching (prevEnd + 1, nextStart - 1)
            #intron above. Kept as in the original; confirm whether the
            #displacement below is meant to overlap neighbouring exons.
            iLengths = [nextStart - prevEnd + 1
                        for nextStart, prevEnd in zip(exonStarts[1:], exonEnds[:-1])]

            segments = exons[:]
            if withIntrons:
                segments.extend(introns)
            segments.sort()
            exonSet = set(exons)
            tSpan = [('exon', x) if x in exonSet else ('intron', x) for x in segments]

            #gather expression data
            c_v = {}
            c_v2 = {}
            for _segType, (eStart, eEnd) in tSpan:
                for i in range(eStart, eEnd + 1):
                    if i in coord_value1:
                        c_v[i] = coord_value1[i]
                    if i in coord_value2:
                        c_v2[i] = coord_value2[i]

            #intron displacement for ONLY EXONS
            if not withIntrons:
                dAmount = 0
                for exonIdx in range(1, len(exons)):
                    #running total == sum(iLengths[:exonIdx])
                    dAmount += iLengths[exonIdx - 1]
                    eStart, eEnd = exons[exonIdx]
                    for j in range(eStart, eEnd + 1):
                        #assign-then-delete order is kept from the original
                        if j in c_v:
                            c_v[j - dAmount] = c_v[j]
                            del c_v[j]
                        if j in c_v2:
                            c_v2[j - dAmount] = c_v2[j]
                            del c_v2[j]

            #get overall max
            allVals = list(c_v.values()) + list(c_v2.values())
            if not allVals:
                print('Skipping %s: no signal in either dataset' % (gName,))
                continue
            overMax = max(allVals)

            a, b = set(c_v), set(c_v2)
            overlap = a.intersection(b)
            colors_a = ['r' if x in overlap else 'k' for x in sorted(a)]
            colors_b = ['r' if x in overlap else 'k' for x in sorted(b)]

            plotGrassTrack(c_v, [9, 15], manualMax = overMax, flip = False, colors = colors_a)
            plotGrassTrack(c_v2, [-3, 3], manualMax = overMax, flip = True, colors = colors_b)
            plotGeneTrack(tSpan, 0)

            #labels and axes
            plt.figtext(.05, .5, gName)
            plt.figtext(.05, .62, '0 -')
            plt.figtext(.05, .89, '%s -' % overMax)
            plt.figtext(.05, 1 - .62, '0 -')
            plt.figtext(.05, 1 - .89, '%s -' % overMax)
            plt.ylim(-3,15)
            frame1 = plt.gca()
            frame1.axes.get_yaxis().set_visible(False)

            if dStrand == '1':
                plt.title('Degradome Comparison (5-->3)')
            else:
                plt.title('Degradome Comparison (3-->5)')

            imgName = outDir + '/' + gName + '.degOverlapPlot.png'
            plt.savefig(imgName, bbox_inches='tight', pad_inches=1)
            #plt.show()
            plt.close('all')