def byGene(geneSpanFN, wigDir1, wigDir2, chrom, strand, outFN, simulation = False):
    '''Write per-gene correlations between binned values of two wig data sets.

    hela must be 2nd (wigDir2) cuz of the strand flip: wigDir1 is loaded for
    `strand`, wigDir2 for the opposite strand.

    Each line of geneSpanFN is tab-split into name, chrom, strand,
    comma-separated starts and comma-separated ends.  Lines whose chrom or
    strand differ from the arguments are skipped.  The (start, end) pairs are
    expanded with fullSpanFromPairs, then a window of 10 span entries is slid
    along the span in steps of 2 and binAvg is called for every window
    against each wig.

    One tab-separated line per gene is written to outFN:
        name, first element of pStats.pearsonr, bins of wig 1 (comma joined),
        bins of wig 2 (comma joined), 'chrom:strand:firstSpanEntry',
        dataLoad (half the sum of all bin values), spearman coefficient

    geneSpanFN -- path of the gene span file to read
    wigDir1, wigDir2 -- wig directories handed to cgWig.loadSingleWig
    chrom, strand -- only genes on this chrom/strand are processed; strand
                     is converted with str() before use
    outFN -- path of the output file (overwritten)
    simulation -- when true the span is passed through mixSpanByBin,
                  separately for each data set, before it is binned
    '''
    strand = str(strand) #undo autocast
    frameLength = 10 #span entries per bin
    skipAmount = 2 #step between the starts of consecutive bins

    print('loading wigs')
    oppStrand = bioLibCG.switchStrand(strand)
    coord_value1 = cgWig.loadSingleWig(wigDir1, chrom, strand, 'ALL')
    coord_value2 = cgWig.loadSingleWig(wigDir2, chrom, oppStrand, 'ALL')

    print('calculating bin values')
    #the with blocks close both files even when a line fails to parse
    with open(geneSpanFN, 'r') as f:
        with open(outFN, 'w') as fOut:
            for line in f:
                ls = line.strip().split('\t')
                sChrom, sStrand = ls[1], ls[2]
                if sChrom != chrom or sStrand != strand:
                    continue

                geneName = ls[0]
                geneStarts = [int(x) for x in ls[3].split(',')]
                geneEnds = [int(x) for x in ls[4].split(',')]
                spanPairs = list(zip(geneStarts, geneEnds))

                theSpan = fullSpanFromPairs(spanPairs)
                spanLength = len(theSpan)
                #every start i with i + frameLength <= spanLength
                windowStarts = range(0, spanLength - frameLength + 1, skipAmount)

                binAvgs1 = []
                binAvgs2 = []
                for theBinAvg, theCoord_Val in [(binAvgs1, coord_value1), (binAvgs2, coord_value2)]:
                    #mix up bins if simulation
                    if simulation:
                        newSpan = mixSpanByBin(theSpan, frameLength)
                    else:
                        newSpan = theSpan

                    for i in windowStarts:
                        binNums = newSpan[i:(i + frameLength)]
                        theBinAvg.append(binAvg(theCoord_Val, binNums))

                #NOTE(review): the original code built copies of the bins with
                #all 0,0 pairs removed "for correlation" at this point, but never
                #passed them to the statistics below.  The unused copies were
                #dropped; the correlations still run on the unfiltered bins.
                #Confirm whether filtering was intended.
                dataLoad = float(sum(binAvgs1) + sum(binAvgs2))/2

                pcc = pStats.pearsonr(binAvgs1, binAvgs2)
                scc, _pVal = pStats.spearmanr(binAvgs1, binAvgs2)

                outString = [geneName,
                             pcc[0],
                             ','.join([str(x) for x in binAvgs1]),
                             ','.join([str(x) for x in binAvgs2]),
                             '%s:%s:%s' % (sChrom, sStrand, theSpan[0]),
                             dataLoad,
                             scc]
                fOut.write('\t'.join([str(x) for x in outString]) + '\n')
def makeHitMapDegPeak(dFN, switchStrand = False):
    '''Build a chrom -> strand -> set(coordinates) map from a peak file.

    dFN is loaded through cgNexusFlat.Nexus with the cgDegPeak.Peak
    definition ('tcc' and 'eLevel' are loaded; only 'tcc' is read here).
    Every coordinate from start to end (inclusive) of each peak's tcc is
    added to the set for its chrom and strand.

    switchStrand -- when true each peak is filed under the strand returned
                    by bioLibCG.switchStrand instead of its own
    '''
    nexus = cgNexusFlat.Nexus(dFN, cgDegPeak.Peak)
    nexus.load(['tcc', 'eLevel'])

    hitMap = {}
    for peakID in nexus.ids:
        chrom, strand, start, end = bioLibCG.tccSplit(nexus.tcc[peakID])
        if switchStrand:
            strand = bioLibCG.switchStrand(strand)

        coords = range(start, end + 1)
        if not coords:
            #empty span: no chrom/strand entry is created for it
            continue

        strandMap = hitMap.setdefault(chrom, {})
        strandMap.setdefault(strand, set()).update(coords)

    return hitMap
def getBiasedSeqs(fN, assembly, switchStrand = True):
    '''Print the sequence around every tcc in fN, then the enrichment counts.

    The first tab-separated column of each line of fN is split with
    bioLibCG.tccSplit.  The coordinates are widened by 10 on both sides and
    the sequence is fetched with gf.GenomeFetch(assembly).  Each sequence is
    printed as a two line record ('>blah_<index>' followed by the sequence).
    Afterwards every (key, value) item returned by getSeqEnrichment for the
    fetched sequences is printed on its own line.

    This file used to define this function twice with identical bodies, the
    second silently replacing the first; this is the single remaining copy.

    fN -- path of the file holding the tccs
    assembly -- assembly name handed to gf.GenomeFetch
    switchStrand -- when true the strand of every tcc is replaced by the
                    result of bioLibCG.switchStrand before fetching
    '''
    seqs = []
    #the with block closes the file even if a tcc fails to parse
    with open(fN, 'r') as f:
        for line in f:
            ls = line.strip().split('\t')
            chrom, strand, start, end = bioLibCG.tccSplit(ls[0])
            if switchStrand:
                strand = bioLibCG.switchStrand(strand)
            #widen the region by 10 on each side
            start -= 10
            end += 10
            seqs.append(bioLibCG.makeTcc(chrom, strand, start, end))

    myG = gf.GenomeFetch(assembly)
    sequences = []
    for i, seq in enumerate(seqs):
        sequences.append(myG.getSequence(seq))
        print('>blah_%s' % i)
        print(sequences[-1])

    for let, count in getSeqEnrichment(sequences).items():
        print('%s %s' % (let, count))
def getSplicingUnitOccupancy(tranFN, wigDir1, wigDir2, chrom, strand, maxCut):
    """Get the number of spots in each data set, and the number that overlap.

    wigDir2 has to be hela cuz of the strand flip: wigDir1 is loaded for
    `strand`, wigDir2 for the opposite strand.

    Only transcripts on (chrom, strand) whose column 13 contains "_coding"
    are counted, and only the exon positions left after both UTR ranges have
    been subtracted.  A position shared by several transcripts is counted
    once.  A position counts towards a cut only when its wig value EQUALS
    the cut (not when it is >= the cut).

    For every cut in 1 .. maxCut - 1 one tab-separated line is printed:
        num1, num2, numOverlap, "chrom:strand", cut
    where num1/num2 are the positions whose value in wig 1/wig 2 equals the
    cut and numOverlap the positions where both do.

    tranFN -- tab-separated transcript file.  Columns read: 1 chrom,
              2 strand (converted with bioLibCG.switchStrandFormat),
              3/4 transcript start/end, 5/6 coding start/end, 8/9 exon
              starts/ends (the last character of each is dropped before
              splitting on ","), 13 coding status.  All ends are decremented
              by one (presumably half-open to inclusive -- confirm).
    wigDir1, wigDir2 -- wig directories handed to cgWig.loadSingleWig
    chrom, strand -- restrict counting to this chrom/strand.  strand is
                     compared against the string "1".
                     NOTE(review): unlike byGene no str() cast is applied
                     here -- confirm callers pass a string.
    maxCut -- exclusive upper bound of the cut values (converted with int())
    """
    maxCut = int(maxCut)
    oppStrand = bioLibCG.switchStrand(strand)
    coord_value1 = cgWig.loadSingleWig(wigDir1, chrom, strand, "ALL")
    coord_value2 = cgWig.loadSingleWig(wigDir2, chrom, oppStrand, "ALL")

    # cut -> [num1, num2, numOverlap]
    covered = set()
    cutoff_overlap = dict((i, [0, 0, 0]) for i in range(maxCut))

    # the with block closes the file, which the original never did
    with open(tranFN, "r") as f:
        for line in f:
            ls = line.strip().split("\t")
            tChrom, tStrand = ls[1], bioLibCG.switchStrandFormat(ls[2])
            if tChrom != chrom or tStrand != strand:
                continue
            tStart, tEnd = int(ls[3]), int(ls[4]) - 1
            cStart, cEnd = int(ls[5]), int(ls[6]) - 1
            exonStarts = [int(x) for x in ls[8][:-1].split(",")]
            exonEnds = [int(x) - 1 for x in ls[9][:-1].split(",")]
            exonPairs = list(zip(exonStarts, exonEnds))
            codingStatus = "_coding" in ls[13]

            # calculate intron pairs: the gaps between consecutive exons
            intronPairs = [
                (prevExon[1] + 1, nextExon[0] - 1)
                for prevExon, nextExon in zip(exonPairs, exonPairs[1:])
            ]

            # take care of messy UTRs and assign utr ranges: a UTR is empty
            # when the coding boundary sits on either transcript boundary
            if cStart == tStart or cStart == tEnd + 1:
                lowUTR = ()
            else:
                lowUTR = (tStart, cStart - 1)
            if cEnd + 1 == tStart or cEnd + 1 == tEnd + 1:
                highUTR = ()
            else:
                highUTR = (cEnd + 1, tEnd)

            # on strand "1" the 5UTR is the low-coordinate one, else the high
            if strand == "1":
                range5, range3 = lowUTR, highUTR
            else:
                range5, range3 = highUTR, lowUTR

            # utr5/utr3 are not used further in this function; the calls are
            # kept because subtractTwoRanges cannot be inspected from here
            utr5 = compareData.subtractTwoRanges([range5], intronPairs)
            utr3 = compareData.subtractTwoRanges([range3], intronPairs)

            exonPairs = compareData.subtractTwoRanges(exonPairs, [range5])
            exonPairs = compareData.subtractTwoRanges(exonPairs, [range3])

            # only coding exons are counted; introns and non-coding
            # transcripts contributed nothing, so they are not walked at all
            if not codingStatus:
                continue

            for pair in exonPairs:
                for pos in range(pair[0], pair[1] + 1):
                    if pos in covered:
                        continue  # multiple transcripts will have same exons
                    covered.add(pos)

                    val1 = coord_value1.get(pos, 0)
                    val2 = coord_value2.get(pos, 0)
                    for cut in range(1, maxCut):
                        # exact match on purpose, not a ">= cut" threshold
                        in1 = val1 == cut
                        in2 = val2 == cut
                        counts = cutoff_overlap[cut]
                        if in1 and in2:
                            counts[2] += 1
                        if in1:
                            counts[0] += 1
                        if in2:
                            counts[1] += 1

    for i in range(1, maxCut):
        cutoff_overlap[i].extend(["%s:%s" % (chrom, strand), i])
        pString = "\t".join([str(x) for x in cutoff_overlap[i]])
        print(pString)
def plotAllDegOverlap(inFile, chrom, strand, wigDir1, wigDir2, outDir, withIntrons = False, flipStrand = True):
    '''Save one comparison plot per gene of two wig data sets.

    hela must be 2nd (wigDir2) cuz of the strand flip: wigDir1 is loaded for
    `strand`; wigDir2 for the opposite strand when flipStrand is true,
    otherwise for the same strand.

    Every line of inFile must hold five tab-separated fields: gene name,
    chrom, strand, comma-separated exon starts, comma-separated exon ends.
    Genes on another chrom/strand are skipped.  For every remaining gene the
    wig values inside its spans are drawn with plotGrassTrack (wig 1 on top,
    wig 2 flipped below), the gene model is drawn with plotGeneTrack and the
    figure is saved as <outDir>/<gene name>.degOverlapPlot.png.  Coordinates
    that have a value in both data sets are coloured 'r', the others 'k'.

    A gene with no value in either data set is skipped with a message
    instead of raising ValueError from max() on an empty sequence.

    inFile -- path of the gene/exon file
    chrom, strand -- restrict plotting to this chrom/strand
    wigDir1, wigDir2 -- wig directories handed to cgWig.loadSingleWig
    outDir -- directory the png files are written to
    withIntrons -- when true the spans between exons are included; when
                   false only exons are used and the values of later exons
                   are shifted left by the summed iLengths before them
    flipStrand -- when true wigDir2 is loaded for the opposite strand
    '''
    oppStrand = strand
    if flipStrand:
        oppStrand = bioLibCG.switchStrand(strand)

    print(' '.join(['loading Wigs', str(chrom), str(strand)]))
    coord_value1 = cgWig.loadSingleWig(wigDir1, chrom, strand, 'ALL')
    coord_value2 = cgWig.loadSingleWig(wigDir2, chrom, oppStrand, 'ALL')

    #the with block closes the file even when a gene fails to plot
    with open(inFile, 'r') as f:
        for line in f:
            gName, dChrom, dStrand, exonStarts, exonEnds = line.strip().split('\t')
            if dChrom != chrom or dStrand != strand:
                continue
            exonStarts = [int(x) for x in exonStarts.split(',')]
            exonEnds = [int(x) for x in exonEnds.split(',')]
            print(' '.join(['Plotting', gName]))

            #create the span info for boxplots (JUST EXONS!!!)
            exons = list(zip(exonStarts, exonEnds))
            introns = [(x[0] + 1, x[1] - 1) for x in zip(exonEnds[:-1], exonStarts[1:])]
            #NOTE(review): each value below is 2 larger than the inclusive
            #length of the matching span in introns (start - end - 1), so
            #shifted exons land 2 positions inside the previous exon.
            #Left as is -- confirm against what plotGeneTrack expects.
            iLengths = [x[0] - x[1] + 1 for x in zip(exonStarts[1:], exonEnds[:-1])]

            allSpans = exons[:]
            if withIntrons:
                allSpans.extend(introns)
            allSpans.sort()
            tSpan = [('exon', x) if x in exons else ('intron', x) for x in allSpans]

            #gather expression data
            c_v = {}
            c_v2 = {}
            for _spanType, (eStart, eEnd) in tSpan:
                for i in range(eStart, eEnd + 1):
                    if i in coord_value1:
                        c_v[i] = coord_value1[i]
                    if i in coord_value2:
                        c_v2[i] = coord_value2[i]

            #intron displacement for ONLY EXONS
            if not withIntrons:
                dAmount = 0 #running sum of iLengths before the current exon
                for exonIndex in range(1, len(exons)):
                    dAmount += iLengths[exonIndex - 1]
                    eStart, eEnd = exons[exonIndex]
                    for j in range(eStart, eEnd + 1):
                        if j in c_v:
                            c_v[j - dAmount] = c_v[j]
                            del c_v[j]
                        if j in c_v2:
                            c_v2[j - dAmount] = c_v2[j]
                            del c_v2[j]

            #get overall max (over both data sets so one empty set is fine)
            allValues = list(c_v.values()) + list(c_v2.values())
            if not allValues:
                print(' '.join(['Skipping', gName, '(no wig values in either data set)']))
                continue
            overMax = max(allValues)

            a, b = set(c_v.keys()), set(c_v2.keys())
            overlap = a.intersection(b)
            colors_a = ['r' if x in overlap else 'k' for x in sorted(a)]
            colors_b = ['r' if x in overlap else 'k' for x in sorted(b)]

            plotGrassTrack(c_v, [9, 15], manualMax = overMax, flip = False, colors = colors_a)
            plotGrassTrack(c_v2, [-3, 3], manualMax = overMax, flip = True, colors = colors_b)
            plotGeneTrack(tSpan, 0)

            #labels and axes
            plt.figtext(.05, .5, gName)
            plt.figtext(.05, .62, '0 -')
            plt.figtext(.05, .89, '%s -' % overMax)
            plt.figtext(.05, 1 - .62, '0 -')
            plt.figtext(.05, 1 - .89, '%s -' % overMax)
            plt.ylim(-3,15)
            frame1 = plt.gca()
            frame1.axes.get_yaxis().set_visible(False)

            if dStrand == '1':
                plt.title('Degradome Comparison (5-->3)')
            else:
                plt.title('Degradome Comparison (3-->5)')

            imgName = outDir + '/' + gName + '.degOverlapPlot.png'
            plt.savefig(imgName, bbox_inches='tight', pad_inches=1)
            #plt.show()
            plt.close('all')