def bias_correction(bam, signal, fBiasDict, rBiasDict, genome_file_name, chrName, start, end):
    """Return the k-mer bias-corrected cleavage signal for [start, end) on chrName.

    Keyword arguments:
    bam -- Open pysam alignment file to fetch reads from.
    signal -- Input signal (kept for interface compatibility; unused here).
    fBiasDict -- Forward-strand k-mer -> bias value table.
    rBiasDict -- Reverse-strand k-mer -> bias value table.
    genome_file_name -- FASTA file to fetch genomic sequences from.
    chrName, start, end -- Target genomic interval.

    Return:
    bias_corrected_signal -- One log-ratio corrected value per position.
    """
    # Parameters
    window = 50
    defaultKmerValue = 1.0

    # Initialization. The k-mer length is taken from any key of the table.
    fastaFile = Fastafile(genome_file_name)
    # next(iter(...)) replaces fBiasDict.keys()[0], which only works on Python 2.
    k_nb = len(next(iter(fBiasDict)))
    p1 = start
    p2 = end
    # Widen by half a smoothing window, then half a k-mer, so every position
    # in [p1, p2) has full context. Floor division keeps coordinates ints on
    # Python 3 (plain / returned ints on Python 2 but floats on Python 3).
    p1_w = p1 - (window // 2)
    p2_w = p2 + (window // 2)
    p1_wk = p1_w - (k_nb // 2)
    p2_wk = p2_w + (k_nb // 2)

    # Raw 5' cut counts per strand over the widened window.
    nf = [0.0] * (p2_w - p1_w)
    nr = [0.0] * (p2_w - p1_w)
    for r in bam.fetch(chrName, p1_w, p2_w):
        if (not r.is_reverse) and (r.pos > p1_w):
            nf[r.pos - p1_w] += 1.0
        if r.is_reverse and ((r.aend - 1) < p2_w):
            nr[r.aend - 1 - p1_w] += 1.0

    # Smoothed counts: running sums over a `window`-wide sliding window.
    Nf = []
    Nr = []
    fSum = sum(nf[:window])
    rSum = sum(nr[:window])
    fLast = nf[0]
    rLast = nr[0]
    for i in range(window // 2, len(nf) - (window // 2)):
        Nf.append(fSum)
        Nr.append(rSum)
        fSum -= fLast
        fSum += nf[i + (window // 2)]
        fLast = nf[i - (window // 2) + 1]
        rSum -= rLast
        rSum += nr[i + (window // 2)]
        rLast = nr[i - (window // 2) + 1]

    # Fetching sequence. The asymmetric -1/-2 and +2/+1 offsets reproduce the
    # original coordinate convention of this corrector.
    currStr = str(fastaFile.fetch(chrName, p1_wk - 1, p2_wk - 2)).upper()
    currRevComp = revcomp(str(fastaFile.fetch(chrName, p1_wk + 2, p2_wk + 1)).upper())

    # Per-position bias values; k-mers missing from the table (e.g. containing
    # N) fall back to defaultKmerValue.
    af = []
    ar = []
    for i in range(k_nb // 2, len(currStr) - (k_nb // 2) + 1):
        fseq = currStr[i - (k_nb // 2):i + (k_nb // 2)]
        rseq = currRevComp[len(currStr) - (k_nb // 2) - i:len(currStr) + (k_nb // 2) - i]
        af.append(fBiasDict.get(fseq, defaultKmerValue))
        ar.append(rBiasDict.get(rseq, defaultKmerValue))

    # Bias correction: nhat* distributes the smoothed totals according to the
    # local bias share; the corrected value is the log-ratio of observed vs
    # expected counts (+1 on both sides to avoid log(0)).
    fSum = sum(af[:window])
    rSum = sum(ar[:window])
    fLast = af[0]
    rLast = ar[0]
    bias_corrected_signal = []
    for i in range(window // 2, len(af) - (window // 2)):
        nhatf = Nf[i - (window // 2)] * (af[i] / fSum)
        nhatr = Nr[i - (window // 2)] * (ar[i] / rSum)
        zf = log(nf[i] + 1) - log(nhatf + 1)
        zr = log(nr[i] + 1) - log(nhatr + 1)
        bias_corrected_signal.append(zf + zr)
        fSum -= fLast
        fSum += af[i + (window // 2)]
        fLast = af[i - (window // 2) + 1]
        rSum -= rLast
        rSum += ar[i + (window // 2)]
        rLast = ar[i - (window // 2) + 1]

    # Termination
    fastaFile.close()
    return bias_corrected_signal
outputFile.write("fixedStep chrom="+chrName+" start="+str(p1+1)+" step=1\n") fSum = sum(af[:window]); rSum = sum(ar[:window]); fLast = af[0]; rLast = ar[0] for i in range((window/2),len(af)-(window/2)): nhatf = Nf[i-(window/2)]*(af[i]/fSum) nhatr = Nr[i-(window/2)]*(ar[i]/rSum) zf = log(nf[i]+1)-log(nhatf+1) zr = log(nr[i]+1)-log(nhatr+1) outputFile.write(str(round(zf+zr,4))+"\n") #print i+p1+1-(window/2), af[i], ar[i], fSum, rSum, Nf[i-(window/2)], Nr[i-(window/2)] fSum -= fLast; fSum += af[i+(window/2)]; fLast = af[i-(window/2)+1] rSum -= rLast; rSum += ar[i+(window/2)]; rLast = ar[i-(window/2)+1] #for i in range(p1, p2): # print i+1, z[i-p1] except Exception: continue # Closing files bamFile.close() fastaFile.close() coordFile.close() outputFile.close() # Converting to bigwig os.system(" ".join(["wigToBigWig",outputFileName,csFileName,outputFileName[:-3]+"bw"])) os.system(" ".join(["wigToBigWig",outputFileNameRaw,csFileName,outputFileNameRaw[:-3]+"bw"])) #os.system(" ".join(["rm",outputFileName]))
def bias_correction(chrom, start, end, bam, bias_table, genome_file_name, forward_shift, reverse_shift):
    """Return the k-mer bias-corrected (expected) signal for [start, end).

    Keyword arguments:
    chrom, start, end -- Target genomic interval.
    bam -- Open pysam alignment file to fetch reads from.
    bias_table -- (forward_dict, reverse_dict) k-mer -> bias value tables.
    genome_file_name -- FASTA file to fetch genomic sequences from.
    forward_shift, reverse_shift -- Offsets locating the cut site of each read.

    Return:
    bc_signal -- Expected counts (nhatf + nhatr) per position, or raw cut
        counts when the interval is too close to the chromosome start to
        widen the smoothing window.
    """
    # Parameters
    window = 50
    defaultKmerValue = 1.0

    # Initialization
    fastaFile = Fastafile(genome_file_name)
    fBiasDict = bias_table[0]
    rBiasDict = bias_table[1]
    # next(iter(...)) replaces fBiasDict.keys()[0], which only works on Python 2.
    k_nb = len(next(iter(fBiasDict)))
    p1 = start
    p2 = end
    # // keeps coordinates and indices ints on Python 3.
    p1_w = p1 - (window // 2)
    p2_w = p2 + (window // 2)
    p1_wk = p1_w - int(floor(k_nb / 2.))
    p2_wk = p2_w + int(ceil(k_nb / 2.))

    if p1 <= 0 or p1_w <= 0 or p2_wk <= 0:
        # Cannot widen the window near the chromosome start: return raw counts.
        bc_signal = [0.0] * (p2 - p1)
        for read in bam.fetch(chrom, p1, p2):
            # check if the read is unmapped, according to issue #112
            if read.is_unmapped:
                continue
            if not read.is_reverse:
                cut_site = read.pos + forward_shift
                if p1 <= cut_site < p2:
                    bc_signal[cut_site - p1] += 1.0
            else:
                cut_site = read.aend + reverse_shift - 1
                if p1 <= cut_site < p2:
                    bc_signal[cut_site - p1] += 1.0
        return bc_signal

    # Raw cut-site counts per strand over the widened window.
    nf = [0.0] * (p2_w - p1_w)
    nr = [0.0] * (p2_w - p1_w)
    for read in bam.fetch(chrom, p1_w, p2_w):
        # check if the read is unmapped, according to issue #112
        if read.is_unmapped:
            continue
        if not read.is_reverse:
            cut_site = read.pos + forward_shift
            if p1_w <= cut_site < p2_w:
                nf[cut_site - p1_w] += 1.0
        else:
            cut_site = read.aend + reverse_shift - 1
            if p1_w <= cut_site < p2_w:
                nr[cut_site - p1_w] += 1.0

    # Smoothed counts: running sums over a `window`-wide sliding window.
    Nf = []
    Nr = []
    f_sum = sum(nf[:window])
    r_sum = sum(nr[:window])
    f_last = nf[0]
    r_last = nr[0]
    for i in range(window // 2, len(nf) - (window // 2)):
        Nf.append(f_sum)
        Nr.append(r_sum)
        f_sum -= f_last
        f_sum += nf[i + (window // 2)]
        f_last = nf[i - (window // 2) + 1]
        r_sum -= r_last
        r_sum += nr[i + (window // 2)]
        r_last = nr[i - (window // 2) + 1]

    # Fetching sequence
    currStr = str(fastaFile.fetch(chrom, p1_wk, p2_wk - 1)).upper()
    currRevComp = AuxiliaryFunctions.revcomp(str(fastaFile.fetch(chrom, p1_wk + 1, p2_wk)).upper())

    # Per-position bias values; unknown k-mers fall back to defaultKmerValue.
    af = []
    ar = []
    for i in range(int(ceil(k_nb / 2.)), len(currStr) - int(floor(k_nb / 2)) + 1):
        fseq = currStr[i - int(floor(k_nb / 2.)):i + int(ceil(k_nb / 2.))]
        rseq = currRevComp[len(currStr) - int(ceil(k_nb / 2.)) - i:len(currStr) + int(floor(k_nb / 2.)) - i]
        af.append(fBiasDict.get(fseq, defaultKmerValue))
        ar.append(rBiasDict.get(rseq, defaultKmerValue))

    # Expected counts: smoothed totals distributed by the local bias share.
    f_sum = sum(af[:window])
    r_sum = sum(ar[:window])
    f_last = af[0]
    r_last = ar[0]
    bc_signal = []
    for i in range(window // 2, len(af) - (window // 2)):
        nhatf = Nf[i - (window // 2)] * (af[i] / f_sum)
        nhatr = Nr[i - (window // 2)] * (ar[i] / r_sum)
        bc_signal.append(nhatf + nhatr)
        f_sum -= f_last
        f_sum += af[i + (window // 2)]
        f_last = af[i - (window // 2) + 1]
        r_sum -= r_last
        r_sum += ar[i + (window // 2)]
        r_last = ar[i - (window // 2) + 1]

    # Termination
    fastaFile.close()
    return bc_signal
def bias_correction(bam, signal, fBiasDict, rBiasDict, genome_file_name, chrName, start, end):
    """Return the k-mer bias-corrected cleavage signal for [start, end) on chrName.

    Keyword arguments:
    bam -- Open pysam alignment file to fetch reads from.
    signal -- Input signal (kept for interface compatibility; unused here).
    fBiasDict -- Forward-strand k-mer -> bias value table.
    rBiasDict -- Reverse-strand k-mer -> bias value table.
    genome_file_name -- FASTA file to fetch genomic sequences from.
    chrName, start, end -- Target genomic interval.

    Return:
    bias_corrected_signal -- One log-ratio corrected value per position.
    """
    # Parameters
    window = 50
    defaultKmerValue = 1.0

    # Initialization. The k-mer length is taken from any key of the table;
    # next(iter(...)) replaces the Python-2-only fBiasDict.keys()[0].
    fastaFile = Fastafile(genome_file_name)
    k_nb = len(next(iter(fBiasDict)))
    p1 = start
    p2 = end
    # Widen by half a smoothing window, then half a k-mer. Floor division
    # keeps all coordinates/indices ints on Python 3.
    p1_w = p1 - (window // 2)
    p2_w = p2 + (window // 2)
    p1_wk = p1_w - (k_nb // 2)
    p2_wk = p2_w + (k_nb // 2)

    # Raw 5' cut counts per strand over the widened window.
    nf = [0.0] * (p2_w - p1_w)
    nr = [0.0] * (p2_w - p1_w)
    for r in bam.fetch(chrName, p1_w, p2_w):
        if (not r.is_reverse) and (r.pos > p1_w):
            nf[r.pos - p1_w] += 1.0
        if r.is_reverse and ((r.aend - 1) < p2_w):
            nr[r.aend - 1 - p1_w] += 1.0

    # Smoothed counts: running sums over a `window`-wide sliding window.
    Nf = []
    Nr = []
    fSum = sum(nf[:window])
    rSum = sum(nr[:window])
    fLast = nf[0]
    rLast = nr[0]
    for i in range(window // 2, len(nf) - (window // 2)):
        Nf.append(fSum)
        Nr.append(rSum)
        fSum -= fLast
        fSum += nf[i + (window // 2)]
        fLast = nf[i - (window // 2) + 1]
        rSum -= rLast
        rSum += nr[i + (window // 2)]
        rLast = nr[i - (window // 2) + 1]

    # Fetching sequence. The asymmetric -1/-2 and +2/+1 offsets reproduce the
    # original coordinate convention of this corrector.
    currStr = str(fastaFile.fetch(chrName, p1_wk - 1, p2_wk - 2)).upper()
    currRevComp = revcomp(str(fastaFile.fetch(chrName, p1_wk + 2, p2_wk + 1)).upper())

    # Per-position bias values; k-mers missing from the table fall back to
    # defaultKmerValue.
    af = []
    ar = []
    for i in range(k_nb // 2, len(currStr) - (k_nb // 2) + 1):
        fseq = currStr[i - (k_nb // 2):i + (k_nb // 2)]
        rseq = currRevComp[len(currStr) - (k_nb // 2) - i:len(currStr) + (k_nb // 2) - i]
        af.append(fBiasDict.get(fseq, defaultKmerValue))
        ar.append(rBiasDict.get(rseq, defaultKmerValue))

    # Correction: nhat* distributes the smoothed totals by the local bias
    # share; corrected value is the log-ratio observed/expected (+1 each).
    fSum = sum(af[:window])
    rSum = sum(ar[:window])
    fLast = af[0]
    rLast = ar[0]
    bias_corrected_signal = []
    for i in range(window // 2, len(af) - (window // 2)):
        nhatf = Nf[i - (window // 2)] * (af[i] / fSum)
        nhatr = Nr[i - (window // 2)] * (ar[i] / rSum)
        zf = log(nf[i] + 1) - log(nhatf + 1)
        zr = log(nr[i] + 1) - log(nhatr + 1)
        bias_corrected_signal.append(zf + zr)
        fSum -= fLast
        fSum += af[i + (window // 2)]
        fLast = af[i - (window // 2) + 1]
        rSum -= rLast
        rSum += ar[i + (window // 2)]
        rLast = ar[i - (window // 2) + 1]

    # Termination
    fastaFile.close()
    return bias_corrected_signal
def estimate_bias_kmer(args):
    """Estimate strand-specific k-mer cleavage-bias tables and write them out.

    Observed k-mer frequencies are collected around read cut sites inside the
    given HS regions; expected frequencies come from every k-mer of the region
    sequence. The tables written are observed/expected frequency ratios.

    Keyword arguments:
    args -- Namespace with reads_file, organism, regions_file, k_nb,
        forward_shift, reverse_shift, output_location and output_prefix.
    """
    # Parameters
    maxDuplicates = 100  # cap on reads sharing the same 5' position (PCR artifacts)
    pseudocount = 1.0

    # Initializing bam and fasta
    bamFile = Samfile(args.reads_file, "rb")
    genome_data = GenomeData(args.organism)
    fastaFile = Fastafile(genome_data.get_genome())
    regions = GenomicRegionSet("regions")
    regions.read(args.regions_file)

    # Observed/expected k-mer counts per strand.
    obsDictF = dict()
    obsDictR = dict()
    expDictF = dict()
    expDictR = dict()
    ct_reads_r = 0
    ct_reads_f = 0
    ct_kmers = 0

    # Iterating on HS regions
    for region in regions:
        # Initialization
        prevPos = -1
        trueCounter = 0

        # Evaluating observed frequencies: the k-mer centered on each read's
        # shifted cut site.
        for r in bamFile.fetch(region.chrom, region.initial, region.final):
            # Calculating positions (// == int(floor(...)) for non-negative k_nb)
            if not r.is_reverse:
                cut_site = r.pos + args.forward_shift - 1
            else:
                cut_site = r.aend + args.reverse_shift + 1
            p1 = cut_site - (args.k_nb // 2)
            p2 = p1 + args.k_nb

            # Verifying PCR artifacts
            if p1 == prevPos:
                trueCounter += 1
            else:
                prevPos = p1
                trueCounter = 0
            if trueCounter > maxDuplicates:
                continue

            # Fetching k-mer; best-effort skip of out-of-range coordinates.
            try:
                currStr = str(fastaFile.fetch(region.chrom, p1, p2)).upper()
            except Exception:
                continue
            if r.is_reverse:
                currStr = AuxiliaryFunctions.revcomp(currStr)

            # Counting k-mer in dictionary
            if not r.is_reverse:
                ct_reads_f += 1
                obsDictF[currStr] = obsDictF.get(currStr, 0) + 1
            else:
                ct_reads_r += 1
                obsDictR[currStr] = obsDictR.get(currStr, 0) + 1

        # Evaluating expected frequencies: every k-mer of the region sequence,
        # on both strands.
        try:
            currStr = str(fastaFile.fetch(region.chrom, region.initial, region.final)).upper()
        except Exception:
            continue
        currRevComp = AuxiliaryFunctions.revcomp(currStr)
        for i in range(0, len(currStr) - args.k_nb):
            ct_kmers += 1
            s = currStr[i:i + args.k_nb]
            expDictF[s] = expDictF.get(s, 0) + 1
            # Same position on the reverse complement.
            s = currRevComp[i:i + args.k_nb]
            expDictR[s] = expDictR.get(s, 0) + 1

    # Closing files
    bamFile.close()
    fastaFile.close()

    # Creating bias dictionary: bias = pseudocounted observed fraction over
    # pseudocounted expected fraction; 1 when a strand saw no reads at all.
    alphabet = ["A", "C", "G", "T"]
    kmerComb = ["".join(e) for e in product(alphabet, repeat=args.k_nb)]
    bias_table_F = dict([(e, 0.0) for e in kmerComb])
    bias_table_R = dict([(e, 0.0) for e in kmerComb])
    for kmer in kmerComb:
        obsF = obsDictF.get(kmer, 0) + pseudocount
        expF = expDictF.get(kmer, 0) + pseudocount
        if ct_reads_f == 0:
            bias_table_F[kmer] = 1
        else:
            bias_table_F[kmer] = round(float(obsF / ct_reads_f) / float(expF / ct_kmers), 6)
        obsR = obsDictR.get(kmer, 0) + pseudocount
        expR = expDictR.get(kmer, 0) + pseudocount
        if ct_reads_r == 0:
            bias_table_R[kmer] = 1
        else:
            bias_table_R[kmer] = round(float(obsR / ct_reads_r) / float(expR / ct_kmers), 6)

    write_table(args.output_location, args.output_prefix, [bias_table_F, bias_table_R])
def bias_correction(self, signal, bias_table, genome_file_name, chrName, start, end,
                    forward_shift, reverse_shift, strands_specific):
    """
    Performs k-mer cleavage-bias correction of the signal in [start, end).

    Keyword arguments:
    signal -- Input signal; returned unchanged when correction is impossible.
    bias_table -- (forward_dict, reverse_dict) k-mer -> bias value tables.
    genome_file_name -- FASTA file to fetch genomic sequences from.
    chrName, start, end -- Target genomic interval.
    forward_shift, reverse_shift -- Offsets locating each read's cut site.
    strands_specific -- When True, return (forward, reverse) signals shifted
        to be non-negative; otherwise return the combined corrected signal.

    Return:
    bias_corrected_signal -- Bias-corrected sequence (or a per-strand pair).
    """
    if not bias_table:
        return signal

    # Parameters
    window = 50
    defaultKmerValue = 1.0

    # Initialization. next(iter(...)) replaces the Python-2-only keys()[0];
    # // keeps coordinates and indices ints on Python 3.
    fastaFile = Fastafile(genome_file_name)
    fBiasDict = bias_table[0]
    rBiasDict = bias_table[1]
    k_nb = len(next(iter(fBiasDict)))
    p1 = start
    p2 = end
    p1_w = p1 - (window // 2)
    p2_w = p2 + (window // 2)
    p1_wk = p1_w - int(floor(k_nb / 2.))
    p2_wk = p2_w + int(ceil(k_nb / 2.))
    if p1 <= 0 or p1_w <= 0 or p1_wk <= 0:
        # Too close to the chromosome start to widen the window.
        return signal

    # Raw cut-site counts over the widened window.
    # NOTE(review): cut sites are only accepted inside [start, end), so the
    # widened margins stay zero — preserved from the original; confirm intent.
    nf = [0.0] * (p2_w - p1_w)
    nr = [0.0] * (p2_w - p1_w)
    for read in self.bam.fetch(chrName, p1_w, p2_w):
        if not read.is_reverse:
            cut_site = read.pos + forward_shift
            if start <= cut_site < end:
                nf[cut_site - p1_w] += 1.0
        else:
            cut_site = read.aend + reverse_shift - 1
            if start <= cut_site < end:
                nr[cut_site - p1_w] += 1.0

    # Smoothed counts: running sums over a `window`-wide sliding window.
    Nf = []
    Nr = []
    fSum = sum(nf[:window])
    rSum = sum(nr[:window])
    fLast = nf[0]
    rLast = nr[0]
    for i in range(window // 2, len(nf) - (window // 2)):
        Nf.append(fSum)
        Nr.append(rSum)
        fSum -= fLast
        fSum += nf[i + (window // 2)]
        fLast = nf[i - (window // 2) + 1]
        rSum -= rLast
        rSum += nr[i + (window // 2)]
        rLast = nr[i - (window // 2) + 1]

    # Fetching sequence (asymmetric offsets preserved from the original).
    currStr = str(fastaFile.fetch(chrName, p1_wk - 1, p2_wk - 2)).upper()
    currRevComp = AuxiliaryFunctions.revcomp(
        str(fastaFile.fetch(chrName, p1_wk + 2, p2_wk + 1)).upper())

    # Per-position bias values; unknown k-mers fall back to defaultKmerValue.
    af = []
    ar = []
    for i in range(int(ceil(k_nb / 2.)), len(currStr) - int(floor(k_nb / 2)) + 1):
        fseq = currStr[i - int(floor(k_nb / 2.)):i + int(ceil(k_nb / 2.))]
        rseq = currRevComp[len(currStr) - int(ceil(k_nb / 2.)) - i:len(currStr) + int(floor(k_nb / 2.)) - i]
        af.append(fBiasDict.get(fseq, defaultKmerValue))
        ar.append(rBiasDict.get(rseq, defaultKmerValue))

    # Correction: nhat* distributes the smoothed totals by the local bias
    # share; corrected value is the log-ratio observed/expected (+1 each).
    fSum = sum(af[:window])
    rSum = sum(ar[:window])
    fLast = af[0]
    rLast = ar[0]
    bias_corrected_signal = []
    bias_corrected_signal_forward = []
    bias_corrected_signal_reverse = []
    for i in range(window // 2, len(af) - (window // 2)):
        nhatf = Nf[i - (window // 2)] * (af[i] / fSum)
        nhatr = Nr[i - (window // 2)] * (ar[i] / rSum)
        zf = log(nf[i] + 1) - log(nhatf + 1)
        zr = log(nr[i] + 1) - log(nhatr + 1)
        bias_corrected_signal_forward.append(zf)
        bias_corrected_signal_reverse.append(zr)
        bias_corrected_signal.append(zf + zr)
        fSum -= fLast
        fSum += af[i + (window // 2)]
        fLast = af[i - (window // 2) + 1]
        rSum -= rLast
        rSum += ar[i + (window // 2)]
        rLast = ar[i - (window // 2) + 1]

    # Shift per-strand signals by |min| (intended to make them non-negative;
    # NOTE(review): when the minimum is already positive this shifts further
    # up — behavior preserved from the original).
    min_value = abs(min(bias_corrected_signal_forward))
    bias_fixed_signal_forward = [e + min_value for e in bias_corrected_signal_forward]
    min_value = abs(min(bias_corrected_signal_reverse))
    bias_fixed_signal_reverse = [e + min_value for e in bias_corrected_signal_reverse]

    # Termination
    fastaFile.close()
    if not strands_specific:
        return bias_corrected_signal
    return bias_fixed_signal_forward, bias_fixed_signal_reverse
def bias_correction_atac(self, bias_table, genome_file_name, chrName, start, end,
                         forward_shift, reverse_shift):
    """
    Computes the bias-expected ATAC-seq signal per strand for [start, end).

    Keyword arguments:
    bias_table -- (forward_dict, reverse_dict) k-mer -> bias value tables.
    genome_file_name -- FASTA file to fetch genomic sequences from.
    chrName, start, end -- Target genomic interval.
    forward_shift, reverse_shift -- Offsets locating each read's cut site.

    Return:
    (forward, reverse) -- Expected counts per strand, or raw strand counts
        when the interval is too close to the chromosome start to widen.
    """
    # Parameters
    window = 50
    defaultKmerValue = 1.0

    # Initialization. next(iter(...)) replaces the Python-2-only keys()[0];
    # // keeps coordinates and indices ints on Python 3.
    fastaFile = Fastafile(genome_file_name)
    fBiasDict = bias_table[0]
    rBiasDict = bias_table[1]
    k_nb = len(next(iter(fBiasDict)))
    p1 = start
    p2 = end
    p1_w = p1 - (window // 2)
    p2_w = p2 + (window // 2)
    p1_wk = p1_w - int(floor(k_nb / 2.))
    p2_wk = p2_w + int(ceil(k_nb / 2.))

    if p1 <= 0 or p1_w <= 0 or p2_wk <= 0:
        # Cannot widen near the chromosome start: return raw strand counts.
        nf = [0.0] * (p2 - p1)
        nr = [0.0] * (p2 - p1)
        for read in self.bam.fetch(chrName, p1, p2):
            if not read.is_reverse:
                cut_site = read.pos + forward_shift
                if p1 <= cut_site < p2:
                    nf[cut_site - p1] += 1.0
            else:
                cut_site = read.aend + reverse_shift - 1
                if p1 <= cut_site < p2:
                    nr[cut_site - p1] += 1.0
        return nf, nr

    # Raw cut-site counts per strand over the widened window.
    nf = [0.0] * (p2_w - p1_w)
    nr = [0.0] * (p2_w - p1_w)
    for read in self.bam.fetch(chrName, p1_w, p2_w):
        if not read.is_reverse:
            cut_site = read.pos + forward_shift
            if p1_w <= cut_site < p2_w:
                nf[cut_site - p1_w] += 1.0
        else:
            cut_site = read.aend + reverse_shift - 1
            if p1_w <= cut_site < p2_w:
                nr[cut_site - p1_w] += 1.0

    # Smoothed counts: running sums over a `window`-wide sliding window.
    Nf = []
    Nr = []
    fSum = sum(nf[:window])
    rSum = sum(nr[:window])
    fLast = nf[0]
    rLast = nr[0]
    for i in range(window // 2, len(nf) - (window // 2)):
        Nf.append(fSum)
        Nr.append(rSum)
        fSum -= fLast
        fSum += nf[i + (window // 2)]
        fLast = nf[i - (window // 2) + 1]
        rSum -= rLast
        rSum += nr[i + (window // 2)]
        rLast = nr[i - (window // 2) + 1]

    # Fetching sequence
    currStr = str(fastaFile.fetch(chrName, p1_wk, p2_wk - 1)).upper()
    currRevComp = AuxiliaryFunctions.revcomp(str(fastaFile.fetch(chrName, p1_wk + 1, p2_wk)).upper())

    # Per-position bias values; unknown k-mers fall back to defaultKmerValue.
    af = []
    ar = []
    for i in range(int(ceil(k_nb / 2.)), len(currStr) - int(floor(k_nb / 2)) + 1):
        fseq = currStr[i - int(floor(k_nb / 2.)):i + int(ceil(k_nb / 2.))]
        rseq = currRevComp[len(currStr) - int(ceil(k_nb / 2.)) - i:len(currStr) + int(floor(k_nb / 2.)) - i]
        af.append(fBiasDict.get(fseq, defaultKmerValue))
        ar.append(rBiasDict.get(rseq, defaultKmerValue))

    # Expected per-strand counts: smoothed totals distributed by bias share.
    fSum = sum(af[:window])
    rSum = sum(ar[:window])
    fLast = af[0]
    rLast = ar[0]
    bias_corrected_signal_forward = []
    bias_corrected_signal_reverse = []
    for i in range(window // 2, len(af) - (window // 2)):
        nhatf = Nf[i - (window // 2)] * (af[i] / fSum)
        nhatr = Nr[i - (window // 2)] * (ar[i] / rSum)
        bias_corrected_signal_forward.append(nhatf)
        bias_corrected_signal_reverse.append(nhatr)
        fSum -= fLast
        fSum += af[i + (window // 2)]
        fLast = af[i - (window // 2) + 1]
        rSum -= rLast
        rSum += ar[i + (window // 2)]
        rLast = ar[i - (window // 2) + 1]

    # Termination
    fastaFile.close()
    return bias_corrected_signal_forward, bias_corrected_signal_reverse
def bias_correction(self, signal, bias_table, genome_file_name, chrName, start, end,
                    forward_shift, reverse_shift, strands_specific):
    """
    Performs k-mer cleavage-bias correction of the signal in [start, end).

    Keyword arguments:
    signal -- Input signal; returned unchanged when correction is impossible.
    bias_table -- (forward_dict, reverse_dict) k-mer -> bias value tables.
    genome_file_name -- FASTA file to fetch genomic sequences from.
    chrName, start, end -- Target genomic interval.
    forward_shift, reverse_shift -- Offsets locating each read's cut site.
    strands_specific -- When True, return (forward, reverse) signals shifted
        to be non-negative; otherwise return the combined corrected signal.

    Return:
    bias_corrected_signal -- Bias-corrected sequence (or a per-strand pair).
    """
    if not bias_table:
        return signal

    # Parameters
    window = 50
    defaultKmerValue = 1.0

    # Initialization. next(iter(...)) replaces the Python-2-only keys()[0];
    # // keeps coordinates and indices ints on Python 3.
    fastaFile = Fastafile(genome_file_name)
    fBiasDict = bias_table[0]
    rBiasDict = bias_table[1]
    k_nb = len(next(iter(fBiasDict)))
    p1 = start
    p2 = end
    p1_w = p1 - (window // 2)
    p2_w = p2 + (window // 2)
    p1_wk = p1_w - int(floor(k_nb / 2.))
    p2_wk = p2_w + int(ceil(k_nb / 2.))
    if p1 <= 0 or p1_w <= 0 or p1_wk <= 0:
        # Too close to the chromosome start to widen the window.
        return signal

    # Raw cut-site counts over the widened window.
    # NOTE(review): cut sites are only accepted inside [start, end), so the
    # widened margins stay zero — preserved from the original; confirm intent.
    nf = [0.0] * (p2_w - p1_w)
    nr = [0.0] * (p2_w - p1_w)
    for read in self.bam.fetch(chrName, p1_w, p2_w):
        if not read.is_reverse:
            cut_site = read.pos + forward_shift
            if start <= cut_site < end:
                nf[cut_site - p1_w] += 1.0
        else:
            cut_site = read.aend + reverse_shift - 1
            if start <= cut_site < end:
                nr[cut_site - p1_w] += 1.0

    # Smoothed counts: running sums over a `window`-wide sliding window.
    Nf = []
    Nr = []
    fSum = sum(nf[:window])
    rSum = sum(nr[:window])
    fLast = nf[0]
    rLast = nr[0]
    for i in range(window // 2, len(nf) - (window // 2)):
        Nf.append(fSum)
        Nr.append(rSum)
        fSum -= fLast
        fSum += nf[i + (window // 2)]
        fLast = nf[i - (window // 2) + 1]
        rSum -= rLast
        rSum += nr[i + (window // 2)]
        rLast = nr[i - (window // 2) + 1]

    # Fetching sequence (asymmetric offsets preserved from the original).
    currStr = str(fastaFile.fetch(chrName, p1_wk - 1, p2_wk - 2)).upper()
    currRevComp = AuxiliaryFunctions.revcomp(
        str(fastaFile.fetch(chrName, p1_wk + 2, p2_wk + 1)).upper())

    # Per-position bias values; unknown k-mers fall back to defaultKmerValue.
    af = []
    ar = []
    for i in range(int(ceil(k_nb / 2.)), len(currStr) - int(floor(k_nb / 2)) + 1):
        fseq = currStr[i - int(floor(k_nb / 2.)):i + int(ceil(k_nb / 2.))]
        rseq = currRevComp[len(currStr) - int(ceil(k_nb / 2.)) - i:len(currStr) + int(floor(k_nb / 2.)) - i]
        af.append(fBiasDict.get(fseq, defaultKmerValue))
        ar.append(rBiasDict.get(rseq, defaultKmerValue))

    # Correction: nhat* distributes the smoothed totals by the local bias
    # share; corrected value is the log-ratio observed/expected (+1 each).
    fSum = sum(af[:window])
    rSum = sum(ar[:window])
    fLast = af[0]
    rLast = ar[0]
    bias_corrected_signal = []
    bias_corrected_signal_forward = []
    bias_corrected_signal_reverse = []
    for i in range(window // 2, len(af) - (window // 2)):
        nhatf = Nf[i - (window // 2)] * (af[i] / fSum)
        nhatr = Nr[i - (window // 2)] * (ar[i] / rSum)
        zf = log(nf[i] + 1) - log(nhatf + 1)
        zr = log(nr[i] + 1) - log(nhatr + 1)
        bias_corrected_signal_forward.append(zf)
        bias_corrected_signal_reverse.append(zr)
        bias_corrected_signal.append(zf + zr)
        fSum -= fLast
        fSum += af[i + (window // 2)]
        fLast = af[i - (window // 2) + 1]
        rSum -= rLast
        rSum += ar[i + (window // 2)]
        rLast = ar[i - (window // 2) + 1]

    # Shift per-strand signals by |min| (intended to make them non-negative;
    # NOTE(review): when the minimum is already positive this shifts further
    # up — behavior preserved from the original).
    min_value = abs(min(bias_corrected_signal_forward))
    bias_fixed_signal_forward = [e + min_value for e in bias_corrected_signal_forward]
    min_value = abs(min(bias_corrected_signal_reverse))
    bias_fixed_signal_reverse = [e + min_value for e in bias_corrected_signal_reverse]

    # Termination
    fastaFile.close()
    if not strands_specific:
        return bias_corrected_signal
    return bias_fixed_signal_forward, bias_fixed_signal_reverse
def estimate_table_pwm(self, regions, dnase_file_name, genome_file_name, k_nb, forward_shift, reverse_shift):
    """
    Estimates bias based on HS regions, DNase-seq signal and genomic sequences,
    using PWMs built from observed and expected k-mers.

    Keyword arguments:
    regions -- DNase-seq HS regions.
    dnase_file_name -- DNase-seq BAM file name.
    genome_file_name -- Genome to fetch genomic sequences from.
    k_nb -- k-mer length used to build the PWMs.
    forward_shift, reverse_shift -- Offsets applied to read ends to locate the cut site.

    Return:
    bias_table_F, bias_table_R -- Bias tables (PWM-score ratios per k-mer).
    Returns None when the input is not a BAM file.
    """
    # Initializing bam and fasta
    if (dnase_file_name.split(".")[-1].upper() != "BAM"): return None  # TODO ERROR
    bamFile = Samfile(dnase_file_name, "rb")
    fastaFile = Fastafile(genome_file_name)
    # Observed/expected k-mer sequences collected per strand.
    obsSeqsF = []
    obsSeqsR = []
    expSeqsF = []
    expSeqsR = []
    # Iterating on HS regions
    for region in regions:
        # Evaluating observed frequencies
        # Fetching reads
        for r in bamFile.fetch(region.chrom, region.initial, region.final):
            # Calculating positions: k-mer window centered on the shifted cut site.
            # if(not r.is_reverse): p1 = r.pos - (k_nb/2) - 1 + shift
            # else: p1 = r.aend - (k_nb/2) + 1 - shift
            if (not r.is_reverse):
                cut_site = r.pos + forward_shift - 1
                p1 = cut_site - int(floor(k_nb / 2))
            else:
                cut_site = r.aend + reverse_shift + 1
                p1 = cut_site - int(floor(k_nb / 2))
            p2 = p1 + k_nb
            # Fetching k-mer; out-of-range coordinates are skipped best-effort.
            try:
                currStr = str(fastaFile.fetch(region.chrom, p1, p2)).upper()
            except Exception:
                continue
            if (r.is_reverse): currStr = AuxiliaryFunctions.revcomp(currStr)
            # Counting k-mer in dictionary (skip k-mers containing N)
            if 'N' not in currStr:
                if (not r.is_reverse):
                    obsSeqsF.append(Seq(currStr))
                else:
                    obsSeqsR.append(Seq(currStr))
        # Evaluating expected frequencies
        # Fetching whole sequence
        try:
            currStr = str(fastaFile.fetch(region.chrom, region.initial, region.final)).upper()
        except Exception:
            continue
        currRevComp = AuxiliaryFunctions.revcomp(currStr)
        # Iterating on each sequence position
        for i in range(0, len(currStr) - k_nb):
            s = currStr[i:i + k_nb]
            # NOTE(review): this tests the WHOLE region string for 'N' on every
            # iteration — one N anywhere discards all expected k-mers of the
            # region, while the observed loop above tests only the k-mer.
            # Looks like it should be `if 'N' not in s:` — confirm.
            if 'N' not in currStr:
                # Counting k-mer in dictionary
                expSeqsF.append(Seq(s))
                # Counting k-mer in dictionary for reverse complement
                s = currRevComp[i:i + k_nb]
                expSeqsR.append(Seq(s))
    # Closing files
    bamFile.close()
    fastaFile.close()
    # Building PWMs from the collected sequences.
    # NOTE(review): motifs.create on an empty list will raise — a strand with
    # no usable reads crashes here; confirm inputs are always non-empty.
    obsMotifsF = motifs.create(obsSeqsF)
    obsMotifsR = motifs.create(obsSeqsR)
    expMotifsF = motifs.create(expSeqsF)
    expMotifsR = motifs.create(expSeqsR)
    obsPwmF = obsMotifsF.pwm
    obsPwmR = obsMotifsR.pwm
    expPwmF = expMotifsF.pwm
    expPwmR = expMotifsR.pwm
    # Output logos (observed logos use a larger y-scale than expected ones).
    logo_obs_f = os.path.join(self.output_loc, "Bias", "logo",
                              "obs_{}_{}_f.pdf".format(str(k_nb), str(forward_shift)))
    logo_obs_r = os.path.join(self.output_loc, "Bias", "logo",
                              "obs_{}_{}_r.pdf".format(str(k_nb), str(forward_shift)))
    logo_exp_f = os.path.join(self.output_loc, "Bias", "logo",
                              "exp_{}_{}_f.pdf".format(str(k_nb), str(forward_shift)))
    logo_exp_r = os.path.join(self.output_loc, "Bias", "logo",
                              "exp_{}_{}_r.pdf".format(str(k_nb), str(forward_shift)))
    obsMotifsF.weblogo(logo_obs_f, format="pdf", stack_width="large", color_scheme="color_classic",
                       yaxis_scale=0.2, yaxis_tic_interval=0.1)
    obsMotifsR.weblogo(logo_obs_r, format="pdf", stack_width="large", color_scheme="color_classic",
                       yaxis_scale=0.2, yaxis_tic_interval=0.1)
    expMotifsF.weblogo(logo_exp_f, format="pdf", stack_width="large", color_scheme="color_classic",
                       yaxis_scale=0.02, yaxis_tic_interval=0.01)
    expMotifsR.weblogo(logo_exp_r, format="pdf", stack_width="large", color_scheme="color_classic",
                       yaxis_scale=0.02, yaxis_tic_interval=0.01)
    # Output pwms
    pwm_data_list = [obsPwmF, obsPwmR, expPwmF, expPwmR]
    pwm_file_list = []
    pwm_obs_f = os.path.join(self.output_loc, "Bias", "pwm",
                             "obs_{}_{}_f.pwm".format(str(k_nb), str(forward_shift)))
    pwm_obs_r = os.path.join(self.output_loc, "Bias", "pwm",
                             "obs_{}_{}_r.pwm".format(str(k_nb), str(forward_shift)))
    pwm_exp_f = os.path.join(self.output_loc, "Bias", "pwm",
                             "exp_{}_{}_f.pwm".format(str(k_nb), str(forward_shift)))
    pwm_exp_r = os.path.join(self.output_loc, "Bias", "pwm",
                             "exp_{}_{}_r.pwm".format(str(k_nb), str(forward_shift)))
    pwm_file_list.append(pwm_obs_f)
    pwm_file_list.append(pwm_obs_r)
    pwm_file_list.append(pwm_exp_f)
    pwm_file_list.append(pwm_exp_r)
    for i in range(len(pwm_data_list)):
        with open(pwm_file_list[i], "w") as f:
            f.write(str(pwm_data_list[i]))
    # Creating bias dictionary: PWM score ratio observed/expected per k-mer.
    alphabet = ["A", "C", "G", "T"]
    k_mer_comb = ["".join(e) for e in product(alphabet, repeat=k_nb)]
    bias_table_F = dict([(e, 0.0) for e in k_mer_comb])
    bias_table_R = dict([(e, 0.0) for e in k_mer_comb])
    for k_mer in k_mer_comb:
        obsF = self.get_pwm_score(k_mer, obsPwmF, k_nb)
        expF = self.get_pwm_score(k_mer, expPwmF, k_nb)
        bias_table_F[k_mer] = round(obsF / expF, 6)
        obsR = self.get_pwm_score(k_mer, obsPwmR, k_nb)
        expR = self.get_pwm_score(k_mer, expPwmR, k_nb)
        bias_table_R[k_mer] = round(obsR / expR, 6)
    # Return
    return [bias_table_F, bias_table_R]
def estimate_table(self, regions, dnase_file_name, genome_file_name, k_nb,
                   forward_shift, reverse_shift):
    """
    Estimates k-mer bias based on HS regions, DNase-seq signal and genomic sequences.

    Keyword arguments:
    regions -- DNase-seq HS regions.
    dnase_file_name -- DNase-seq BAM file name.
    genome_file_name -- Genome to fetch genomic sequences from.
    k_nb -- k-mer length.
    forward_shift, reverse_shift -- Offsets locating each read's cut site.

    Return:
    bias_table_F, bias_table_R -- Bias tables, or None for non-BAM input.
    """
    # Parameters
    maxDuplicates = 100  # cap on reads sharing the same 5' position (PCR artifacts)
    pseudocount = 1.0

    # Initializing bam and fasta
    if dnase_file_name.split(".")[-1].upper() != "BAM":
        return None  # TODO ERROR
    bamFile = Samfile(dnase_file_name, "rb")
    fastaFile = Fastafile(genome_file_name)

    # Observed/expected k-mer counts per strand.
    obsDictF = dict()
    obsDictR = dict()
    expDictF = dict()
    expDictR = dict()
    ct_reads_r = 0
    ct_reads_f = 0
    ct_kmers = 0

    # Iterating on HS regions
    for region in regions:
        # Initialization
        prevPos = -1
        trueCounter = 0

        # Evaluating observed frequencies: the k-mer centered on each read's
        # shifted cut site.
        for r in bamFile.fetch(region.chrom, region.initial, region.final):
            # Calculating positions (// == int(floor(...)) for non-negative k_nb)
            if not r.is_reverse:
                cut_site = r.pos + forward_shift - 1
            else:
                cut_site = r.aend + reverse_shift + 1
            p1 = cut_site - (k_nb // 2)
            p2 = p1 + k_nb

            # Verifying PCR artifacts
            if p1 == prevPos:
                trueCounter += 1
            else:
                prevPos = p1
                trueCounter = 0
            if trueCounter > maxDuplicates:
                continue

            # Fetching k-mer; best-effort skip of out-of-range coordinates.
            try:
                currStr = str(fastaFile.fetch(region.chrom, p1, p2)).upper()
            except Exception:
                continue
            if r.is_reverse:
                currStr = AuxiliaryFunctions.revcomp(currStr)

            # Counting k-mer in dictionary
            if not r.is_reverse:
                ct_reads_f += 1
                obsDictF[currStr] = obsDictF.get(currStr, 0) + 1
            else:
                ct_reads_r += 1
                obsDictR[currStr] = obsDictR.get(currStr, 0) + 1

        # Evaluating expected frequencies: every k-mer of the region sequence,
        # on both strands.
        try:
            currStr = str(fastaFile.fetch(region.chrom, region.initial, region.final)).upper()
        except Exception:
            continue
        currRevComp = AuxiliaryFunctions.revcomp(currStr)
        for i in range(0, len(currStr) - k_nb):
            ct_kmers += 1
            s = currStr[i:i + k_nb]
            expDictF[s] = expDictF.get(s, 0) + 1
            # Same position on the reverse complement.
            s = currRevComp[i:i + k_nb]
            expDictR[s] = expDictR.get(s, 0) + 1

    # Closing files
    bamFile.close()
    fastaFile.close()

    # Creating bias dictionary: bias = pseudocounted observed fraction over
    # pseudocounted expected fraction; 1 when a strand saw no reads at all.
    alphabet = ["A", "C", "G", "T"]
    kmerComb = ["".join(e) for e in product(alphabet, repeat=k_nb)]
    bias_table_F = dict([(e, 0.0) for e in kmerComb])
    bias_table_R = dict([(e, 0.0) for e in kmerComb])
    for kmer in kmerComb:
        obsF = obsDictF.get(kmer, 0) + pseudocount
        expF = expDictF.get(kmer, 0) + pseudocount
        if ct_reads_f == 0:
            bias_table_F[kmer] = 1
        else:
            bias_table_F[kmer] = round(float(obsF / ct_reads_f) / float(expF / ct_kmers), 6)
        obsR = obsDictR.get(kmer, 0) + pseudocount
        expR = expDictR.get(kmer, 0) + pseudocount
        if ct_reads_r == 0:
            bias_table_R[kmer] = 1
        else:
            bias_table_R[kmer] = round(float(obsR / ct_reads_r) / float(expR / ct_kmers), 6)

    # Return
    return [bias_table_F, bias_table_R]
def estimate_table(self, regions, dnase_file_name, genome_file_name, k_nb, shift):
    """
    Estimates k-mer bias based on HS regions, DNase-seq signal and genomic sequences.

    Keyword arguments:
    regions -- DNase-seq HS regions.
    dnase_file_name -- DNase-seq BAM file name.
    genome_file_name -- Genome to fetch genomic sequences from.
    k_nb -- k-mer length.
    shift -- Offset applied to read ends to locate the cut site.

    Return:
    bias_table_F, bias_table_R -- Bias tables, or None for non-BAM input.
    """
    # Parameters
    maxDuplicates = 100  # cap on reads sharing the same 5' position (PCR artifacts)
    pseudocount = 1.0

    # Initializing bam and fasta
    if dnase_file_name.split(".")[-1].upper() != "BAM":
        return None  # TODO ERROR
    bamFile = Samfile(dnase_file_name, "rb")
    fastaFile = Fastafile(genome_file_name)

    # Observed/expected k-mer counts per strand.
    obsDictF = dict()
    obsDictR = dict()
    expDictF = dict()
    expDictR = dict()
    ct_reads_r = 0
    ct_reads_f = 0
    ct_kmers = 0

    # Iterating on HS regions
    for region in regions:
        # Initialization
        prevPos = -1
        trueCounter = 0

        # Evaluating observed frequencies
        # Fetching reads
        for r in bamFile.fetch(region.chrom, region.initial, region.final):
            # Calculating positions (// keeps coordinates ints on Python 3).
            if not r.is_reverse:
                p1 = r.pos - (k_nb // 2) - 1 + shift
            else:
                p1 = r.aend - (k_nb // 2) + 1 - shift
            p2 = p1 + k_nb

            # Verifying PCR artifacts
            if p1 == prevPos:
                trueCounter += 1
            else:
                prevPos = p1
                trueCounter = 0
            if trueCounter > maxDuplicates:
                continue

            # Fetching k-mer; best-effort skip of out-of-range coordinates.
            try:
                currStr = str(fastaFile.fetch(region.chrom, p1, p2)).upper()
            except Exception:
                continue
            if r.is_reverse:
                currStr = AuxiliaryFunctions.revcomp(currStr)

            # Counting k-mer in dictionary.
            # BUG FIX: the original incremented ct_reads_r for forward reads
            # and ct_reads_f for reverse reads (swapped); the sibling
            # estimators in this file count forward reads in ct_reads_f.
            if not r.is_reverse:
                ct_reads_f += 1
                obsDictF[currStr] = obsDictF.get(currStr, 0) + 1
            else:
                ct_reads_r += 1
                obsDictR[currStr] = obsDictR.get(currStr, 0) + 1

        # Evaluating expected frequencies: every k-mer of the region sequence,
        # on both strands.
        try:
            currStr = str(fastaFile.fetch(region.chrom, region.initial, region.final)).upper()
        except Exception:
            continue
        currRevComp = AuxiliaryFunctions.revcomp(currStr)
        for i in range(0, len(currStr) - k_nb):
            ct_kmers += 1
            s = currStr[i:i + k_nb]
            expDictF[s] = expDictF.get(s, 0) + 1
            # Same position on the reverse complement.
            s = currRevComp[i:i + k_nb]
            expDictR[s] = expDictR.get(s, 0) + 1

    # Closing files
    bamFile.close()
    fastaFile.close()

    # Creating bias dictionary: bias = pseudocounted observed fraction over
    # pseudocounted expected fraction. Zero-read guards added for consistency
    # with the sibling estimate_table (avoids ZeroDivisionError on empty data).
    alphabet = ["A", "C", "G", "T"]
    kmerComb = ["".join(e) for e in product(alphabet, repeat=k_nb)]
    bias_table_F = dict([(e, 0.0) for e in kmerComb])
    bias_table_R = dict([(e, 0.0) for e in kmerComb])
    for kmer in kmerComb:
        obsF = obsDictF.get(kmer, 0) + pseudocount
        expF = expDictF.get(kmer, 0) + pseudocount
        if ct_reads_f == 0:
            bias_table_F[kmer] = 1
        else:
            bias_table_F[kmer] = round(float(obsF / ct_reads_f) / float(expF / ct_kmers), 6)
        obsR = obsDictR.get(kmer, 0) + pseudocount
        expR = expDictR.get(kmer, 0) + pseudocount
        if ct_reads_r == 0:
            bias_table_R[kmer] = 1
        else:
            bias_table_R[kmer] = round(float(obsR / ct_reads_r) / float(expR / ct_kmers), 6)

    # Return
    return [bias_table_F, bias_table_R]
def estimate_bias_pwm(args):
    """
    Estimate PWM-based cleavage-bias tables.

    Counts per-position nucleotide frequencies around shifted read cut sites
    (observed) and over whole regions (expected), writes the four PFMs and
    their weblogo PDFs, then derives per-k-mer forward/reverse bias tables
    (observed PPM score / expected PPM score) and saves them via write_table.

    Keyword arguments:
    args -- namespace with reads_file, organism, regions_file, k_nb,
            forward_shift, reverse_shift, output_location, output_prefix.
    """
    # Parameters
    max_duplicates = 100

    # Initializing bam and fasta
    bamFile = Samfile(args.reads_file, "rb")
    genome_data = GenomeData(args.organism)
    fastaFile = Fastafile(genome_data.get_genome())
    regions = GenomicRegionSet("regions")
    regions.read(args.regions_file)

    # Per-position nucleotide counts: one list slot per k-mer position
    obs_f_pwm_dict = dict([("A", [0.0] * args.k_nb), ("C", [0.0] * args.k_nb),
                           ("G", [0.0] * args.k_nb), ("T", [0.0] * args.k_nb), ("N", [0.0] * args.k_nb)])
    exp_f_pwm_dict = dict([("A", [0.0] * args.k_nb), ("C", [0.0] * args.k_nb),
                           ("G", [0.0] * args.k_nb), ("T", [0.0] * args.k_nb), ("N", [0.0] * args.k_nb)])
    obs_r_pwm_dict = dict([("A", [0.0] * args.k_nb), ("C", [0.0] * args.k_nb),
                           ("G", [0.0] * args.k_nb), ("T", [0.0] * args.k_nb), ("N", [0.0] * args.k_nb)])
    exp_r_pwm_dict = dict([("A", [0.0] * args.k_nb), ("C", [0.0] * args.k_nb),
                           ("G", [0.0] * args.k_nb), ("T", [0.0] * args.k_nb), ("N", [0.0] * args.k_nb)])

    # Iterating on HS regions
    for region in regions:

        # PCR-artifact bookkeeping
        prev_pos = -1
        true_counter = 0

        # Evaluating observed frequencies
        # Fetching reads
        for r in bamFile.fetch(region.chrom, region.initial, region.final):

            # Calculating positions: k_nb window centered on the shifted cut site
            if not r.is_reverse:
                cut_site = r.pos + args.forward_shift - 1
                p1 = cut_site - int(floor(args.k_nb / 2))
            else:
                cut_site = r.aend + args.reverse_shift + 1
                p1 = cut_site - int(floor(args.k_nb / 2))
            p2 = p1 + args.k_nb

            # Verifying PCR artifacts
            if p1 == prev_pos:
                true_counter += 1
            else:
                prev_pos = p1
                true_counter = 0
            if true_counter > max_duplicates:
                continue

            # Fetching k-mer
            try:
                currStr = str(fastaFile.fetch(region.chrom, p1, p2)).upper()
            except Exception:
                continue
            if r.is_reverse:
                currStr = AuxiliaryFunctions.revcomp(currStr)

            # Counting per-position nucleotides in the observed PWM
            if not r.is_reverse:
                for i in range(0, len(currStr)):
                    obs_f_pwm_dict[currStr[i]][i] += 1
            else:
                for i in range(0, len(currStr)):
                    obs_r_pwm_dict[currStr[i]][i] += 1

        # Evaluating expected frequencies
        # Fetching whole sequence
        try:
            currStr = str(fastaFile.fetch(region.chrom, region.initial, region.final)).upper()
        except Exception:
            continue

        # Iterating on each sequence position; the inner index is "j" so it
        # no longer shadows the outer loop variable.
        for i in range(0, len(currStr) - args.k_nb):
            s = currStr[i:i + args.k_nb]
            # Counting k-mer in dictionary
            for j in range(0, len(s)):
                exp_f_pwm_dict[s[j]][j] += 1
            # Counting k-mer in dictionary for reverse complement
            s = AuxiliaryFunctions.revcomp(s)
            for j in range(0, len(s)):
                exp_r_pwm_dict[s[j]][j] += 1

    # Closing files
    bamFile.close()
    fastaFile.close()

    # Output pwms (os.makedirs replaces the former shell "mkdir -p")
    os.makedirs(os.path.join(args.output_location, "pfm"), exist_ok=True)
    pwm_dict_list = [obs_f_pwm_dict, obs_r_pwm_dict, exp_f_pwm_dict, exp_r_pwm_dict]
    pwm_obs_f = os.path.join(args.output_location, "pfm", "obs_{}_f.pfm".format(str(args.k_nb)))
    pwm_obs_r = os.path.join(args.output_location, "pfm", "obs_{}_r.pfm".format(str(args.k_nb)))
    pwm_exp_f = os.path.join(args.output_location, "pfm", "exp_{}_f.pfm".format(str(args.k_nb)))
    pwm_exp_r = os.path.join(args.output_location, "pfm", "exp_{}_r.pfm".format(str(args.k_nb)))
    pwm_file_list = [pwm_obs_f, pwm_obs_r, pwm_exp_f, pwm_exp_r]

    for pwm_dict, pwm_path in zip(pwm_dict_list, pwm_file_list):
        with open(pwm_path, "w") as pwm_file:
            for e in ["A", "C", "G", "T"]:
                pwm_file.write(" ".join([str(int(f)) for f in pwm_dict[e]]) + "\n")

    # Read the PFMs back as motifs.  Using "with" closes the handles; the
    # previous motifs.read(open(...), "pfm") leaked four file descriptors.
    with open(pwm_obs_f) as handle:
        motif_obs_f = motifs.read(handle, "pfm")
    with open(pwm_obs_r) as handle:
        motif_obs_r = motifs.read(handle, "pfm")
    with open(pwm_exp_f) as handle:
        motif_exp_f = motifs.read(handle, "pfm")
    with open(pwm_exp_r) as handle:
        motif_exp_r = motifs.read(handle, "pfm")

    # Output logos
    os.makedirs(os.path.join(args.output_location, "logo"), exist_ok=True)
    logo_obs_f = os.path.join(args.output_location, "logo", "obs_{}_f.pdf".format(str(args.k_nb)))
    logo_obs_r = os.path.join(args.output_location, "logo", "obs_{}_r.pdf".format(str(args.k_nb)))
    logo_exp_f = os.path.join(args.output_location, "logo", "exp_{}_f.pdf".format(str(args.k_nb)))
    logo_exp_r = os.path.join(args.output_location, "logo", "exp_{}_r.pdf".format(str(args.k_nb)))
    motif_obs_f.weblogo(logo_obs_f, format="pdf", stack_width="large", color_scheme="color_classic",
                        yaxis_scale=0.2, yaxis_tic_interval=0.1)
    motif_obs_r.weblogo(logo_obs_r, format="pdf", stack_width="large", color_scheme="color_classic",
                        yaxis_scale=0.2, yaxis_tic_interval=0.1)
    motif_exp_f.weblogo(logo_exp_f, format="pdf", stack_width="large", color_scheme="color_classic",
                        yaxis_scale=0.02, yaxis_tic_interval=0.01)
    motif_exp_r.weblogo(logo_exp_r, format="pdf", stack_width="large", color_scheme="color_classic",
                        yaxis_scale=0.02, yaxis_tic_interval=0.01)

    # Creating bias dictionary: bias = observed PPM score / expected PPM score
    alphabet = ["A", "C", "G", "T"]
    k_mer_comb = ["".join(e) for e in product(alphabet, repeat=args.k_nb)]
    bias_table_F = dict([(e, 0.0) for e in k_mer_comb])
    bias_table_R = dict([(e, 0.0) for e in k_mer_comb])
    for k_mer in k_mer_comb:
        obs_f = get_ppm_score(k_mer, motif_obs_f.pwm, args.k_nb)
        exp_f = get_ppm_score(k_mer, motif_exp_f.pwm, args.k_nb)
        bias_table_F[k_mer] = round(obs_f / exp_f, 6)
        obs_r = get_ppm_score(k_mer, motif_obs_r.pwm, args.k_nb)
        exp_r = get_ppm_score(k_mer, motif_exp_r.pwm, args.k_nb)
        bias_table_R[k_mer] = round(obs_r / exp_r, 6)

    write_table(args.output_location, args.output_prefix, [bias_table_F, bias_table_R])
# NOTE(review): fragment of a larger routine — the enclosing "def" is not
# visible in this chunk, so the indentation level and the origin of the names
# used below (outputFile1..4, header4, vectorTable4/5, maxV, stagFile,
# tempLocation, ...) cannot be confirmed here.
outputFile4.write("\t".join(header4) + "\n")
# Emit one tab-separated row per index i, interleaving the i-th elements of
# each column of vectorTable4 and vectorTable5; missing entries become "NA"
# (the tables are presumably ragged — TODO confirm against the writer).
for i in range(0, maxV):
    vec = []
    for j in range(0, len(vectorTable4)):
        try:
            vec.append(vectorTable4[j][i])
        except Exception:
            vec.append("NA")
        try:
            vec.append(vectorTable5[j][i])
        except Exception:
            vec.append("NA")
    outputFile4.write("\t".join(vec) + "\n")
# Close every open handle (the list comprehensions are used purely for the
# close() side effect).
stagFile.close()
outputFile1.close()
outputFile2.close()
outputFile3.close()
outputFile4.close()
genomeFile.close()
regionsFile.close()
#chrommHmmFile.close()
enhancersFile.close()
[e.close() for e in signalFileList]
[e.close() for e in controlFileList]
[e.close() for e in motifFileList]
# Removing all files
command = "rm -rf " + tempLocation
os.system(command)
def bias_correction(chrom, start, end, bam, bias_table, genome_file_name, forward_shift, reverse_shift):
    """Return the k-mer-bias-corrected cleavage signal over [start, end).

    Keyword arguments:
    chrom, start, end -- genomic interval to correct.
    bam -- open pysam alignment file to count cut sites from.
    bias_table -- [forward_dict, reverse_dict] mapping k-mers to bias values.
    genome_file_name -- FASTA file to fetch k-mer sequences from.
    forward_shift, reverse_shift -- signed offsets applied to read 5'/3' ends
        to obtain the cut site.

    Return:
    list of floats of length (end - start): expected (bias-predicted) counts
    nhatf + nhatr per position; falls back to raw cut counts when the
    enlarged window would cross the chromosome start.
    """
    # Parameters
    window = 50              # smoothing window (positions)
    defaultKmerValue = 1.0   # bias value for k-mers absent from the table

    # Initialization
    fastaFile = Fastafile(genome_file_name)
    fBiasDict = bias_table[0]
    rBiasDict = bias_table[1]
    # k-mer size is inferred from any key of the forward table
    k_nb = len(list(fBiasDict.keys())[0])
    p1 = start
    p2 = end
    # Interval enlarged by half a window on each side ...
    p1_w = p1 - (window // 2)
    p2_w = p2 + (window // 2)
    # ... and further by half a k-mer for sequence fetching
    p1_wk = p1_w - int(floor(k_nb / 2.))
    p2_wk = p2_w + int(ceil(k_nb / 2.))

    if p1 <= 0 or p1_w <= 0 or p1_wk <= 0 or p2_wk <= 0:
        # Enlarged window would run off the chromosome start:
        # return raw (uncorrected) cut-site counts instead
        bc_signal = [0.0] * (p2 - p1)
        for read in bam.fetch(chrom, p1, p2):
            # check if the read is unmapped, according to issue #112
            if read.is_unmapped:
                continue
            if not read.is_reverse:
                cut_site = read.pos + forward_shift
                if p1 <= cut_site < p2:
                    bc_signal[cut_site - p1] += 1.0
            else:
                cut_site = read.aend + reverse_shift - 1
                if p1 <= cut_site < p2:
                    bc_signal[cut_site - p1] += 1.0
        return bc_signal

    # Raw counts: per-position forward (nf) and reverse (nr) cut sites over
    # the enlarged window
    nf = [0.0] * (p2_w - p1_w)
    nr = [0.0] * (p2_w - p1_w)
    for read in bam.fetch(chrom, p1_w, p2_w):
        # check if the read is unmapped, according to issue #112
        if read.is_unmapped:
            continue
        if not read.is_reverse:
            cut_site = read.pos + forward_shift
            if p1_w <= cut_site < p2_w:
                nf[cut_site - p1_w] += 1.0
        else:
            cut_site = read.aend + reverse_shift - 1
            if p1_w <= cut_site < p2_w:
                nr[cut_site - p1_w] += 1.0

    # Smoothed counts: sliding-window totals of nf/nr (window of `window`
    # positions centered on i), maintained incrementally
    Nf = []
    Nr = []
    f_sum = sum(nf[:window])
    r_sum = sum(nr[:window])
    f_last = nf[0]   # element to drop when the window slides right
    r_last = nr[0]
    for i in range(int(window / 2), len(nf) - int(window / 2)):
        Nf.append(f_sum)
        Nr.append(r_sum)
        f_sum -= f_last
        f_sum += nf[i + int(window / 2)]
        f_last = nf[i - int(window / 2) + 1]
        r_sum -= r_last
        r_sum += nr[i + int(window / 2)]
        r_last = nr[i - int(window / 2) + 1]

    # Fetching sequence; the reverse-complement strand is shifted by one base
    # relative to the forward fetch
    currStr = str(fastaFile.fetch(chrom, p1_wk, p2_wk - 1)).upper()
    currRevComp = AuxiliaryFunctions.revcomp(
        str(fastaFile.fetch(chrom, p1_wk + 1, p2_wk)).upper())

    # Iterating on sequence to create signal: per-position bias values from
    # the k-mer centered on each position (default 1.0 for unknown k-mers,
    # e.g. containing N)
    af = []
    ar = []
    for i in range(int(ceil(k_nb / 2.)), len(currStr) - int(floor(k_nb / 2)) + 1):
        fseq = currStr[i - int(floor(k_nb / 2.)):i + int(ceil(k_nb / 2.))]
        rseq = currRevComp[len(currStr) - int(ceil(k_nb / 2.)) - i:len(currStr) + int(floor(k_nb / 2.)) - i]
        try:
            af.append(fBiasDict[fseq])
        except Exception:
            af.append(defaultKmerValue)
        try:
            ar.append(rBiasDict[rseq])
        except Exception:
            ar.append(defaultKmerValue)

    # Calculating bias and writing to wig file: expected cut count per
    # position = smoothed total * (local bias / window bias sum); af/ar align
    # 1:1 with nf/nr (len(af) == p2_w - p1_w)
    f_sum = sum(af[:window])
    r_sum = sum(ar[:window])
    f_last = af[0]
    r_last = ar[0]
    bc_signal = []
    for i in range(int(window / 2), len(af) - int(window / 2)):
        nhatf = Nf[i - int(window / 2)] * (af[i] / f_sum)
        nhatr = Nr[i - int(window / 2)] * (ar[i] / r_sum)
        bc_signal.append(nhatf + nhatr)
        f_sum -= f_last
        f_sum += af[i + int(window / 2)]
        f_last = af[i - int(window / 2) + 1]
        r_sum -= r_last
        r_sum += ar[i + int(window / 2)]
        r_last = ar[i - int(window / 2) + 1]

    # Termination
    fastaFile.close()
    return bc_signal
def bias_correction_dnase(signal_class, signal, chrName, start, end, forward_shift, reverse_shift):
    """Apply the bundled single-hit DNase bias tables to `signal`.

    Loads the packaged forward/reverse bias tables, recomputes cut-site
    counts from signal_class.bam over an enlarged window, and returns a
    log-ratio corrected signal: log(n+1) - log(nhat+1) summed over both
    strands, one value per position of [start, end).

    Falls back to returning `signal` unchanged when the tables cannot be
    loaded or the enlarged window would cross the chromosome start.
    """
    # Packaged bias tables shipped next to this module
    table_file_name_F = os.path.join(os.path.dirname(__file__), '../data/single_hit_bias_table_F.txt')
    table_file_name_R = os.path.join(os.path.dirname(__file__), '../data/single_hit_bias_table_R.txt')
    bias_table = load_table(table_file_name_F, table_file_name_R)
    if not bias_table:
        return signal

    # Parameters
    window = 50              # smoothing window (positions)
    defaultKmerValue = 1.0   # bias value for k-mers absent from the table
    # NOTE(review): signal_class.fastaFile appears to be a file NAME here
    # (it is passed to Fastafile below) — confirm against the signal class.
    genome_file_name = signal_class.fastaFile

    # Initialization
    fastaFile = Fastafile(genome_file_name)
    fBiasDict = bias_table[0]
    rBiasDict = bias_table[1]
    # k-mer size inferred from any key of the forward table
    k_nb = len(list(fBiasDict.keys())[0])
    p1 = start
    p2 = end
    # Interval enlarged by half a window, then by half a k-mer for fetching
    p1_w = int(p1 - (window / 2))
    p2_w = int(p2 + (window / 2))
    p1_wk = p1_w - int(floor(k_nb / 2.))
    p2_wk = p2_w + int(ceil(k_nb / 2.))
    if p1 <= 0 or p1_w <= 0 or p1_wk <= 0:
        # Cannot enlarge past the chromosome start: return input unchanged
        return signal

    # Raw counts: per-position forward (nf) and reverse (nr) cut sites
    nf = [0.0] * int(p2_w - p1_w)
    nr = [0.0] * int(p2_w - p1_w)
    for read in signal_class.bam.fetch(chrName, p1_w, p2_w):
        # check if the read is unmapped, according to issue #112
        if read.is_unmapped:
            continue
        if not read.is_reverse:
            cut_site = read.pos + forward_shift
            if p1_w <= cut_site < p2_w:
                nf[cut_site - p1_w] += 1.0
        else:
            cut_site = read.aend + reverse_shift - 1
            if p1_w <= cut_site < p2_w:
                nr[cut_site - p1_w] += 1.0

    # Smoothed counts: incrementally-maintained sliding-window totals
    Nf = []
    Nr = []
    fSum = sum(nf[:window])
    rSum = sum(nr[:window])
    fLast = nf[0]   # element dropped when the window slides right
    rLast = nr[0]
    for i in range(int(window / 2), int(len(nf) - (window / 2))):
        Nf.append(fSum)
        Nr.append(rSum)
        fSum -= fLast
        fSum += nf[i + int(window / 2)]
        fLast = nf[i - int(window / 2) + 1]
        rSum -= rLast
        rSum += nr[i + int(window / 2)]
        rLast = nr[i - int(window / 2) + 1]

    # Fetching sequence; the reverse-complement strand is shifted by one base
    currStr = str(fastaFile.fetch(chrName, p1_wk, p2_wk - 1)).upper()
    currRevComp = revcomp(str(fastaFile.fetch(chrName, p1_wk + 1, p2_wk)).upper())

    # Iterating on sequence to create signal: per-position bias values from
    # the k-mer centered on each position (default 1.0 for unknown k-mers)
    af = []
    ar = []
    for i in range(int(ceil(k_nb / 2.)), len(currStr) - int(floor(k_nb / 2)) + 1):
        fseq = currStr[i - int(floor(k_nb / 2.)):i + int(ceil(k_nb / 2.))]
        rseq = currRevComp[
            len(currStr) - int(ceil(k_nb / 2.)) - i:len(currStr) + int(
                floor(k_nb / 2.)) - i]
        try:
            af.append(fBiasDict[fseq])
        except Exception:
            af.append(defaultKmerValue)
        try:
            ar.append(rBiasDict[rseq])
        except Exception:
            ar.append(defaultKmerValue)

    # Calculating bias and writing to wig file: expected count nhat =
    # smoothed total * (local bias / window bias sum); corrected value is
    # log(observed+1) - log(expected+1) summed over both strands.  af/ar
    # align 1:1 with nf/nr (len(af) == p2_w - p1_w).
    fSum = sum(af[:window])
    rSum = sum(ar[:window])
    fLast = af[0]
    rLast = ar[0]
    bias_corrected_signal = []
    for i in range(int(window / 2), int(len(af) - (window / 2))):
        nhatf = Nf[i - int(window / 2)] * (af[i] / fSum)
        nhatr = Nr[i - int(window / 2)] * (ar[i] / rSum)
        zf = log(nf[i] + 1) - log(nhatf + 1)
        zr = log(nr[i] + 1) - log(nhatr + 1)
        bias_corrected_signal.append(zf + zr)
        fSum -= fLast
        fSum += af[i + int(window / 2)]
        fLast = af[i - int(window / 2) + 1]
        rSum -= rLast
        rSum += ar[i + int(window / 2)]
        rLast = ar[i - int(window / 2) + 1]

    # Termination
    fastaFile.close()
    return bias_corrected_signal