def rvClonal(self, cut=1e-10):
    '''
    Description:
        Remove clonal reads. Note that this process may change the
        difference between samples. If you don't want to change the
        difference, transfer all samples to wigs and use the rvClonal
        function of the Wigs class.

    Parameter:
        cut: the cutoff for removing clonal reads, could be a P value larger
            than 0 and smaller than 1, or a count given as a positive
            number. A value <= 0 disables the removal.

    Value:
        None
    '''
    ss = time()
    cut = float(cut)
    if cut <= 0:
        return
    print('removing clonal reads ...')
    # '*self.step' was added by Kaifu on Aug 1st, 2012 and further changed
    # to '/self.step' on May 29, 2014.
    avg = functions.div(self.mean(), self.step)
    if cut < 1:
        # cut is a P value: raise the count cutoff one by one until the value
        # reported by functions.ppois, expressed as -log10, reaches -log10(P).
        # The rpy-based ppois closure was dropped because it was never
        # called (rpy is not efficient).
        lgpcut = 0 - functions.div(log(cut), log(10))
        cut = float(avg + 0.5)
        pois_dist = functions.ppois(cut,
                                    float(avg),
                                    lower_tail=False,
                                    log_bool=True)
        while (0 - functions.div(pois_dist, log(10))) < lgpcut:
            cut += 1
            pois_dist = functions.ppois(cut,
                                        float(avg),
                                        lower_tail=False,
                                        log_bool=True)
        if cut < 1:
            cut = 1
    print('whole genome average reads density is', avg, 'use cutoff:', cut)
    cut *= self.step  # added by Kaifu on May29, 2014
    before = self.sum()
    print('before removing:', before, 'reads')
    for chm in self.data:
        for stra in list(self.data[chm].keys()):
            tchrv = deepcopy(self.data[chm][stra])
            tchrv -= cut  # all positive values are count of clonal reads
            # (|x| + x) / 2 keeps the positive part and zeroes the negatives
            tchrv = functions.div(((tchrv**2)**0.5 + tchrv), 2)
            self.data[chm][stra] -= tchrv
    after = self.sum()
    print('after removing:', after, 'reads')
    print(functions.div((before - after) * 100, before), 'percent removed.')
    print('time cost:', time() - ss)
def loadBam(self, file="", step=10, cut=1e-10):
    '''
    Description:
        load single-end reads data from a file in '.bam' format

    parameter:
        file: a path to the file containing the sequencing reads
        step: each chromosome is represented as a vector, each value in the
            vector represents a short region of the chromosome, the step
            value is the size of the short region
        cut: the cutoff for removing clonal reads, could be a P value larger
            than 0 and smaller than 1, or a count as a positive integer.

    Value:
        None
    '''
    print('\nparsing from Bam file', file, '...')
    sizes = {}
    num = 0
    # The context manager closes the BAM handle even when parsing aborts.
    with pysam.AlignmentFile(file, "rb") as sam_reader:
        for sam in sam_reader.fetch():
            name = None  # reset so a failing record never prints a stale name
            try:
                # skip unmapped reads
                if sam.is_unmapped:
                    continue
                name = sam.query_name
                chm = sam.reference_name
                start = int(sam.reference_start + 1)
                end = start + sam.reference_length
                # `stra` is "-" for reverse reads
                stra = '-' if sam.is_reverse else '+'
            except Exception:
                # best effort: report the read name and move on
                print(name)
                continue
            num += 1
            if num % 1000000 == 0:
                print(num, 'reads parsed')
            # '+' reads are binned by their start, '-' reads by their end
            if stra == '+':
                mid = int(functions.div(start, step))
            else:
                mid = int(functions.div(end, step))
            if chm not in self.data:
                self.data[chm] = {
                    '+': numpy.array([0.0]),
                    '-': numpy.array([0.0])
                }
                sizes[chm] = 0
            if mid >= sizes[chm]:
                # grow both strands in chunks of 1000 bins
                sizes[chm] = mid + 1000
                self.data[chm]['+'].resize(sizes[chm], refcheck=0)
                self.data[chm]['-'].resize(sizes[chm], refcheck=0)
            if mid >= 0:
                self.data[chm][stra][mid] += 1.0
    if num == 0:
        sys.exit("ERROR: Empty Bam File")
    self.clearEmptyEnd()
    # make both strands of every chromosome the same length
    for chm in self.data:
        lth = max(self.data[chm]['+'].size, self.data[chm]['-'].size)
        self.data[chm]['+'].resize(lth, refcheck=0)
        self.data[chm]['-'].resize(lth, refcheck=0)
    # float() mirrors loadBamPaired so a cutoff passed as a string also works
    if float(cut) > 0:
        self.rvClonal(cut=cut)
    print('parsing finished,', num, 'reads parsed')
def samplingNormalize(self):
    '''
    Description:
        Normalize between Wig class instances by sampling to same coverage

    Parameter:
        None

    Value:
        1
    '''
    from random import randint
    ss = time()
    wigs = self.data
    wsum = {}
    tarray = numpy.array([0.0])
    for wig in wigs:
        wsum[wig] = wigs[wig].sum()
    asum = functions.div(sum(wsum.values()), len(list(wsum.keys())))
    for k in wigs:
        wig = wigs[k]
        oldsum = wig.sum()
        num = oldsum - asum
        if num < 0:
            num = 0 - num  # below average: number of reads to add
        else:
            num = asum  # at or above average: number of reads to redraw
        for chrom in wig.data:
            # tarray lists one bin index per read unit, so a uniform draw
            # from it selects bins in proportion to their current signal.
            tarray.resize(int(wig.data[chrom].sum()), refcheck=0)
            tarray, csz, i, tsz = tarray * 0, wig.data[chrom].size, 0, 0
            while i < csz:
                newtsz = int(tsz + wig.data[chrom][i] + 0.5)
                if newtsz >= tarray.size:
                    tarray.resize(newtsz + 1000, refcheck=0)
                while tsz < newtsz:
                    tarray[tsz] = i
                    tsz += 1
                i += 1
            i = 0
            tnum = int(
                functions.div(
                    functions.div(num * wig.chrSum(chrom), oldsum),
                    wig.step))
            if oldsum > asum:
                wig.data[chrom] *= 0
            while i < tnum:
                i += 1
                # tarray holds floats; NumPy only accepts integer indices,
                # so the drawn bin index has to be cast before use.
                wig.data[chrom][int(tarray[randint(0, tsz - 1)])] += 1
    sys.stdout.write('time cost' + str(time() - ss) + "\n")
    return 1
def toWig(self, fs=None, extend=0, mifrsz=10, mafrsz=300):
    '''
    Description:
        Calculate nucleosome occupancy from the reads data.
        Note: the '+' and '-' strand vectors held in self.data are resized
        and overwritten in place while the wig is built.

    Parameter:
        fs: average size of fragments that are subject to sequencing and
            generate the reads, only for single-end reads. When this value
            is not given, a fs value will be inferred by the program
            (self.fragSizeDis). For paired-end reads loaded by the function
            loadBedPaired(), set fs to 0.
        extend: an integer value, each read will be extended to this
            length. Values <= 0 fall back to fs.
        mifrsz: the minimal estimated average fragment size, only for
            single-end reads
        mafrsz: the maximal estimated average fragment size, only for
            single-end reads

    Value:
        a Wig class instance
    '''
    step = self.step
    if fs is None:
        fs = self.fragSizeDis(minsize=mifrsz, maxsize=mafrsz)
    if extend <= 0:
        extend = fs
    print('extend to', extend)
    old_extend = extend
    # half fragment / half extension, expressed in bins
    fragsize = functions.div(fs, (2 * step))
    extend = functions.div(extend, (2 * step))
    wg = Wig(step=step)
    print('generating wig ...')
    for chm in self.data:
        plus, minus = self.data[chm]['+'], self.data[chm]['-']
        # make sure both strands are long enough for the shifts below
        tmax = max(1000, fragsize * 4, extend * 4)
        if plus.size < tmax:
            plus.resize(tmax, refcheck=0)
        if minus.size < tmax:
            minus.resize(tmax, refcheck=0)
        wg.addChr(chm)
        lth = int(plus.size)
        wg.resizeChr(chm, lth * step)
        # shift '+' reads downstream by fragsize bins and clear the head
        plus[fragsize:lth] = plus[0:(lth - fragsize)]
        plus[:fragsize] = 0
        # shift '-' reads upstream by fragsize bins and merge into '+'
        plus[0:(lth - fragsize)] += minus[fragsize:lth]
        # spread every merged position over the window [-extend, +extend]
        for p in range(-extend, extend + 1):
            wg.data[chm][extend:(lth - extend)] += plus[(extend + p):(
                lth - extend + p)]
    # added by Kaifu on May29, 2014
    wg.foldChange(functions.div(old_extend * 1.0, wg.step))
    return wg
def testDiv(operands): sys.path.append('project/src/calculate-activity') import functions as fun # invoke the div method for the functions.py file test = fun.div(operands[0], operands[1]) # return the result return test
def mean(self):
    '''
    Description:
        calculate the average reads count per nucleotide

    Parameter:
        None

    Value:
        self.sum() divided by self.size(), computed through functions.div
    '''
    total = self.sum() * 1.0
    length = self.size()
    return functions.div(total, length)
def ajClonal(self, cut=1e-10, extend=1):  # added by Kaifu on Nov 14, 2012
    '''
    Description:
        Adjust clonal reads count, fold change between samples will not be
        altered in this process.

    Parameter:
        cut: the cutoff used to define clonal reads. A value >= 1 is used
            directly as the signal cutoff; a value between 0 and 1 (number
            or string such as '1e-10') is treated as a Poisson test P value
            against the mean signal. 0 or '0' disables the adjustment.
        extend: the extension length of each read that is used to calculate
            the wiggle file. Extension length means the length from 5' end
            to 3' end, e.g. a read may be 36bp when it is generated by the
            sequencing machine, but it might have been extended to be 80bp
            or cut to be 1bp, so the extension length will then be 80bp or
            1bp.

    Value:
        None

    Note:
        all wiggle files in a Wigs object must have the same step size.
    '''
    from math import log10, log
    ks = list(self.keys())
    # m is the average wig over all samples
    m = deepcopy(self.get(ks[0]))
    for k in ks[1:]:
        m.add(self.get(k))
    m.foldChange(functions.div(1.0, len(ks)))
    avg = m.mean()
    if float(cut) == 0:
        return
    if float(cut) >= 1:
        cut = float(cut)
    else:
        # Parse the textual form ('1e-10' -> ['1e', '10']) to obtain
        # -log10(P); str() makes this work for the float default as well.
        co = str(cut).split('-')
        if len(co) == 2:
            lgpcut = float(co[1]) - log10(float(co[0][:-1]))
        else:
            lgpcut = 0 - log10(float(co[0]))
        cut = int(avg + 0.5)
        ppois = r(
            '''function(q,avg){return(ppois(q,avg,lower.tail=FALSE,log=TRUE))}'''
        )
        # raise the cutoff until its -log10 P value reaches lgpcut
        while (0 - functions.div(
                float(
                    str(
                        ppois(functions.div(cut * 1.0, extend),
                              functions.div(avg * 1.0, extend))).split()[-1]),
                log(10))) < lgpcut:
            cut += 1
    sys.stdout.write('aveage density is ' + str(avg) +
                     ', use clonal signal cutoff ' + str(cut) + "\n")
    for chrom in m.getChrs():
        tchrv = deepcopy(m.data[chrom])
        tchrv -= cut  # all positive values are count of clonal reads
        tchrv = functions.div(((tchrv**2)**0.5 + tchrv), 2)  # remove all negative values
        # the addition of '1' is to avoid log(0); "+numpy.log(tchrv+1)" is
        # used to keep the rank order of the values
        tchrv = m.data[chrom] - tchrv + numpy.log(tchrv + 1)
        sys.stdout.write(chrom + ":")
        for k in ks:
            twg = self.get(k)
            if chrom not in twg.data:
                continue
            temp = twg.data[chrom].sum()
            sys.stdout.write('\t' + str(k) + ' reduced from ' + str(temp) +
                             ' to ')
            if twg.data[chrom].size != tchrv.size:
                twg.data[chrom].resize(tchrv.size, refcheck=0)
            # the addition of '1e-100' is to avoid dividing by 0
            twg.data[chrom] = functions.div(tchrv * twg.data[chrom],
                                            (m.data[chrom] + 1e-100))
            new_sum = twg.data[chrom].sum()
            sys.stdout.write(
                str(new_sum) + ', percent removed: ' +
                str(100 - functions.div(new_sum * 100.0, temp)) + "\n")
def autocorrelation(self, ofile, minsize=10, maxsize=300):
    '''
    Description:
        Calculate correlation between the two strands allowing shift
        distances.

    Parameters:
        ofile: the output file used to write the result to, one
            "shift<TAB>coefficient" line per shift distance
        minsize: minimal shift distance
        maxsize: maximal shift distance

    Value:
        a dictionary with shift distance (in bins) as key and the
        correlation coefficient corresponding to the distance as value
    '''
    step = self.step
    minsize = functions.div(minsize, step)
    maxsize = functions.div(maxsize, step) + 1
    print('calculating ...')
    dic = {}
    for i in range(int(minsize), int(maxsize)):
        dic[i] = 0
        szsum = 0
        for chm in self.data:
            # int() keeps the slice bounds valid when maxsize is a float,
            # as fragSizeDis does for the same expression
            sz = int(self.data[chm]['+'].size - maxsize)
            plus = self.data[chm]['+'][:sz]
            minus = self.data[chm]['-'][i:(sz + i)]
            c = plus * minus
            # per-chromosome coefficient, weighted by the window length
            dic[i] += sz * functions.div(
                (c.mean() - plus.mean() * minus.mean()),
                (plus.std() * minus.std()))
            szsum += sz
        dic[i] = functions.div(dic[i], (szsum * 1.0))
    # the with-block guarantees the result file is flushed and closed
    with open(ofile, 'w') as fo:
        for k in sorted(dic):
            fo.write(str(k) + '\t' + str(dic[k]) + '\n')
    return dic
def testDiv(x, y): #get Directory for functions currentworkingdirectory = os.getcwd() currentworkingdirectory = currentworkingdirectory.replace('/testCasesExecutables', '') currentworkingdirectory = (currentworkingdirectory + '/project/src') sys.path.insert(0, currentworkingdirectory) from functions import div try: output = div(int(x),int(y)) except: output = "ERROR" return output
def foldNormalize(self, scalepairs=None, sampling_total=None, nonzero=False):
    '''
    Description:
        Normalize between Wig class instances by fold change

    Parameter:
        scalepairs: optional mapping from wig name to the target total for
            that wig; when None every wig is scaled to the average total.
        sampling_total: optional mapping from wig name to the total used as
            the denominator of the fold change; when None each wig's own
            sum() is used.
        nonzero: when True, apply a further fold change of
            non0size() / average gsize() to every wig.

    Value:
        1
    '''
    wigs = self.data
    names = sorted(wigs.keys())
    if sampling_total is None:
        wsum = {}
        for wig in wigs:
            wsum[wig] = wigs[wig].sum()
        asum = functions.div(sum(wsum.values()), len(wsum))
        for wig in names:
            sys.stdout.write(wig + ' from ' + str(wigs[wig].sum()) + ' to ')
            if scalepairs is None:
                wigs[wig].foldChange(functions.div(asum * 1.0, wsum[wig]))
            else:
                wigs[wig].foldChange(
                    functions.div(scalepairs[wig] * 1.0, wsum[wig]))
            sys.stdout.write(str(wigs[wig].sum()) + "\n")
    else:
        average_total = functions.div(sum(sampling_total.values()),
                                      len(sampling_total))
        for name in names:
            sys.stdout.write(name + ' from ' + str(wigs[name].sum()) +
                             ' to ')
            if scalepairs is None:
                wigs[name].foldChange(
                    functions.div(average_total * 1.0, sampling_total[name]))
            else:
                # NOTE(review): unlike the branches above there is no '*1.0'
                # here; confirm functions.div behaves as intended for ints.
                wigs[name].foldChange(
                    functions.div(scalepairs[name], sampling_total[name]))
            sys.stdout.write(str(wigs[name].sum()) + "\n")
    if nonzero:
        sys.stdout.write(
            'further correction based on count of non-zero base pairs\n')
        gsizes, non0sizes = {}, {}
        for wig in wigs:
            gsizes[wig] = wigs[wig].gsize()
            non0sizes[wig] = wigs[wig].non0size()
        agsize = functions.div(sum(gsizes.values()) * 1.0, len(gsizes))
        for wig in wigs:
            sys.stdout.write(wig + ' from ' + str(wigs[wig].sum()) + ' to ')
            wigs[wig].foldChange(functions.div(non0sizes[wig], agsize))
            # the numbers must go through str(); concatenating them directly
            # raised TypeError
            sys.stdout.write(
                str(wigs[wig].sum()) + ' based on non0size ' +
                str(non0sizes[wig]) + ' and genome size ' + str(agsize) +
                "\n")
    return 1
def refquantile(paths, ofile, gfile):
    '''
    Description:
        Build a reference file by averaging, line by line, the first column
        of several input files.

    Parameter:
        paths: input file paths joined by ':'
        ofile: output path; every line is "<mean>\\t-\\t-\\t-"
        gfile: not used by this function

    Value:
        None
    '''
    print('\nPreparing reference ...')
    tm = time()
    files = paths.split(':')
    nfile = len(files)
    fi = {}
    try:
        with open(ofile, 'w') as fo:
            for path in files:
                # a path listed twice shares one handle, as before
                if path not in fi:
                    fi[path] = open(path)
            for line in fi[files[0]]:
                v = float(line.split()[0])
                # the other files are read in lockstep with the first one
                for path in files[1:]:
                    add_line = fi[path].readline()
                    v += float(add_line.split()[0])
                fo.write(str(div(v, nfile)) + '\t-\t-\t-\n')
    finally:
        # release every input handle, also when a line fails to parse
        for handle in fi.values():
            handle.close()
    print('time cost:', time() - tm)
def changevalue(ifile, ref, ofile, gfile, step=10, suppress=False, buffer=None):
    '''
    Description:
        Replace the values of ifile by the values found on the matching
        lines of the reference file and save the result as a wig.

    Parameter:
        ifile: input file; column 3 is read as the chromosome name and
            column 4 as the position
        ref: reference file; the first column of each line is the value
            assigned to the matching input line (0.0 once it is exhausted)
        ofile: path handed to Wig.save
        gfile: passed to the Wig constructor
        step: wiggle step size, positions are divided by it
        suppress: passed to Wig.save
        buffer: not used by this function

    Value:
        None
    '''
    if ifile != ref:
        print('\nnormalizing', ifile, '...')
    else:
        print('\nsaving reference ...')
    tm = time()
    wg = Wig(step=step, gfile=gfile)
    # both inputs are closed as soon as they have been consumed
    with open(ifile) as fi, open(ref) as fr:
        for line in fi:
            col = line.split()
            rcol = fr.readline().split()
            if len(rcol) == 0:
                rcol = [0.0]
            cr, pos, vl = col[2], div(int(col[3]), step), float(rcol[0])
            wg.data[cr][pos] = vl
        # reference lines left over after the input ended
        n = sum(1 for _ in fr)
    if n > 0:
        print(
            'Warning: the input genome size is smaller than the reference genome size by',
            n, 'wiggle steps!')
    wg.save(ofile, suppress=suppress)
    print('time cost:', time() - tm)
def samplingTotal(self,
                  region_file=None,
                  region_out_file=None,
                  exclude_low_percent=1,
                  exclude_high_percent=1,
                  bnum=100000,
                  nonzero=False):
    '''
    Description:
        calculate the sum of each wig's values after excluding the low and
        high percentile, or inside the regions given by region_file

    Parameter:
        region_file: optional file loaded with Wig(region_file) and used as
            the region mask; when None the mask is derived from the data
        region_out_file: optional path the derived mask is saved to
        exclude_low_percent: bottom percentile to exclude
        exclude_high_percent: top percentile to exclude
        bnum: passed to Wig.percentile
        nonzero: passed to Wig.percentile as nonzero_end

    Value:
        a dictionary mapping wig name to its total inside the mask, or None
        when both percentiles are 0 and no region_file is given
    '''
    names = list(self.data.keys())
    if (exclude_low_percent == 0 and exclude_high_percent == 0
            and region_file is None):
        return None
    sampling_total = {}
    if region_file is None:
        sys.stdout.write(
            'calculate total signal in each sample after excluding the top '
            + str(exclude_high_percent) + ' and bottom ' +
            str(exclude_low_percent) +
            ' percents of genomic regions with extremely high and low signal values\n'
        )
        wsums = {}
        for name in names:
            wsums[name] = self.data[name].sum()
        wavg = functions.div(sum(wsums.values()), len(wsums))
        # rfwig becomes the average of all wigs scaled to the same total
        rfwig = deepcopy(self.data[names[0]])
        rfwig.foldChange(functions.div(wavg * 1.0, wsums[names[0]]))
        for name in names[1:]:
            # scale in place, accumulate, then scale back
            self.data[name].foldChange(functions.div(wavg * 1.0, wsums[name]))
            rfwig.add(self.data[name])
            self.data[name].foldChange(functions.div(wsums[name] * 1.0, wavg))
        rfwig.foldChange(functions.div(1.0, len(names)))
        lowcut, highcut = rfwig.percentile(
            p=[exclude_low_percent, 100 - exclude_high_percent],
            bnum=bnum,
            nonzero_end=nonzero)
        rg = rfwig.regionWithinValueRange(lowcut, highcut)
        if region_out_file is not None:
            rg.save(region_out_file)
    else:
        sys.stdout.write(
            'calculate total signal in each sample in genomic regions defined by '
            + region_file + "\n")
        rg = Wig(region_file)
    for name in names:
        sampling_total[name] = self.data[name].multiply(rg).sum()
    sys.stdout.write(
        str(rg.sum()) + ' (' +
        str(functions.div(rg.sum() * 100.0, rg.gsize())) + ' %) of ' +
        str(rg.gsize()) + ' base pairs calculated:\n')
    for name in names:
        sys.stdout.write(
            name + ' ' + str(sampling_total[name]) + ' (' + str(
                functions.div(sampling_total[name] * 100.0,
                              self.data[name].sum())) + '% of total)\n')
    return sampling_total
def sizeAdjust(self, gfile):
    '''
    Description:
        Adjust the size of each chromosome. Chromosomes that are not listed
        in gfile are removed from self.data.

    Parameter:
        gfile: path to the file containing the size of each chromosome, each
            line in the file would be in the format "chromosome_name size",
            in which size is an integer value, and chromosome_name should
            contain no empty space. Blank lines are ignored.

    Value:
        None.
    '''
    sizes = {}
    with open(gfile) as handle:
        for line in handle:
            col = line.split()
            if not col:
                continue
            # chromosome length expressed in bins of self.step
            sizes[col[0]] = functions.div(int(col[1]), self.step)
    # Iterate over a snapshot of the keys: entries are popped in the loop,
    # which is not allowed while iterating the dict itself.
    for chm in list(self.data):
        if chm not in sizes:
            self.data.pop(chm)
            continue
        for strand in self.data[chm]:
            self.data[chm][strand].resize(sizes[chm], refcheck=0)
def test_div_four_by_two(self):
    """div called with the strings '4' and '2' must compare equal to 2."""
    quotient = div('4', '2')
    self.assertEqual(quotient, 2)
def test_divide_by_zero(self):
    """div must return None when the divisor is zero."""
    outcome = div(1, 0)
    self.assertIsNone(outcome)
def loadBamPaired(self, file="", step=10, cut=1e-10):
    '''
    Description:
        load paired-end reads data from a file in '.bam' format

    parameter:
        file: a path to the file containing the sequencing reads
        step: each chromosome is represented as a vector, each value in the
            vector represents a short region of the chromosome, the step
            value is the size of the short region
        cut: the cutoff for removing clonal reads, could be a P value larger
            than 0 and smaller than 1, or a count as a positive integer.

    Value:
        None
    '''
    print('\nparsing from bam file', file, '...')
    sizes = {}
    fragsizes = {}
    num = 0
    # The context manager closes the BAM handle on every exit path,
    # including the sys.exit below.
    with pysam.AlignmentFile(file, "rb") as sam_reader:
        for sam in sam_reader.fetch():
            try:
                if not sam.is_proper_pair:
                    continue
                ref_start = int(sam.reference_start + 1)
                next_ref_start = int(sam.next_reference_start + 1)
                rlen = sam.reference_length
                chm = sam.reference_name
                stra = '-' if sam.is_reverse else '+'
                # bin holding the middle of the fragment
                mid = int(
                    functions.div((ref_start + next_ref_start + rlen),
                                  (2 * step)))
                if next_ref_start > ref_start:
                    fragsize = next_ref_start - ref_start + rlen
                else:
                    fragsize = ref_start - next_ref_start + rlen
                fragsizes[fragsize] = fragsizes.get(fragsize, 0) + 1
            except Exception:
                print(sam)
                sys.exit("SOME REASON DID NOT WORK...")
            num += 1
            if num % 1000000 == 0:
                print(num, 'reads parsed')
            if chm not in self.data:
                self.data[chm] = {
                    '+': numpy.array([0.0]),
                    '-': numpy.array([0.0])
                }
                sizes[chm] = 0
            if mid >= sizes[chm]:
                # grow both strands in chunks of 1000 bins
                sizes[chm] = mid + 1000
                self.data[chm]['+'].resize(sizes[chm], refcheck=0)
                self.data[chm]['-'].resize(sizes[chm], refcheck=0)
            if mid >= 0:
                self.data[chm][stra][mid] += 1.0
    if num == 0:
        sys.exit("ERROR: Empty Bam File")
    self.clearEmptyEnd()
    # make both strands of every chromosome the same length
    for chm in self.data:
        lth = max(self.data[chm]['+'].size, self.data[chm]['-'].size)
        self.data[chm]['+'].resize(lth, refcheck=0)
        self.data[chm]['-'].resize(lth, refcheck=0)
    if float(cut) > 0:
        self.rvClonal(cut=cut)
    print('parsing finished,', num, 'reads parsed')
    # text histogram of the fragment sizes, scaled so the mode is 100 dashes
    maxv, lths = max(fragsizes.values()), sorted(fragsizes)
    tlth, count = 0, 0
    maxlth = []
    for lth in lths:
        if fragsizes[lth] == maxv:
            maxlth.append(lth)
        tlth += lth * fragsizes[lth]
        count += fragsizes[lth]
        # numpy.ceil: same module name as used for numpy.array above
        dcount = int(numpy.ceil(functions.div(100 * fragsizes[lth], maxv)))
        if dcount > 4:
            print('-' * int(dcount), lth, fragsizes[lth])
    print('average fragment size:', functions.div(tlth * 1.0, count))
    print('most enriched fragment size:', maxlth)
from functions import add, sub, mult, div

# Start from add(8, 9), then feed the running value through sub, mult and
# div in turn, printing the value after every step.
x = add(8, 9)
print(x)
for operation, operand in ((sub, 3), (mult, 4), (div, 2)):
    x = operation(x, operand)
    print(x)
def loadBedPaired(self, file="", step=10, cut=1e-10):
    '''
    Description:
        load paired-end reads data from a file in '.bed' format, the "name"
        fields of each reads pair must be the same except for the last one
        character in each field, which should be either '1' or '2', e.g.
        'reads/1' or 'reads/2', each pair of reads must be arranged in two
        neighboring lines.

    parameter:
        file: a path to the file containing the sequencing reads; a name
            ending in 'gz' is read through gzip
        step: each chromosome is represented as a vector, each value in the
            vector represents a short region of the chromosome, the step
            value is the size of the short region
        cut: the cutoff for removing clonal reads, could be a P value larger
            than 0 and smaller than 1, or a count as a positive integer.

    Value:
        None
    '''
    print('\nparsing from bed file', file, '...')
    sizes = {}
    fragsizes = {}
    num = 0
    serr, nerr = 0, 0
    # gzip must be opened in text mode ('rt'); its default binary mode
    # yields bytes, which made every line fail the str-based parsing below.
    opener = gzip.open if file[-2:] == 'gz' else open
    end1, end2 = [], []
    with opener(file, 'rt') as infile:
        for line in infile:
            try:
                col = line[:-1].split('\t')
                tnames = col[3].split()
                # added by kaifu on sep 5, 2012: sometimes the reads name
                # has two words separated by a space (not '\t')
                if len(tnames) == 2:
                    col[3] = tnames[0] + col[3][-1]
                col[1], col[2] = int(col[1]), int(col[2])
            except (IndexError, ValueError):
                print('wrong line:', line[:-1])
                continue
            num += 1
            if num % 1000000 == 0:
                print(num, 'reads parsed')
            if len(end1) < 1:
                # first mate of a pair: wait for the next line
                end1 = col
                continue
            end2 = col
            if end1[3][:-1] != end2[3][:-1]:
                # pair error --- single end read: retry with end2 as mate 1
                nerr += 1
                end1, end2 = end2, []
                continue
            if end1[5] == '+' and end2[5] == '-':
                chm = end1[0]
                mid = int(functions.div((end1[1] + end2[2]), (2 * step)))
                fragsize = end2[2] - end1[1]
            elif end1[5] == '-' and end2[5] == '+':
                chm = end1[0]
                mid = int(functions.div((end1[2] + end2[1]), (2 * step)))
                fragsize = end2[1] - end1[2]
            else:
                # pair error --- reads from same strand
                serr += 1
                end1, end2 = [], []
                continue
            fragsizes[fragsize] = fragsizes.get(fragsize, 0) + 1
            if chm not in self.data:
                self.data[chm] = {
                    '+': numpy.array([0.0]),
                    '-': numpy.array([0.0])
                }
                sizes[chm] = 0
            if mid >= sizes[chm]:
                sizes[chm] = mid + 1000
                self.data[chm]['+'].resize(sizes[chm], refcheck=0)
                self.data[chm]['-'].resize(sizes[chm], refcheck=0)
            # NOTE(review): the other loaders accept mid >= 0; confirm that
            # dropping bin 0 here is intended.
            if mid > 0:
                self.data[chm]['+'][mid] += 1.0
                self.data[chm]['-'][mid] += 1.0
            end1, end2 = [], []
    self.clearEmptyEnd()
    # make both strands of every chromosome the same length
    for chm in self.data:
        lth = max(self.data[chm]['+'].size, self.data[chm]['-'].size)
        self.data[chm]['+'].resize(lth, refcheck=0)
        self.data[chm]['-'].resize(lth, refcheck=0)
    print('parsing finished,', num, 'reads parsed')
    if fragsizes:
        # text histogram of fragment sizes, scaled so the mode is 100 dashes;
        # skipped when no valid pair was found (max() of nothing would raise)
        maxv, lths = max(fragsizes.values()), sorted(fragsizes)
        tlth, count = 0, 0
        maxlth = []
        for lth in lths:
            if fragsizes[lth] == maxv:
                maxlth.append(lth)
            tlth += lth * fragsizes[lth]
            count += fragsizes[lth]
            dcount = int(
                numpy.ceil(functions.div(100 * fragsizes[lth], maxv)))
            if dcount > 4:
                print('-' * dcount, lth, fragsizes[lth])
        print('average fragment size:', functions.div(tlth * 1.0, count))
        print('most enriched fragment size:', maxlth)
    print(serr, 'pairs failed due to locations on same strands')
    print(nerr, 'reads have no mate reads')
    if float(cut) > 0:
        self.rvClonal(cut=cut)
def loadBed(self, file="", step=10, cut=1e-10):
    '''
    Description:
        load single-end reads data from a file in '.bed' format

    parameter:
        file: a path to the file containing the sequencing reads; a name
            ending in 'gz' is read through gzip
        step: each chromosome is represented as a vector, each value in the
            vector represents a short region of the chromosome, the step
            value is the size of the short region
        cut: the cutoff for removing clonal reads, could be a P value larger
            than 0 and smaller than 1, or a count as a positive integer.

    Value:
        None
    '''
    print('\nparsing from bed file', file, '...')
    sizes = {}
    num = 0
    # gzip must be opened in text mode ('rt'); its default binary mode
    # yields bytes, so the strand never compared equal to '+' or '-'.
    opener = gzip.open if file[-2:] == 'gz' else open
    with opener(file, 'rt') as infile:
        for line in infile:
            try:
                col = line.split()
                chm, start, end, name, score, stra = col[0:6]
                start, end = int(start), int(end)
                if stra not in ('+', '-'):
                    # without a known strand there is no bin to count in
                    raise ValueError('unknown strand')
            except ValueError:
                print('wrong format:', line.split())
                continue
            num += 1
            if num % 1000000 == 0:
                print(num, 'reads parsed')
            # '+' reads are binned by their start, '-' reads by their end
            if stra == '+':
                mid = int(functions.div(start, step))
            else:
                mid = int(functions.div(end, step))
            if chm not in self.data:
                self.data[chm] = {
                    '+': numpy.array([0.0]),
                    '-': numpy.array([0.0])
                }
                sizes[chm] = 0
            if mid >= sizes[chm]:
                # grow both strands in chunks of 1000 bins
                sizes[chm] = mid + 1000
                self.data[chm]['+'].resize(sizes[chm], refcheck=0)
                self.data[chm]['-'].resize(sizes[chm], refcheck=0)
            if mid >= 0:
                self.data[chm][stra][mid] += 1.0
    self.clearEmptyEnd()
    # make both strands of every chromosome the same length
    for chm in self.data:
        lth = max(self.data[chm]['+'].size, self.data[chm]['-'].size)
        self.data[chm]['+'].resize(lth, refcheck=0)
        self.data[chm]['-'].resize(lth, refcheck=0)
    if float(cut) > 0:
        self.rvClonal(cut=cut)
    print('parsing finished,', num, 'reads parsed')
def fragSizeDis(self, minsize=10, maxsize=300):
    '''
    Description:
        Calculate most probable size of DNA fragments from which a set of
        sequencing reads are sequenced. For every shift distance between
        minsize and maxsize the product of the '+' strand and the shifted
        '-' strand is summed over all chromosomes; the shifts scoring at
        least 95% of the best score are averaged (weighted by score) to
        give the final size. The distribution and warnings are printed.

    Parameters:
        minsize: minimal shift distance
        maxsize: maximal shift distance

    Value:
        An integer representing the most probable fragment size
        (the selected shift, in bins, multiplied by self.step)
    '''
    cut = 1e-10  # P value used to derive the clonal read cutoff below
    step = self.step
    # shift limits converted to bins
    minsize = functions.div(minsize, step)
    maxsize = functions.div(maxsize, step) + 1
    avg = self.mean(
    ) * self.step  # '*self.step' was added by Kaifu on Aug 1st, 2012
    # R closure evaluated through r(); presumably rpy -- TODO confirm
    ppois = r(
        '''function(q,avg){return(ppois(q,avg,lower.tail=FALSE,log=TRUE))}'''
    )
    lgpcut = 0 - functions.div(log(cut), log(10))  # -log10 of the P value
    cut = int(avg + 0.5)
    # raise the count cutoff until the -log10 of the value returned by
    # ppois reaches lgpcut; the number is parsed from the printed R result
    while (0 - (functions.div(float(str(ppois(cut, avg.item())).split()[-1]),
                              log(10))) < lgpcut):
        cut += 1
    if cut < 1:
        cut = 1
    print('calculating fragment size ...')
    dic = {}  # shift (in bins) -> accumulated strand product
    for i in range(int(minsize), int(maxsize)):
        dic[i] = 0
    for chm in self.data:
        sz = self.data[chm]['+'].size - maxsize
        if sz <= 0:
            # chromosome too short for the largest shift
            continue
        tchr = deepcopy(self.data[chm])
        for stra in list(
                tchr.keys()
        ):  # remove clonal reads, only necessary in danpos 2.2.0 and later versions
            tchr[
                stra] -= cut  # all positive values are count of clonal reads
            tchr[stra] = functions.div(((tchr[stra]**2)**0.5 + tchr[stra]),
                                       2)  # remove all negative values
            # the copy now holds the original counts minus the clonal excess
            tchr[stra] = self.data[chm][stra] - tchr[stra]
        for i in range(int(minsize), int(maxsize)):
            # '+' strand times the '-' strand shifted by i bins
            c = tchr['+'][:int(sz)] * tchr['-'][i:(int(sz) + i)]
            dic[i] += c.sum()
    p = []  # shifts whose score is at least 95% of the best score
    m = max(dic.values())
    if m < 1:
        m = 1
    print('sizes distribution:')
    for i in range(int(minsize), int(maxsize)):
        # one dash per percent of the best score
        oline = ""
        for j in range(
                0,
                int(functions.div(100 * dic[i], m)),
        ):
            oline += '-'
        print(oline, str(i * step) + 'bp', str(dic[i]))
        if dic[i] >= m * 0.95:
            p.append(i)
    # warn when a candidate lies within 3 bins of either search limit
    warning = False
    for i in range(int(minsize), int(minsize) + 3):
        if i in p:
            warning = True
    for i in range(int(maxsize) - 3, int(maxsize)):
        if i in p:
            warning = True
    if warning:
        print(
            'warning: the probilities of calculated size and up/bottom sizes are too close, we suggest to change up/bottom limit and try again!'
        )
    # score-weighted mean of the candidate shifts
    # NOTE(review): p is empty when no shift scores >= 0.95 * m (e.g. all
    # scores are 0), leaving dpv at 0.0 -- confirm how functions.div
    # handles that.
    upv, dpv = 0.0, 0.0
    for j in p:
        upv += j * dic[j]
        dpv += dic[j]
    finalsize = int(functions.div(upv, dpv) + 0.5)
    if (finalsize - minsize) < 3:
        print(
            'warning: the calculated fragment size seems too close the the bottom limit, we suggest to change bottom limit and try again!'
        )
    if (maxsize - finalsize) < 3:
        print(
            'warning: the calculated fragment size seems too close the the up limit, we to change up limit and try again!'
        )
    print('potential size:', end=' ')
    for t in p:
        print(t * step, end=' ')
    print('')
    print('calculated fragment size:', finalsize * step)
    return finalsize * step
def test_div_float(self):
    """div(4, 2.5) must compare equal to 1.6."""
    quotient = div(4, 2.5)
    self.assertEqual(quotient, 1.6)
def test_divide_by_float(self):
    """div must give the expected quotient when either operand is a float."""
    cases = (
        (2, 0.5, 4),
        (0.5, 4, 0.125),
        (0.5, 0.5, 1),
    )
    for dividend, divisor, expected in cases:
        self.assertEqual(div(dividend, divisor), expected)
def test_str_nymber(self):
    """div called with the numeric strings '4' and '2' must equal 2."""
    quotient = div('4', '2')
    self.assertEqual(quotient, 2)
def test_divide_by_string(self):
    """div must return None when an operand is a non-numeric string."""
    cases = (
        (2, "nic"),
        ("cos", "nic"),
        ("cos", 1),
    )
    for dividend, divisor in cases:
        self.assertIsNone(div(dividend, divisor))