Ejemplo n.º 1
0
 def rvClonal(self, cut=1e-10):
     '''
     Description:
         Remove clonal reads. Note that this process may change the difference between samples.
         If you do not want to change the difference, convert all samples to wigs and use the rvClonal function of the Wigs class.

     Parameter:
         cut: the cutoff for removing clonal reads, either a P value larger than 0 and smaller than 1, or a count that is 1 or larger.
             A P value is converted to a count cutoff by a Poisson test against the average reads density.
             Nothing is done when cut <= 0.

     Value:
         None, the counts in self.data are clipped in place.
     '''
     ss = time()
     cut = float(cut)
     if cut <= 0: return
     print('removing clonal reads ...')
     avg = functions.div(
         self.mean(), self.step
     )  ##### '*self.step' is added by Kaifu on Aug 1st, 2012 ##### and is further changed to be /self.step on May29, 2014
     if cut < 1:
         # Translate the P value into a count: starting from the rounded
         # average, raise the count until the -log10 of the upper-tail
         # Poisson probability (as reported by functions.ppois) reaches the
         # requested cutoff.
         lgpcut = 0 - functions.div(log(cut), log(10))
         cut = float(avg + 0.5)
         while 0 - functions.div(
                 functions.ppois(cut,
                                 float(avg),
                                 lower_tail=False,
                                 log_bool=True), log(10)) < lgpcut:
             cut += 1
         if cut < 1: cut = 1
     print('whole genome average reads density is', avg, 'use cutoff:', cut)
     cut *= self.step  ##### added by Kaifu on May29, 2014
     before = self.sum()
     print('before removing:', before, 'reads')
     for chm in self.data:
         for stra in list(self.data[chm].keys()):
             # everything above the cutoff is counted as clonal and removed,
             # i.e. each position is clipped down to the cutoff
             clonal = numpy.maximum(self.data[chm][stra] - cut, 0)
             self.data[chm][stra] -= clonal
     after = self.sum()
     print('after removing:', after, 'reads')
     print(functions.div((before - after) * 100, before),
           'percent removed.')
     print('time cost:', time() - ss)
Ejemplo n.º 2
0
    def loadBam(self, file="", step=10, cut=1e-10):
        '''
        Description:
            load single-end reads data from a file in '.bam' format

        parameter:
            file: a path to the file containing the sequencing reads
            step: each chromosome is represented as a vector, each value in the vector represents a short region of the chromosome, the step value is the size of the short region
            cut: the cutoff for removing clonal reads, could be a P value larger than 0 and smaller than 1, or a count as a positive integer.
        Value:
            None
        '''
        print('\nparsing from Bam file', file, '...')

        sizes = {}
        num = 0
        # the context manager closes the BAM file even if parsing fails
        with pysam.AlignmentFile(file, "rb") as sam_reader:
            for sam in sam_reader.fetch():
                # reset per read so the error report below never shows a
                # stale or undefined name
                name = None
                try:
                    # skip unmapped reads
                    if sam.is_unmapped:
                        continue
                    name = sam.query_name
                    chm = sam.reference_name
                    start = int(sam.reference_start + 1)
                    end = start + sam.reference_length
                    # reverse reads are counted on the '-' strand
                    stra = '-' if sam.is_reverse else '+'
                except Exception:
                    print(name)
                    continue
                num += 1
                if num % 1000000 == 0: print(num, 'reads parsed')
                # '+' reads are binned by their start, '-' reads by their end
                if stra == '+': mid = int(functions.div(start, step))
                else: mid = int(functions.div(end, step))
                if chm not in self.data:
                    self.data[chm] = {
                        '+': numpy.array([0.0]),
                        '-': numpy.array([0.0])
                    }
                    sizes[chm] = 0
                if mid >= sizes[chm]:
                    # grow both strands in chunks of 1000 bins
                    sizes[chm] = mid + 1000
                    self.data[chm]['+'].resize(sizes[chm], refcheck=0)
                    self.data[chm]['-'].resize(sizes[chm], refcheck=0)
                if mid >= 0: self.data[chm][stra][mid] += 1.0
        if num == 0: sys.exit("ERROR: Empty Bam File")
        self.clearEmptyEnd()
        # make both strands of each chromosome the same length
        for chm in self.data:
            lth = max(self.data[chm]['+'].size, self.data[chm]['-'].size)
            self.data[chm]['+'].resize(lth, refcheck=0)
            self.data[chm]['-'].resize(lth, refcheck=0)
        if cut > 0: self.rvClonal(cut=cut)
        print('parsing finished,', num, 'reads parsed')
Ejemplo n.º 3
0
 def samplingNormalize(self):
     '''
     Description:
         Normalize between Wig class instances by sampling to same coverage
     Parameter:
         None
     Value:
         1, the Wig instances in self.data are modified in place.
     '''
     from random import randint
     ss = time()
     wigs = self.data
     wsum = {}
     for wig in wigs:
         wsum[wig] = wigs[wig].sum()
     asum = functions.div(sum(wsum.values()), len(wsum))
     for k in wigs:
         wig = wigs[k]
         oldsum = wig.sum()
         # below the average: sample only the missing amount on top of the
         # existing signal; otherwise re-sample the whole average amount
         num = oldsum - asum
         if num < 0: num = 0 - num
         else: num = asum
         for chm in wig.data:
             values = wig.data[chm]
             # Pool of bin indexes to draw from: every bin index is repeated
             # once per (rounded) unit of signal it holds. The pool is an
             # integer array so that its items can be used as indexes.
             counts = numpy.floor(values + 0.5).astype(int)
             counts[counts < 0] = 0
             positions = numpy.repeat(numpy.arange(values.size), counts)
             tsz = positions.size
             tnum = int(
                 functions.div(
                     (functions.div(num * wig.chrSum(chm), oldsum)),
                     wig.step))
             if oldsum > asum: wig.data[chm] *= 0
             # nothing to draw from when the chromosome holds no signal
             if tsz == 0: continue
             for _ in range(tnum):
                 wig.data[chm][positions[randint(0, tsz - 1)]] += 1
     sys.stdout.write('time cost' + str(time() - ss) + "\n")
     return 1
Ejemplo n.º 4
0
 def toWig(self, fs=None, extend=0, mifrsz=10, mafrsz=300):
     '''
     Description:
         Calculate nucleosome occupancy from the reads data.
         Note that the reads stored in self.data are shifted in place while the wig is built.

     Parameter:
         fs: average size of the fragments that were sequenced to generate the reads, only for single-end reads. When this value is not given, it is inferred by self.fragSizeDis(). For paired-end reads loaded by the function loadBedPaired(), set fs to 0.
         extend: an integer value, each read will be extended to this length. When not positive, fs is used.
         mifrsz: the minimal estimated average fragment size, only for single-end reads
         mafrsz: the maximal estimated average fragment size, only for single-end reads

     Value: a Wig class instance
     '''
     step = self.step
     if fs is None:
         fs = self.fragSizeDis(minsize=mifrsz, maxsize=mafrsz)
     if extend <= 0:
         extend = fs
     print('extend to', extend)
     old_extend = extend
     # from here on both sizes are half lengths expressed in bins
     fragsize = functions.div(fs, (2 * step))
     extend = functions.div(extend, (2 * step))
     wg = Wig(step=step)
     print('generating wig ...')
     for chm in self.data:
         strands = self.data[chm]
         # make sure both strands are long enough for the shifts below
         tmax = max(1000, fragsize * 4, extend * 4)
         for strand in ('+', '-'):
             if strands[strand].size < tmax:
                 strands[strand].resize(tmax, refcheck=0)
         wg.addChr(chm)
         plus, minus = strands['+'], strands['-']
         lth = int(plus.size)
         wg.resizeChr(chm, lth * step)
         # shift '+' reads downstream by half a fragment ...
         plus[fragsize:lth] = plus[0:(lth - fragsize)]
         if fragsize > 0:
             plus[:fragsize] = 0
         # ... and add the '-' reads shifted upstream by the same amount
         plus[0:(lth - fragsize)] += minus[fragsize:lth]
         # spread every shifted read over the window [-extend, extend]
         for shift in range(-extend, extend + 1):
             wg.data[chm][extend:(lth - extend)] += plus[(extend + shift):(
                 lth - extend + shift)]
     wg.foldChange(functions.div(
         old_extend * 1.0, wg.step))  ##### added by Kaifu on May29, 2014
     return wg
Ejemplo n.º 5
0
def testDiv(operands):
    """Return ``functions.div(operands[0], operands[1])``.

    The directory 'project/src/calculate-activity' is appended to
    ``sys.path`` first so that the ``functions`` module can be imported.
    """
    sys.path.append('project/src/calculate-activity')

    import functions as fun

    dividend, divisor = operands[0], operands[1]
    return fun.div(dividend, divisor)
Ejemplo n.º 6
0
    def mean(self):
        '''
        Description:
            calculate the average reads count, i.e. self.sum() divided by self.size()

        Parameter:
            None

        Value:
            the average, as returned by functions.div
        '''
        total = self.sum() * 1.0
        return functions.div(total, self.size())
Ejemplo n.º 7
0
 def ajClonal(self,cut=1e-10,extend=1):  ###### add by Kaifu on Nov 14, 2012
     '''
     Description:
         Adjust clonal reads count, fold change between samples will not be altered in this process.
     Parameter:
         cut: the cutoff used to define clonal reads, given as a number or as a numeric string.
             When it is 1 or larger, a read count larger than cut will be defined as clonal;
             when it is between 0 and 1, a read count that is larger than mean count by a Poisson test P value < cut will be defined as clonal;
             when it is 0 (or negative), nothing is done.
         extend: the extension length of each read that is used to calculate the wiggle file. Extension length means the length from 5' end to 3' end,
             e.g. a read may be 36bp when it is generated by the sequencing machine, but it might have been extended to be 80bp or cut to be 1 bp,
             so the extension length will then be 80bp or 1bp.
     Value:
         None
     Note:
         all wiggle file in a Wigs object must have the same step size.
     '''
     from math import log10,log
     # accept both numbers and numeric strings such as '1e-10' or '0'
     cut=float(cut)
     if cut<=0:return
     # m is the average of all samples; clonal signal is defined on it
     ks=list(self.keys())
     m=deepcopy(self.get(ks[0]))
     for k in ks[1:]:m.add(self.get(k))
     m.foldChange(functions.div(1.0,len(ks)))
     avg=m.mean()
     if cut<1:
         # translate the P value into a count cutoff: raise the count until
         # the -log10 of the Poisson upper-tail probability reaches -log10(P)
         lgpcut=0-log10(cut)
         cut=int(avg+0.5)
         ppois=r('''function(q,avg){return(ppois(q,avg,lower.tail=FALSE,log=TRUE))}''')
         # the value is parsed from the last token of the printed R result
         while(0-(functions.div(float(str(ppois(functions.div(cut*1.0,extend),functions.div(avg*1.0,extend))).split()[-1]),log(10)))<lgpcut):cut+=1
     sys.stdout.write('aveage density is ' + str(avg) + ', use clonal signal cutoff ' + str(cut) +  "\n")
     for chrom in m.getChrs():
         # part of the averaged signal that exceeds the cutoff
         excess=numpy.maximum(m.data[chrom]-cut,0)
         # clip to the cutoff, then add log(excess+1) to keep the rank order of values (the '+1' avoids log(0))
         adjusted=m.data[chrom]-excess+numpy.log(excess+1)
         sys.stdout.write(chrom+":")
         for k in ks:
             twg=self.get(k)
             if chrom not in twg.data:continue
             temp=twg.data[chrom].sum()
             sys.stdout.write('\t'+str(k)+' reduced from '+str(temp)+' to ')
             if twg.data[chrom].size!=adjusted.size:twg.data[chrom].resize(adjusted.size,refcheck=0)
             # scale every sample by the same per-position ratio so that the fold change between samples is kept
             twg.data[chrom]=functions.div(adjusted*twg.data[chrom],(m.data[chrom]+1e-100)) # the addition of '1e-100' is to avoid divide by 0
             sys.stdout.write(str(twg.data[chrom].sum())+ ', percent removed: '+str(100-functions.div(twg.data[chrom].sum()*100.0,temp))+"\n")
Ejemplo n.º 8
0
 def autocorrelation(self, ofile, minsize=10, maxsize=300):
     '''
     Description:
         Calculate correlation between the two strands allowing shift distances.

     Parameters:
         ofile: the output file used to write the result to, one "distance<TAB>coefficient" line per shift distance
         minsize: minimal shift distance
         maxsize: maximal shift distance

     Value:
         a dictionary with shift distance (in steps) as key and the correlation coefficient corresponding to the distance as value
     '''
     step = self.step
     minsize = functions.div(minsize, step)
     maxsize = functions.div(maxsize, step) + 1
     print('calculating ...')
     dic = {}
     for i in range(int(minsize), int(maxsize)):
         dic[i] = 0
         szsum = 0
         for chm in self.data:
             sz = int(self.data[chm]['+'].size - maxsize)
             # a chromosome shorter than the largest shift has no usable
             # window (a negative bound would silently slice from the end)
             if sz <= 0: continue
             plus = self.data[chm]['+'][:sz]
             minus = self.data[chm]['-'][i:(sz + i)]
             # Pearson correlation of the '+' strand with the shifted '-'
             # strand, weighted by the window size of the chromosome
             dic[i] += sz * functions.div(
                 ((plus * minus).mean() - plus.mean() * minus.mean()),
                 (plus.std() * minus.std()))
             szsum += sz
         dic[i] = functions.div(dic[i], (szsum * 1.0))
     # the context manager makes sure the result is flushed and closed
     with open(ofile, 'w') as fo:
         for k in sorted(dic):
             fo.write(str(k) + '\t' + str(dic[k]) + '\n')
     return dic
Ejemplo n.º 9
0
def testDiv(x, y):
	"""Return ``div(int(x), int(y))`` from the project's ``functions`` module.

	The project source directory is derived from the current working
	directory (with '/testCasesExecutables' stripped) and put at the front of
	``sys.path`` so that ``functions`` can be imported.

	Returns the string "ERROR" when the conversion to int or the division
	raises an exception.
	"""
	# get directory for functions
	project_src = os.getcwd().replace('/testCasesExecutables', '') + '/project/src'
	# avoid stacking the same entry on every call
	if not sys.path or sys.path[0] != project_src:
		sys.path.insert(0, project_src)

	from functions import div

	try:
		return div(int(x), int(y))
	except Exception:
		# Exception (not a bare except) so Ctrl-C / SystemExit still propagate
		return "ERROR"
Ejemplo n.º 10
0
 def foldNormalize(self,scalepairs=None,sampling_total=None,nonzero=False):
     '''
     Description:
         Normalize between Wig class instances by fold change
     Parameter:
         scalepairs: optional dictionary with sample name as key and the target total as value;
             when not given, every sample is scaled to the average total of all samples.
         sampling_total: optional dictionary with sample name as key and the total that
             represents the sample as value; when not given, the sum of each wig is used.
         nonzero: when True, a further correction based on the count of non-zero base pairs is applied.
     Value:
         1, the Wig instances in self.data are scaled in place.
     '''
     wigs=self.data
     names=sorted(wigs.keys())

     if sampling_total is None:
         wsum={}
         for wig in wigs:wsum[wig]=wigs[wig].sum()
         asum=functions.div(sum(wsum.values()),len(wsum))
         for wig in names:
             sys.stdout.write(wig + ' from ' + str(wigs[wig].sum()) + ' to ')
             if scalepairs is None:wigs[wig].foldChange(functions.div(asum*1.0,wsum[wig]))
             else:wigs[wig].foldChange(functions.div(scalepairs[wig]*1.0,wsum[wig]))
             sys.stdout.write(str(wigs[wig].sum()) + "\n")
     else:
         average_total=functions.div(sum(sampling_total.values()),len(sampling_total))
         for name in names:
             sys.stdout.write(name + ' from ' + str(wigs[name].sum()) + ' to ')
             if scalepairs is None:wigs[name].foldChange(functions.div(average_total*1.0,sampling_total[name]))
             # '*1.0' as in the three sibling branches, so the fold is a float ratio
             else:wigs[name].foldChange(functions.div(scalepairs[name]*1.0,sampling_total[name]))
             sys.stdout.write(str(wigs[name].sum()) + "\n")
     if nonzero:
         sys.stdout.write('further correction based on count of non-zero base pairs\n')
         gsizes,non0sizes={},{}
         for wig in wigs:
             gsizes[wig]=wigs[wig].gsize()
             non0sizes[wig]=wigs[wig].non0size()
         agsize=functions.div(sum(gsizes.values())*1.0,len(gsizes))
         for wig in wigs:
             sys.stdout.write(wig + ' from ' + str(wigs[wig].sum()) + ' to ')
             wigs[wig].foldChange(functions.div(non0sizes[wig],agsize))
             # the numbers must be converted explicitly, str + number raises TypeError
             sys.stdout.write(str(wigs[wig].sum()) +
                 ' based on non0size ' + str(non0sizes[wig]) + ' and genome size '
                 + str(agsize) + "\n")
     return 1
Ejemplo n.º 11
0
def refquantile(paths,ofile,gfile):
    '''
    Description:
        Build a reference by averaging, line by line, the first column of several files.

    Parameter:
        paths: the input file paths joined by ':'
        ofile: the output file, each line holds the average followed by three '-' columns
        gfile: not used by this function

    Value:
        None
    '''
    print('\nPreparing reference ...')
    tm=time()
    files=paths.split(':')
    nfile=len(files)
    fi={}
    try:
        for file in files:
            if file not in fi:fi[file]=open(file)
        with open(ofile,'w') as fo:
            # the first file drives the iteration, one line is read from each
            # of the other files per step
            for line in fi[files[0]]:
                v=float(line.split()[0])
                for file in files[1:]:
                    v+=float(fi[file].readline().split()[0])
                fo.write(str(div(v,nfile))+'\t-\t-\t-\n')
    finally:
        # release every input handle, also when a line fails to parse
        for handle in fi.values():handle.close()
    print('time cost:',time()-tm)
Ejemplo n.º 12
0
def changevalue(ifile,ref,ofile,gfile,step=10,suppress=False,buffer=None):
    '''
    Description:
        Replace the values of ifile by the values of the reference file, line by line, and save the result as a wig.

    Parameter:
        ifile: input file, the third column is used as chromosome name and the fourth as position
        ref: reference file, the first column of each line is the value to assign
        ofile: path the resulting wig is saved to
        gfile: passed to the Wig constructor
        step: step size of the wig
        suppress: passed to Wig.save
        buffer: not used by this function

    Value:
        None
    '''
    if ifile!=ref:print('\nnormalizing',ifile,'...')
    else:print('\nsaving reference ...')
    tm=time()
    # both handles are closed by the context manager, also on errors
    with open(ifile) as fi,open(ref) as fr:
        wg=Wig(step=step,gfile=gfile)
        for line in fi:
            col=line.split()
            rcol=fr.readline().split()
            # the reference ran out of lines: fall back to 0
            if len(rcol)==0:rcol=[0.0]
            cr,pos,vl=col[2],div(int(col[3]),step),float(rcol[0])
            wg.data[cr][pos]=vl
        # reference lines left over after the input is exhausted
        n=sum(1 for _ in fr)
    if n>0:print('Warning: the input genome size is smaller than the reference genome size by',n,'wiggle steps!')
    wg.save(ofile,suppress=suppress)
    print('time cost:',time()-tm)
Ejemplo n.º 13
0
    def samplingTotal(self,region_file=None,region_out_file=None,exclude_low_percent=1,exclude_high_percent=1,bnum=100000,nonzero=False):
        '''
        Description:
            calculate the sum of each wig's values within selected regions, either the regions
            left after excluding the low and high percentile or the regions given by region_file
        Parameter:
            region_file: optional wig file defining the regions to sum over
            region_out_file: optional path used to save the regions selected by percentile
            exclude_low_percent: bottom percent of signal values to exclude
            exclude_high_percent: top percent of signal values to exclude
            bnum: passed to Wig.percentile
            nonzero: passed to Wig.percentile as nonzero_end
        Value:
            a dictionary with wig name as key and the sum as value,
            or None when nothing is excluded and no region_file is given
        '''
        names=list(self.data.keys())
        nothing_excluded=(exclude_low_percent==0 and exclude_high_percent==0)
        if nothing_excluded and region_file is None:
            return None
        if region_file is not None:
            sys.stdout.write('calculate total signal in each sample in genomic regions defined by' + region_file + "\n")
            rg=Wig(region_file)
        else:
            sys.stdout.write(''.join([
                'calculate total signal in each sample after excluding the top ',
                str(exclude_high_percent),
                ' and bottom ',
                str(exclude_low_percent),
                'percents of genomic regions with extremely high and low signal values\n']))
            wsums={name:self.data[name].sum() for name in names}
            wavg=functions.div(sum(wsums.values()),len(wsums))

            # reference wig: the average of all samples after each one is
            # scaled to the same total
            first=names[0]
            rfwig=deepcopy(self.data[first])
            rfwig.foldChange(functions.div(wavg*1.0,wsums[first]))
            for name in names[1:]:
                sample=self.data[name]
                # scale temporarily, add to the reference, then scale back
                sample.foldChange(functions.div(wavg*1.0,wsums[name]))
                rfwig.add(sample)
                sample.foldChange(functions.div(wsums[name]*1.0,wavg))
            rfwig.foldChange(functions.div(1.0,len(names)))

            percents=[exclude_low_percent,100-exclude_high_percent]
            lowcut,highcut=rfwig.percentile(p=percents,bnum=bnum,nonzero_end=nonzero)
            rg=rfwig.regionWithinValueRange(lowcut,highcut)
            if region_out_file is not None:
                rg.save(region_out_file)

        sampling_total={}
        for name in names:
            sampling_total[name]=self.data[name].multiply(rg).sum()
        sys.stdout.write(''.join([
            str(rg.sum()),
            ' (',
            str(functions.div(rg.sum()*100.0,rg.gsize())),
            ' %) of ',
            str(rg.gsize()),
            ' base pairs calculated:\n']))
        for name in names:
            percent=functions.div(sampling_total[name]*100.0,self.data[name].sum())
            sys.stdout.write(name + str(sampling_total[name]) + ' (' + str(percent) + '% of total)\n')

        return sampling_total
Ejemplo n.º 14
0
 def sizeAdjust(self, gfile):
     '''
     Description:
         Adjust the size of each chromosome. Chromosomes that are not listed in gfile are removed.

     Parameter:
         gfile: path to the file containing the size of each chromosome, each line in the file would be in the format "chromosome_name size", in which size is an integer value, and chromosome_name should contain no empty space

     value:
         None.
     '''
     sizes = {}
     with open(gfile) as fi:
         for line in fi:
             col = line.split()
             # skip blank or incomplete lines
             if len(col) < 2: continue
             sizes[col[0]] = functions.div(int(col[1]), self.step)
     # iterate over a snapshot of the keys, entries are removed in the loop
     for chm in list(self.data):
         if chm not in sizes:
             self.data.pop(chm)
             continue
         for stra in self.data[chm]:
             self.data[chm][stra].resize(sizes[chm], refcheck=0)
Ejemplo n.º 15
0
 def test_div_four_by_two(self):
     """div is expected to return 2 for the string operands '4' and '2'."""
     quotient = div('4', '2')
     self.assertEqual(quotient, 2)
Ejemplo n.º 16
0
 def test_divide_by_zero(self):
     """div is expected to return None when the divisor is zero."""
     outcome = div(1, 0)
     self.assertIsNone(outcome)
Ejemplo n.º 17
0
    def loadBamPaired(self, file="", step=10, cut=1e-10):
        '''
        Description:
            load paired-end reads data from a file in '.bam' format

        parameter:
            file: a path to the file containing the sequencing reads
            step: each chromosome is represented as a vector, each value in the vector represents a short region of the chromosome, the step value is the size of the short region
            cut: the cutoff for removing clonal reads, could be a P value larger than 0 and smaller than 1, or a count as a positive integer.
        Value:
            None
        '''
        print('\nparsing from bam file', file, '...')

        sizes = {}
        fragsizes = {}
        num = 0
        # the context manager closes the BAM file on every exit path,
        # including the sys.exit() below
        with pysam.AlignmentFile(file, "rb") as sam_reader:
            for sam in sam_reader.fetch():
                try:
                    if not sam.is_proper_pair:
                        continue
                    ref_start = int(sam.reference_start + 1)
                    next_ref_start = int(sam.next_reference_start + 1)
                    rlen = sam.reference_length
                    chm = sam.reference_name
                    stra = '-' if sam.is_reverse else '+'
                    mid = int(
                        functions.div((ref_start + next_ref_start + rlen),
                                      (2 * step)))
                    # distance between the two mate starts plus one read length
                    fragsize = abs(next_ref_start - ref_start) + rlen
                    fragsizes[fragsize] = fragsizes.get(fragsize, 0) + 1
                except Exception:
                    print(sam)
                    sys.exit("SOME REASON DID NOT WORK...")
                num += 1
                if num % 1000000 == 0: print(num, 'reads parsed')
                if chm not in self.data:
                    self.data[chm] = {
                        '+': numpy.array([0.0]),
                        '-': numpy.array([0.0])
                    }
                    sizes[chm] = 0
                if mid >= sizes[chm]:
                    # grow both strands in chunks of 1000 bins
                    sizes[chm] = mid + 1000
                    self.data[chm]['+'].resize(sizes[chm], refcheck=0)
                    self.data[chm]['-'].resize(sizes[chm], refcheck=0)
                if mid >= 0: self.data[chm][stra][mid] += 1.0
        if num == 0: sys.exit("ERROR: Empty Bam File")
        self.clearEmptyEnd()
        # make both strands of each chromosome the same length
        for chm in self.data:
            lth = max(self.data[chm]['+'].size, self.data[chm]['-'].size)
            self.data[chm]['+'].resize(lth, refcheck=0)
            self.data[chm]['-'].resize(lth, refcheck=0)
        if float(cut) > 0: self.rvClonal(cut=cut)
        print('parsing finished,', num, 'reads parsed')
        # text histogram of the fragment sizes, scaled so that the most
        # frequent size gets 100 dashes; sizes with 4 or fewer dashes are hidden
        maxv = max(fragsizes.values())
        tlth, count = 0, 0
        maxlth = []
        for lth in sorted(fragsizes):
            if fragsizes[lth] == maxv: maxlth.append(lth)
            tlth += lth * fragsizes[lth]
            count += fragsizes[lth]
            dcount = int(np.ceil(functions.div(100 * fragsizes[lth], maxv)))
            if dcount > 4: print('-' * dcount, lth, fragsizes[lth])
        print('average fragment size:', functions.div(tlth * 1.0, count))
        print('most enriched fragment size:', maxlth)
Ejemplo n.º 18
0
from functions import add, sub, mult, div

# Chain the four arithmetic helpers and print every intermediate result:
# start with add(8, 9), then apply sub(., 3), mult(., 4) and div(., 2).
x = add(8, 9)
print(x)

for operation, operand in ((sub, 3), (mult, 4), (div, 2)):
    x = operation(x, operand)
    print(x)

Ejemplo n.º 19
0
    def loadBedPaired(self, file="", step=10, cut=1e-10):
        '''
        Description:
            load paired-end reads data from a file in '.bed' format, the "name"
            fields of each reads pair must be the same except for the last one
            character in each field, which should be either '1' or '2', e.g.
            'reads/1' or 'reads/2', each pair of reads must be arranged in two
            neighboring lines.

        parameter:
            file: a path to the file containing the sequencing reads, a name ending with 'gz' is read as a gzip file
            step: each chromosome is represented as a vector, each value in the vector represents a short region of the chromosome, the step value is the size of the short region
            cut: the cutoff for removing clonal reads, could be a P value larger than 0 and smaller than 1, or a count as a positive integer.
        Value:
            None
        '''
        print('\nparsing from bed file', file, '...')

        sizes = {}
        fragsizes = {}
        num = 0
        serr, nerr = 0, 0
        end1, end2 = [], []
        # gzip.open defaults to binary mode, text mode is required because
        # the lines are parsed as str below
        opener = gzip.open if file[-2:] == 'gz' else open
        with opener(file, 'rt') as infile:
            for line in infile:
                # strip only the line terminator, so a last line without a
                # newline keeps its final (strand) character
                line = line.rstrip('\r\n')
                try:
                    col = line.split('\t')
                    tnames = col[3].split()
                    if len(tnames) == 2:
                        ########## add by kaifu on sep 5, 2012 ########## some time the reads name have two words separated by a space (not '\t')
                        col[3] = tnames[0] + col[3][-1]
                    col[1], col[2] = int(col[1]), int(col[2])
                except (IndexError, ValueError):
                    print('wrong line:', line)
                    continue
                num += 1
                if num % 1000000 == 0: print(num, 'reads parsed')
                if len(end1) < 1:
                    # first read of a potential pair, wait for its mate
                    end1 = col
                    continue
                end2 = col
                if end1[3][:-1] != end2[3][:-1]:
                    # names differ: end1 has no mate, end2 may start a new pair
                    nerr += 1
                    end1, end2 = end2, []
                    continue
                if end1[5] == '+' and end2[5] == '-':
                    chm, mid, fragsize = end1[0], int(
                        functions.div((end1[1] + end2[2]),
                                      (2 * step))), end2[2] - end1[1]
                elif end1[5] == '-' and end2[5] == '+':
                    chm, mid, fragsize = end1[0], int(
                        functions.div((end1[2] + end2[1]),
                                      (2 * step))), end2[1] - end1[2]
                else:
                    # both reads of the pair are on the same strand
                    serr += 1
                    end1, end2 = [], []
                    continue
                fragsizes[fragsize] = fragsizes.get(fragsize, 0) + 1

                if chm not in self.data:
                    self.data[chm] = {
                        '+': numpy.array([0.0]),
                        '-': numpy.array([0.0])
                    }
                    sizes[chm] = 0
                if mid >= sizes[chm]:
                    # grow both strands in chunks of 1000 bins
                    sizes[chm] = mid + 1000
                    self.data[chm]['+'].resize(sizes[chm], refcheck=0)
                    self.data[chm]['-'].resize(sizes[chm], refcheck=0)
                if mid > 0:
                    # the fragment midpoint is counted on both strands
                    self.data[chm]['+'][mid] += 1.0
                    self.data[chm]['-'][mid] += 1.0
                end1, end2 = [], []
        self.clearEmptyEnd()
        # make both strands of each chromosome the same length
        for chm in self.data:
            lth = max(self.data[chm]['+'].size, self.data[chm]['-'].size)
            self.data[chm]['+'].resize(lth, refcheck=0)
            self.data[chm]['-'].resize(lth, refcheck=0)
        print('parsing finished,', num, 'reads parsed')
        if not fragsizes:
            # no valid pair at all: there is nothing to summarize or to clean
            print(serr, 'pairs failed due to locations on same strands')
            print(nerr, 'reads have no mate reads')
            return
        # text histogram of the fragment sizes, scaled so that the most
        # frequent size gets 100 dashes; sizes with 4 or fewer dashes are hidden
        maxv = max(fragsizes.values())
        tlth, count = 0, 0
        maxlth = []
        for lth in sorted(fragsizes):
            if fragsizes[lth] == maxv: maxlth.append(lth)
            tlth += lth * fragsizes[lth]
            count += fragsizes[lth]
            dcount = int(np.ceil(functions.div(100 * fragsizes[lth], maxv)))
            if dcount > 4: print('-' * dcount, lth, fragsizes[lth])
        print('average fragment size:', functions.div(tlth * 1.0, count))
        print('most enriched fragment size:', maxlth)
        print(serr, 'pairs failed due to locations on same strands')
        print(nerr, 'reads have no mate reads')
        if cut > 0: self.rvClonal(cut=cut)
Ejemplo n.º 20
0
    def loadBed(self, file="", step=10, cut=1e-10):  #,loadcount=0):
        '''
        Description:
            load single-end reads data from a file in '.bed' format

        parameter:
            file: a path to the file containing the sequencing reads, a name ending with 'gz' is read as a gzip file
            step: each chromosome is represented as a vector, each value in the vector represents a short region of the chromosome, the step value is the size of the short region
            cut: the cutoff for removing clonal reads, could be a P value larger than 0 and smaller than 1, or a count as a positive integer.
        Value:
            None
        '''
        print('\nparsing from bed file', file, '...')

        sizes = {}
        num = 0
        # gzip.open defaults to binary mode, text mode is required because
        # the strand is compared with str values below
        opener = gzip.open if file[-2:] == 'gz' else open
        with opener(file, 'rt') as infile:
            for line in infile:
                try:
                    col = line.split()
                    chm, start, end, name, score, stra = col[0:6]
                    start, end = int(start), int(end)
                except ValueError:
                    # fewer than 6 columns or non-integer coordinates
                    print('wrong format:', line.split())
                    continue
                # '+' reads are binned by their start, '-' reads by their end;
                # any other strand value cannot be placed and is reported
                if stra == '+': mid = int(functions.div(start, step))
                elif stra == '-': mid = int(functions.div(end, step))
                else:
                    print('wrong format:', line.split())
                    continue
                num += 1
                if num % 1000000 == 0: print(num, 'reads parsed')
                if chm not in self.data:
                    self.data[chm] = {
                        '+': numpy.array([0.0]),
                        '-': numpy.array([0.0])
                    }
                    sizes[chm] = 0
                if mid >= sizes[chm]:
                    # grow both strands in chunks of 1000 bins
                    sizes[chm] = mid + 1000
                    self.data[chm]['+'].resize(sizes[chm], refcheck=0)
                    self.data[chm]['-'].resize(sizes[chm], refcheck=0)
                if mid >= 0: self.data[chm][stra][mid] += 1.0
        self.clearEmptyEnd()
        # make both strands of each chromosome the same length
        for chm in self.data:
            lth = max(self.data[chm]['+'].size, self.data[chm]['-'].size)
            self.data[chm]['+'].resize(lth, refcheck=0)
            self.data[chm]['-'].resize(lth, refcheck=0)
        if cut > 0: self.rvClonal(cut=cut)
        print('parsing finished,', num, 'reads parsed')
Ejemplo n.º 21
0
    def fragSizeDis(self, minsize=10, maxsize=300):
        '''
        Description:
            Calculate most probable size of DNA fragments from which a set of sequencing reads are sequenced.
            For every shift distance in the tested range, the '+' strand counts are multiplied by the
            '-' strand counts shifted by that distance and the products are summed over all chromosomes;
            the result is the score-weighted mean of the shifts scoring at least 95% of the best score.

        Parameters:
            minsize: minimal shift distance (bp, converted to units of self.step internally)
            maxsize: maximal shift distance (bp, converted to units of self.step internally)

        Value:
            An integer shift (in units of self.step) multiplied by self.step, representing the most probable fragment size

        Raises:
            ValueError: if no shift in the tested range scored at least 0.95, e.g. every chromosome is
                        shorter than maxsize or '+' and '-' reads never co-occur within the range.
        '''
        step = self.step
        minsize = functions.div(minsize, step)
        maxsize = functions.div(maxsize, step) + 1
        # Every loop below walks the same integer shift range, so build it once.
        shifts = range(int(minsize), int(maxsize))

        avg = self.mean(
        ) * self.step  ##### '*self.step' is added by Kaifu on Aug 1st, 2012 #####
        ppois = r(
            '''function(q,avg){return(ppois(q,avg,lower.tail=FALSE,log=TRUE))}'''
        )

        # Find the smallest count whose upper-tail Poisson probability (mean = avg)
        # satisfies -log10(P) >= 10; counts above it are treated as clonal reads.
        lgpcut = 0 - functions.div(log(1e-10), log(10))
        cut = int(avg + 0.5)
        # The R function returns a natural-log probability; its printed form is
        # parsed (last whitespace-separated token) and converted to log10.
        # NOTE(review): avg.item() presumes self.mean() returns a numpy scalar - confirm.
        while (0 - (functions.div(
                float(str(ppois(cut, avg.item())).split()[-1]),
                log(10))) < lgpcut):
            cut += 1
        if cut < 1: cut = 1

        print('calculating fragment size ...')
        dic = dict.fromkeys(shifts, 0)
        for chm in self.data:
            sz = self.data[chm]['+'].size - maxsize
            if sz <= 0: continue  # chromosome too short to be shifted by maxsize
            width = int(sz)

            # remove clonal reads, only necessary in danpos 2.2.0 and later versions:
            # cap every position at `cut` without copying the whole chromosome first.
            tchr = {}
            for stra in self.data[chm]:
                excess = self.data[chm][
                    stra] - cut  #all positive values are count of clonal reads
                excess = functions.div(((excess**2)**0.5 + excess),
                                       2)  #remove all negative values
                tchr[stra] = self.data[chm][stra] - excess

            plus = tchr['+'][:width]
            for i in shifts:
                c = plus * tchr['-'][i:(width + i)]
                dic[i] += c.sum()

        p = []
        m = max(dic.values())
        if m < 1: m = 1  # avoids dividing by zero when scaling the histogram bars
        print('sizes distribution:')
        for i in shifts:
            # one '-' per percent of the best score
            oline = '-' * int(functions.div(100 * dic[i], m))
            print(oline, str(i * step) + 'bp', str(dic[i]))
            if dic[i] >= m * 0.95: p.append(i)

        if not p:
            # Without this guard the weighted mean below divides 0.0 by 0.0.
            raise ValueError(
                'can not calculate fragment size: no shift between minsize and '
                'maxsize shows co-occurrence of + and - strand reads')

        # Warn when a top-scoring shift lies within 3 steps of either limit.
        edge_shifts = list(range(int(minsize),
                                 int(minsize) + 3)) + list(
                                     range(int(maxsize) - 3, int(maxsize)))
        if any(i in p for i in edge_shifts):
            print(
                'warning: the probilities of calculated size and up/bottom sizes are too close, we suggest to change up/bottom limit and try again!'
            )

        # Score-weighted mean of the top-scoring shifts, rounded to the nearest step.
        upv, dpv = 0.0, 0.0
        for j in p:
            upv += j * dic[j]
            dpv += dic[j]
        finalsize = int(functions.div(upv, dpv) + 0.5)
        if (finalsize - minsize) < 3:
            print(
                'warning: the calculated fragment size seems too close the the bottom limit, we suggest to change bottom limit and try again!'
            )
        if (maxsize - finalsize) < 3:
            print(
                'warning: the calculated fragment size seems too close the the up limit, we to change up limit and try again!'
            )
        print('potential size:', end=' ')
        for t in p:
            print(t * step, end=' ')
        print('')
        print('calculated fragment size:', finalsize * step)
        return finalsize * step
Ejemplo n.º 22
0
 def test_div_float(self):
     # An int divided by a float is expected to give the float quotient.
     quotient = div(4, 2.5)
     self.assertEqual(quotient, 1.6)
Ejemplo n.º 23
0
 def test_divide_by_float(self):
     # (dividend, divisor, expected quotient) with a float on either or both sides.
     cases = (
         (2, 0.5, 4),
         (0.5, 4, 0.125),
         (0.5, 0.5, 1),
     )
     for dividend, divisor, expected in cases:
         self.assertEqual(div(dividend, divisor), expected)
Ejemplo n.º 24
0
 def test_str_nymber(self):
     # Numeric strings are expected to be divided like the numbers they spell.
     quotient = div('4', '2')
     self.assertEqual(quotient, 2)
Ejemplo n.º 25
0
 def test_divide_by_string(self):
     # A non-numeric string as divisor, as both operands, or as dividend
     # is expected to make div() return None.
     operand_pairs = (
         (2, "nic"),
         ("cos", "nic"),
         ("cos", 1),
     )
     for dividend, divisor in operand_pairs:
         self.assertIsNone(div(dividend, divisor))