Ejemplo n.º 1
0
Archivo: wiq.py Proyecto: cschu/DANPOS3
def changevalue(ifile,
                ref,
                ofile,
                gfile,
                step=10,
                suppress=False,
                buffer=None):
    """Rewrite the values of ``ifile`` with the values found in ``ref``.

    The two files are read in lockstep, one line each per iteration.  For
    every line of ``ifile`` the third whitespace-separated field is used as
    the chromosome key and the fourth as the position; the value stored at
    that location is the first field of the matching ``ref`` line.  The
    result is collected in a new ``Wig`` and saved to ``ofile``.

    Args:
        ifile: path of the file whose positions are used.
        ref: path of the file whose values are used.  When it equals
            ``ifile`` the progress message says the reference is being saved.
        ofile: path handed to ``Wig.save``.
        gfile: forwarded to the ``Wig`` constructor.
        step: wiggle step; positions are divided by it to obtain an index.
        suppress: forwarded to ``Wig.save``.
        buffer: unused; kept for interface compatibility.

    Returns:
        None.
    """
    if ifile != ref:
        print('\nnormalizing', ifile, '...')
    else:
        print('\nsaving reference ...')
    tm = time()
    wg = Wig(step=step, gfile=gfile)
    # Context managers guarantee both handles are closed, even on a parse error.
    with open(ifile) as fi, open(ref) as fr:
        for line in fi:
            col = line.split()
            rcol = fr.readline().split()
            # An exhausted (or blank) reference line contributes 0.0.
            vl = float(rcol[0]) if rcol else 0.0
            # Floor division keeps the index an integer on Python 3; true
            # division would produce a float that cannot be used as an index.
            cr, pos = col[2], int(col[3]) // step
            wg.data[cr][pos] = vl
        # Whatever is left in the reference has no counterpart in the input.
        n = sum(1 for _ in fr)
    if n > 0:
        print(
            'Warning: the input genome size is smaller than the reference genome size by',
            n, 'wiggle steps!')
    wg.save(ofile, suppress=suppress)
    print('time cost:', time() - tm)
Ejemplo n.º 2
0
 def load(self, path, suppress=False):
     '''
     Description:
         Create one Wig instance per wiggle file found at the given location(s) and register it through self.set()
     Parameter:
         path: a comma-separated list of locations. A directory contributes every '*.wig' file directly inside it, a regular file contributes itself, anything else is skipped.
         suppress: forwarded unchanged to the Wig constructor
     Value:
         None
     '''
     for location in path.split(','):
         if os.path.isdir(location):
             wig_files = glob.glob(os.path.join(location, '*.wig'))
         elif os.path.isfile(location):
             wig_files = [location]
         else:
             # Neither a directory nor a file: nothing to load for this entry.
             continue
         for wig_file in wig_files:
             # The registration name is the file name without a trailing '.wig'.
             label = os.path.basename(wig_file)
             if label.endswith('.wig'):
                 label = label[:-4]
             self.set(label, Wig(wig_file, step=self.step, suppress=suppress))
Ejemplo n.º 3
0
def rawsort(ifile,sort_ofile,gfile,format,step=10,suppress=False,buffer=None):
    """Sort a wiq file with the external ``sort`` program.

    When ``format`` is ``'wig'`` the input is first loaded as a ``Wig``,
    resized against ``gfile`` and saved in "wiq" format to an intermediate
    file (``sort_ofile`` minus its last three characters plus ``'raw.wiq'``);
    that intermediate file is deleted after sorting.  Otherwise ``ifile`` is
    sorted directly.  ``sort`` runs with ``-r -n -s -k1 -k2`` and writes to
    ``sort_ofile``, using a scratch directory derived from ``ifile`` that is
    removed afterwards.

    The command is passed to ``sort`` as an argument list instead of a shell
    string, so paths containing spaces or shell metacharacters are handled
    literally.  Messages use the single-argument ``print(...)`` form, which
    emits the same text on Python 2 and Python 3.

    Args:
        ifile: path of the input file.
        sort_ofile: path of the sorted output file.
        gfile: forwarded to ``Wig`` and ``Wig.ajust_size`` (``'wig'`` only).
        format: ``'wig'`` triggers the conversion step; any other value
            means ``ifile`` is sorted as it is.
        step: forwarded to ``Wig`` / ``Wig.save``.
        suppress: forwarded to ``Wig`` / ``Wig.save``.
        buffer: optional value for ``sort --buffer-size``.

    Returns:
        None.
    """
    import shutil
    import subprocess

    tm = time()
    if format == 'wig':
        print('\nconverting %s ...' % (ifile,))
        raw_ofile = sort_ofile[:-3] + 'raw.wiq'
        wg = Wig(file=ifile, gfile=gfile, step=step, suppress=suppress)
        wg.ajust_size(gfile=gfile)
        wg.save(file=raw_ofile, format="wiq", step=step, suppress=suppress)
        print('time cost: %s' % (time() - tm,))
        tm = time()
    else:
        raw_ofile = ifile

    print('\nsorting %s ...' % (raw_ofile,))
    # Pick a scratch directory name that does not exist yet.
    temp = ifile[:-3] + 'temp'
    while os.path.isdir(temp):
        temp = temp + '.temp'
    os.mkdir(temp)

    cmd = ['sort', '-r', '-n', '-s', '-k1', '-k2', '-o', sort_ofile]
    if buffer is not None:
        cmd += ['--buffer-size', str(buffer)]
    cmd += ['--temporary-directory', temp, raw_ofile]
    try:
        status = subprocess.call(cmd)
        if status != 0:
            print('Warning: sort exited with status %s' % (status,))
    finally:
        # Clean up even when sort could not be started.
        if format == 'wig':
            print('Removing  %s ...' % (raw_ofile,))
            try:
                os.remove(raw_ofile)
            except OSError as err:
                print('Warning: could not remove %s: %s' % (raw_ofile, err))
        print('removing %s' % (temp,))
        shutil.rmtree(temp, ignore_errors=True)
    print('time cost: %s' % (time() - tm,))
Ejemplo n.º 4
0
 def toWig(self,fs=None,extend=0,mifrsz=10,mafrsz=300):
     '''
     Description:
         Calculate nucleosome occupancy from the reads data.
         NOTE: the per-chromosome '+' and '-' arrays in self.data are resized and overwritten in place while the occupancy is computed.
     
     Parameter:
         fs: average size of the fragments that were sequenced to generate the reads, only for single-end reads. When this value is not given, it is inferred by calling self.fragSizeDis(minsize=mifrsz,maxsize=mafrsz). For paired-end reads loaded by the function loadBedPaired(), set fs to 0.
         extend: an integer value, each read will be extended to this length. Values <= 0 mean "use fs".
         mifrsz: the minimal estimated average fragment size, only for single-end reads
         mafrsz: the maximal estimated average fragment size, only for single-end reads
     
     Value: a Wig class instance
     '''
     step=self.step
     if fs==None:fs=self.fragSizeDis(minsize=mifrsz,maxsize=mafrsz)
     if extend<=0:extend=fs
     print 'extend to',extend
     # Keep the un-scaled extension length; it is needed for the final fold change.
     old_extend=extend
     # Both lengths become half-lengths expressed in wiggle steps.
     # NOTE(review): the results are used as slice bounds below, so this
     # presumably relies on Python 2 integer division -- confirm fs/extend are ints.
     fragsize,extend=fs/(2*step),extend/(2*step)
     wg=Wig(step=step)
     print 'generating wig ...'
     for chr in self.data:
         # Make sure both strand arrays hold at least tmax entries so the
         # shifted slices below stay within bounds.
         tmax=max(1000,fragsize*4,extend*4)
         if self.data[chr]['+'].size<tmax:self.data[chr]['+'].resize(tmax,refcheck=0)
         if self.data[chr]['-'].size<tmax:self.data[chr]['-'].resize(tmax,refcheck=0)
         wg.addChr(chr)
         # NOTE(review): lth is taken from the '+' array only and then used to
         # slice the '-' array too -- assumes both have the same size; confirm.
         lth=self.data[chr]['+'].size
         wg.resizeChr(chr,lth*step)
         # Shift the '+' strand values fragsize entries to the right ...
         self.data[chr]['+'][fragsize:lth]=self.data[chr]['+'][0:(lth-fragsize)]
         # ... and clear the entries vacated at the start.
         for i in range(fragsize):self.data[chr]['+'][i]=0
         # Add the '-' strand values shifted fragsize entries to the left, so
         # the '+' array now holds the merged signal of both strands.
         self.data[chr]['+'][0:(lth-fragsize)]+=self.data[chr]['-'][fragsize:lth]
         # Accumulate the merged signal at every offset p in [-extend, extend]:
         # a sliding-window sum 2*extend+1 entries wide, written to wg.
         for p in range(-extend,extend+1):wg.data[chr][extend:(lth-extend)]+=self.data[chr]['+'][(extend+p):(lth-extend+p)]
     # Scale the whole result by (extension length / step).
     wg.foldChange(old_extend*1.0/wg.step) ##### added by Kaifu on May29, 2014
     return wg
Ejemplo n.º 5
0
def changevalue(ifile,ref,ofile,gfile,step=10,suppress=False,buffer=None):
    """Rewrite the values of ``ifile`` with the values found in ``ref``.

    Both files are consumed in lockstep.  Each ``ifile`` line supplies a
    chromosome key (third whitespace-separated field) and a position (fourth
    field); the value written at that location is the first field of the
    corresponding ``ref`` line, or 0.0 once ``ref`` has run out.  The result
    is collected in a new ``Wig`` and saved to ``ofile``.

    Messages use the single-argument ``print(...)`` form, which emits the
    same text on Python 2 and Python 3.

    Args:
        ifile: path of the file whose positions are used.
        ref: path of the file whose values are used.  When it equals
            ``ifile`` the progress message says the reference is being saved.
        ofile: path handed to ``Wig.save``.
        gfile: forwarded to the ``Wig`` constructor.
        step: wiggle step; positions are floor-divided by it to get an index.
        suppress: forwarded to ``Wig.save``.
        buffer: unused; kept for interface compatibility.

    Returns:
        None.
    """
    if ifile != ref:
        print('\nnormalizing %s ...' % (ifile,))
    else:
        print('\nsaving reference ...')
    tm = time()
    wg = Wig(step=step, gfile=gfile)
    # The with-block closes both handles; the original left them open.
    with open(ifile) as fi:
        with open(ref) as fr:
            for line in fi:
                col = line.split()
                rcol = fr.readline().split()
                # An exhausted (or blank) reference line contributes 0.0.
                vl = float(rcol[0]) if rcol else 0.0
                # Explicit floor division: same result as the Python 2 "/"
                # on ints, and still an integer index on Python 3.
                cr, pos = col[2], int(col[3]) // step
                wg.data[cr][pos] = vl
            # Reference lines without a counterpart in the input.
            n = sum(1 for _ in fr)
    if n > 0:
        print('Warning: the input genome size is smaller than the reference genome size by %s wiggle steps!' % (n,))
    wg.save(ofile, suppress=suppress)
    print('time cost: %s' % (time() - tm,))
Ejemplo n.º 6
0
    def samplingTotal(self,
                      region_file=None,
                      region_out_file=None,
                      exclude_low_percent=1,
                      exclude_high_percent=1,
                      bnum=100000,
                      nonzero=False):
        '''
        Description:
            Calculate, for every wig in self.data, the sum of its values after
            multiplying it with a region wig. The region wig is either loaded
            from region_file or derived from the samples themselves: every
            sample is scaled to the average total, the scaled samples are
            averaged into a reference wig, and the regions whose reference
            value lies between the low and high percentile cut-offs are kept.
        Parameter:
            region_file: path of a wig passed to Wig() and used as the region wig; None means the regions are derived from the data.
            region_out_file: when given (and region_file is None), the derived region wig is saved to this path.
            exclude_low_percent: lower percentile cut-off handed to percentile().
            exclude_high_percent: the upper cut-off is 100 - exclude_high_percent.
            bnum: forwarded to percentile().
            nonzero: forwarded to percentile() as nonzero_end.
        Value:
            A dictionary mapping each wig name to its total, or None when both
            percentages are 0 and no region_file is given.
        '''
        # list() keeps names indexable/sliceable when keys() returns a view.
        names = list(self.data.keys())
        if (exclude_low_percent == 0 and exclude_high_percent == 0
                and region_file is None):
            return None
        sampling_total = {}
        if region_file is None:
            print('calculate total signal in each sample after excluding the top %s '
                  'and bottom %s percents of genomic regions with extremely high '
                  'and low signal values'
                  % (exclude_high_percent, exclude_low_percent))
            wsums = {}
            for name in names:
                wsums[name] = self.data[name].sum()
            # Force true division so the average is not truncated when the
            # per-sample sums are integers.
            wavg = sum(wsums.values()) * 1.0 / len(wsums)

            # Reference wig = mean of all samples after scaling each to wavg.
            rfwig = deepcopy(self.data[names[0]])
            rfwig.foldChange(wavg * 1.0 / wsums[names[0]])
            for name in names[1:]:
                # Scale, accumulate, then undo the scaling on the sample
                # (NOTE(review): presumably foldChange works in place).
                self.data[name].foldChange(wavg * 1.0 / wsums[name])
                rfwig.add(self.data[name])
                self.data[name].foldChange(wsums[name] * 1.0 / wavg)

            rfwig.foldChange(1.0 / len(names))
            lowcut, highcut = rfwig.percentile(
                p=[exclude_low_percent, 100 - exclude_high_percent],
                bnum=bnum,
                nonzero_end=nonzero)
            rg = rfwig.regionWithinValueRange(lowcut, highcut)
            if region_out_file is not None:
                rg.save(region_out_file)
        else:
            print('calculate total signal in each sample in genomic regions defined by %s'
                  % (region_file,))
            rg = Wig(region_file)
        for name in names:
            sampling_total[name] = self.data[name].multiply(rg).sum()
        covered, genome = rg.sum(), rg.gsize()
        print('%s (%s%%) of %s base pairs calculated:'
              % (covered, covered * 100.0 / genome, genome))
        for name in names:
            share = sampling_total[name] * 100.0 / self.data[name].sum()
            print('%s %s (%s%% of total)' % (name, sampling_total[name], share))

        return sampling_total
Ejemplo n.º 7
0
def rawsort(ifile,sort_ofile,gfile,format,step=10,suppress=False,buffer=None):
    """Sort a wiq file with the external ``sort`` program.

    When ``format`` is ``'wig'`` the input is first loaded as a ``Wig``,
    resized against ``gfile`` and saved in "wiq" format to an intermediate
    file (``sort_ofile`` minus its last three characters plus ``'raw.wiq'``);
    that intermediate file is deleted after sorting.  Otherwise ``ifile`` is
    sorted directly.  ``sort`` runs with ``-r -n -s -k1 -k2`` and writes to
    ``sort_ofile``, using a scratch directory derived from ``ifile`` that is
    removed afterwards.

    The command is passed to ``sort`` as an argument list instead of a shell
    string, so paths containing spaces or shell metacharacters are handled
    literally, and the scratch directory is removed even if ``sort`` cannot
    be started.

    Args:
        ifile: path of the input file.
        sort_ofile: path of the sorted output file.
        gfile: forwarded to ``Wig`` and ``Wig.ajust_size`` (``'wig'`` only).
        format: ``'wig'`` triggers the conversion step; any other value
            means ``ifile`` is sorted as it is.
        step: forwarded to ``Wig`` / ``Wig.save``.
        suppress: forwarded to ``Wig`` / ``Wig.save``.
        buffer: optional value for ``sort --buffer-size``.

    Returns:
        None.
    """
    import shutil
    import subprocess

    tm = time()
    if format == 'wig':
        print('\nconverting', ifile, '...')
        raw_ofile = sort_ofile[:-3] + 'raw.wiq'
        wg = Wig(file=ifile, gfile=gfile, step=step, suppress=suppress)
        wg.ajust_size(gfile=gfile)
        wg.save(file=raw_ofile, format="wiq", step=step, suppress=suppress)
        print('time cost:', time() - tm)
        tm = time()
    else:
        raw_ofile = ifile

    print('\nsorting', raw_ofile, '...')
    # Pick a scratch directory name that does not exist yet.
    temp = ifile[:-3] + 'temp'
    while os.path.isdir(temp):
        temp = temp + '.temp'
    os.mkdir(temp)

    cmd = ['sort', '-r', '-n', '-s', '-k1', '-k2', '-o', sort_ofile]
    if buffer is not None:
        cmd += ['--buffer-size', str(buffer)]
    cmd += ['--temporary-directory', temp, raw_ofile]
    try:
        status = subprocess.call(cmd)
        if status != 0:
            print('Warning: sort exited with status', status)
    finally:
        # Clean up even when sort could not be started.
        if format == 'wig':
            print('Removing ', raw_ofile, '...')
            try:
                os.remove(raw_ofile)
            except OSError as err:
                print('Warning: could not remove', raw_ofile, err)
        print('removing', temp)
        shutil.rmtree(temp, ignore_errors=True)
    print('time cost:', time() - tm)
Ejemplo n.º 8
0
    def samplingTotal(self,region_file=None,region_out_file=None,exclude_low_percent=1,exclude_high_percent=1,bnum=100000,nonzero=False):
        '''
        Description:
            Calculate, for every wig in self.data, the sum of its values after
            multiplying it with a region wig. The region wig is either loaded
            from region_file or derived from the samples themselves: every
            sample is scaled to the average total, the scaled samples are
            averaged into a reference wig, and the regions whose reference
            value lies between the low and high percentile cut-offs are kept.
        Parameter:
            region_file: path of a wig passed to Wig() and used as the region wig; None means the regions are derived from the data.
            region_out_file: when given (and region_file is None), the derived region wig is saved to this path.
            exclude_low_percent: lower percentile cut-off handed to percentile().
            exclude_high_percent: the upper cut-off is 100 - exclude_high_percent.
            bnum: forwarded to percentile().
            nonzero: forwarded to percentile() as nonzero_end.
        Value:
            A dictionary mapping each wig name to its total, or None when both
            percentages are 0 and no region_file is given.
        '''
        names=list(self.data.keys())
        if exclude_low_percent==0 and exclude_high_percent==0 and region_file is None: return None
        sampling_total={}
        if region_file is None:
            # A space before 'percents' keeps the number and the word apart.
            sys.stdout.write('calculate total signal in each sample after excluding the top ' +
                str(exclude_high_percent) + ' and bottom ' + str(exclude_low_percent) +
                ' percents of genomic regions with extremely high and low signal values\n')
            wsums={}
            for name in names:wsums[name]=self.data[name].sum()
            wavg=functions.div(sum(wsums.values()),len(wsums))

            # Reference wig = mean of all samples after scaling each to wavg.
            rfwig=deepcopy(self.data[names[0]])
            rfwig.foldChange(functions.div(wavg*1.0,wsums[names[0]]))
            for name in names[1:]:
                # Scale, accumulate, then undo the scaling on the sample
                # (NOTE(review): presumably foldChange works in place).
                self.data[name].foldChange(functions.div(wavg*1.0,wsums[name]))
                rfwig.add(self.data[name])
                self.data[name].foldChange(functions.div(wsums[name]*1.0,wavg))

            rfwig.foldChange(functions.div(1.0,len(names)))
            lowcut,highcut=rfwig.percentile(p=[exclude_low_percent,100-exclude_high_percent],bnum=bnum,nonzero_end=nonzero)
            rg=rfwig.regionWithinValueRange(lowcut,highcut)
            if region_out_file is not None:rg.save(region_out_file)
        else:
            # A space after 'by' keeps the file name from being glued to the text.
            sys.stdout.write('calculate total signal in each sample in genomic regions defined by ' + region_file + "\n")
            rg=Wig(region_file)
        for name in names:sampling_total[name]=self.data[name].multiply(rg).sum()
        sys.stdout.write(str(rg.sum()) + ' (' + str(functions.div(rg.sum()*100.0,rg.gsize())) +
            ' %) of ' + str(rg.gsize()) + ' base pairs calculated:\n')
        for name in names:
            # Separate the sample name from its total with a space.
            sys.stdout.write(name + ' ' + str(sampling_total[name]) + ' (' +
                str(functions.div(sampling_total[name]*100.0,self.data[name].sum())) + '% of total)\n')

        return sampling_total
Ejemplo n.º 9
0
    def samplingTotal(self,region_file=None,region_out_file=None,exclude_low_percent=1,exclude_high_percent=1,bnum=100000,nonzero=False):
        '''
        Description:
            Calculate, for every wig in self.data, the sum of its values after
            multiplying it with a region wig. The region wig is either loaded
            from region_file or derived from the samples themselves: every
            sample is scaled to the average total, the scaled samples are
            averaged into a reference wig, and the regions whose reference
            value lies between the low and high percentile cut-offs are kept.
        Parameter:
            region_file: path of a wig passed to Wig() and used as the region wig; None means the regions are derived from the data.
            region_out_file: when given (and region_file is None), the derived region wig is saved to this path.
            exclude_low_percent: lower percentile cut-off handed to percentile().
            exclude_high_percent: the upper cut-off is 100 - exclude_high_percent.
            bnum: forwarded to percentile().
            nonzero: forwarded to percentile() as nonzero_end.
        Value:
            A dictionary mapping each wig name to its total, or None when both
            percentages are 0 and no region_file is given.
        '''
        # list() keeps names indexable/sliceable when keys() returns a view.
        names=list(self.data.keys())
        if exclude_low_percent==0 and exclude_high_percent==0 and region_file is None: return None
        sampling_total={}
        if region_file is None:
            print('calculate total signal in each sample after excluding the top %s and bottom %s percents of genomic regions with extremely high and low signal values' % (exclude_high_percent,exclude_low_percent))
            wsums=dict((name,self.data[name].sum()) for name in names)
            # Force true division so the average is not truncated when the
            # per-sample sums are integers.
            wavg=sum(wsums.values())*1.0/len(wsums)

            # Reference wig = mean of all samples after scaling each to wavg.
            first=names[0]
            rfwig=deepcopy(self.data[first])
            rfwig.foldChange(wavg*1.0/wsums[first])
            for name in names[1:]:
                # Scale, accumulate, then undo the scaling on the sample
                # (NOTE(review): presumably foldChange works in place).
                self.data[name].foldChange(wavg*1.0/wsums[name])
                rfwig.add(self.data[name])
                self.data[name].foldChange(wsums[name]*1.0/wavg)

            rfwig.foldChange(1.0/len(names))
            lowcut,highcut=rfwig.percentile(p=[exclude_low_percent,100-exclude_high_percent],bnum=bnum,nonzero_end=nonzero)
            rg=rfwig.regionWithinValueRange(lowcut,highcut)
            if region_out_file is not None:rg.save(region_out_file)
        else:
            print('calculate total signal in each sample in genomic regions defined by %s' % (region_file,))
            rg=Wig(region_file)
        for name in names:sampling_total[name]=self.data[name].multiply(rg).sum()
        print('%s (%s%%) of %s base pairs calculated:' % (rg.sum(),rg.sum()*100.0/rg.gsize(),rg.gsize()))
        for name in names:
            print('%s %s (%s%% of total)' % (name,sampling_total[name],sampling_total[name]*100.0/self.data[name].sum()))

        return sampling_total