def test_merge_scores(self):
    """Check merge_scores with the 'mean' (default), 'geometric' and 'sum' methods."""
    # Default method: arithmetic mean of overlapping scores; names are joined with '|'.
    fwd = fstream([('chr', 10, 20, 6., 'A')],
                  fields=['chr', 'start', 'end', 'score', 'name'])
    rev = fstream([('chr', 5, 15, 2., 'B')],
                  fields=['chr', 'start', 'end', 'score', 'name'])
    observed = list(merge_scores([fwd, rev]))
    self.assertListEqual(observed,
                         [('chr', 5, 10, 1., 'B'),
                          ('chr', 10, 15, 4., 'A|B'),
                          ('chr', 15, 20, 3., 'A')])
    # Geometric mean on score-only streams.
    fwd = fstream([(10, 20, 6.)], fields=['start', 'end', 'score'])
    rev = fstream([(5, 15, 2.)], fields=['start', 'end', 'score'])
    observed = list(merge_scores([fwd, rev], method='geometric'))
    self.assertListEqual(observed,
                         [(5, 10, math.sqrt(2)),
                          (10, 15, math.sqrt(12)),
                          (15, 20, math.sqrt(6))])
    # Sum of the scores.
    fwd = fstream([(10, 20, 6.)], fields=['start', 'end', 'score'])
    rev = fstream([(5, 15, 2.)], fields=['start', 'end', 'score'])
    observed = list(merge_scores([fwd, rev], method='sum'))
    self.assertListEqual(observed,
                         [(5, 10, 2.), (10, 15, 8.), (15, 20, 6.)])
def merge(*args,**kw): if not(kw['forward'] and os.path.exists(kw['forward'])): raise Usage("Specify a valid forward strand density file with -f.") if not(kw['reverse'] and os.path.exists(kw['reverse'])): raise Usage("Specify a valid reverse strand density file with -r.") if not(kw['output']): raise Usage("Specify the output file name.") def _shift(stream,shift): istart = stream.fields.index('start') iend = stream.fields.index('end') i1 = min(istart,iend) i2 = max(istart,iend) def _apply_shift(x): return x[:i1]+(x[i1]+shift,)+x[i1+1:i2]+(x[i2]+shift,)+x[i2+1:] return track.FeatureStream((_apply_shift(x) for x in stream), fields=stream.fields) fields = ['chr','start','end','score'] chrmeta = _get_chrmeta(**kw) tfwd = track.track(kw['forward'],format=kw['formatf'],chrmeta=chrmeta) trev = track.track(kw['reverse'],format=kw['formatr'],chrmeta=chrmeta) if tfwd.chrmeta: chrmeta = tfwd.chrmeta elif trev.chrmeta: chrmeta = trev.chrmeta else: raise Usage("Specify an assembly with -a.") shiftval = int(kw['shift']) if shiftval < 0: slim = 300 chrsize,chrom = sorted([(v['length'],k) for k,v in chrmeta.iteritems()],reverse=True)[0] xcor = correlation([tfwd.read(chrom),trev.read(chrom)], (1,chrsize),limits=(-slim,slim)) shiftval = (xcor.argmax()-slim-1)/2 print "Autocorrelation shift=%i, correlation is %f." %(shiftval,xcor.max()) tout = track.track(kw['output'],fields=fields, chrmeta=chrmeta,info={'datatype':'quantitative'}) mode = 'write' method = kw.get("method","mean") for chrom in chrmeta.keys(): tout.write(merge_scores([_shift(tfwd.read(chrom), shiftval), _shift(trev.read(chrom),-shiftval)], method=method), chrom=chrom,mode=mode,clip=True) mode = 'append' tout.close() trev.close() tfwd.close() return 0
def __call__(self, **kw):
    """Write a track of score ratios numerator/denominator, chromosome by
    chromosome, optionally window-smoothed and with an optional sampled
    distribution boxplot.

    Fixes vs. previous version:
    - the output format is taken from the 'format' option (it was read from
      the unrelated 'output' key, matching the sibling implementation);
    - bare ``except:`` narrowed to ``(TypeError, ValueError)`` so only
      missing/non-numeric options fall back to defaults.
    """
    assembly = kw.get('assembly') or 'guess'
    t1 = track(kw['numerator'], chrmeta=assembly)
    t2 = track(kw['denominator'], chrmeta=assembly)
    format = kw.get('format') or t1.format
    wsize = int(kw.get('window_size') or size_def)
    self.log = kw.get('log', False)
    if isinstance(self.log, basestring):
        self.log = (self.log.lower() in ['1', 'true', 't', 'on'])
    try:
        self.pseudo = float(kw.get('pseudo'))
    except (TypeError, ValueError):  # option absent or not a number
        self.pseudo = pseudo_def
    self.baseline = -log(self.pseudo, 2)
    try:
        self.threshold = float(kw.get('threshold'))
    except (TypeError, ValueError):
        self.threshold = threshold_def
    distribution = kw.get('distribution', False)
    if isinstance(distribution, basestring):
        distribution = (distribution.lower() in ['1', 'true', 't', 'on'])
    if distribution:
        genome_length = sum((v['length'] for v in t1.chrmeta.values()))
        self.shifts = list(poisson(float(genome_length) / float(self.sample_num),
                                   self.sample_num))
        self.ratios = []
    output = self.temporary_path(fname='ratios_%s-%s.%s' % (t1.name, t2.name, format))
    with track(output, chrmeta=t1.chrmeta, fields=t1.fields,
               info={'datatype': 'quantitative', 'log': self.log,
                     'pseudocounts': self.pseudo, 'threshold': self.threshold,
                     'window_size': wsize}) as tout:
        for chrom, vchr in t1.chrmeta.iteritems():
            if wsize > 1:
                s1 = window_smoothing(t1.read(chrom), window_size=wsize,
                                      step_size=1, featurewise=False)
                s2 = window_smoothing(t2.read(chrom), window_size=wsize,
                                      step_size=1, featurewise=False)
            else:
                s1 = t1.read(chrom)
                s2 = t2.read(chrom)
            s3 = merge_scores([s1, s2], method=self._divide)
            if distribution:
                s3 = FeatureStream(self._sample_stream(s3, vchr['length']),
                                   fields=s3.fields)
            tout.write(s3, chrom=chrom, clip=True)
    self.new_file(output, 'ratios')
    if distribution:
        pdf = self.temporary_path(fname='%s-%s_ratios_distribution.pdf'
                                  % (t1.name, t2.name))
        density_boxplot(self.ratios, output=pdf,
                        name="%s/%s (median=%.2f)"
                             % (t1.name, t2.name, median(self.ratios)))
        self.new_file(pdf, 'boxplot')
    return self.display_time()
def __call__(self, **kw):
    """Merge forward and reverse strand density tracks into one output track,
    shifting each strand by +/- `shift` bp (estimated by cross-correlation
    when the given shift is negative).

    Fix: the "shift not detected" check now tests ``shiftval is None``; the
    old ``if not shiftval`` wrongly raised when a shift of exactly 0 was
    legitimately detected (0 is falsy).
    """
    def _shift(stream, shift):
        # Return a copy of `stream` with 'start' and 'end' moved by `shift` bp.
        istart = stream.fields.index('start')
        iend = stream.fields.index('end')
        i1 = min(istart, iend)
        i2 = max(istart, iend)
        def _apply_shift(x):
            return (x[:i1] + (x[i1] + shift,)
                    + x[i1 + 1:i2] + (x[i2] + shift,) + x[i2 + 1:])
        return FeatureStream((_apply_shift(x) for x in stream), fields=stream.fields)

    assembly = kw.get('assembly') or 'guess'
    tfwd = track(kw.get('forward'), chrmeta=assembly)
    trev = track(kw.get('reverse'), chrmeta=assembly)
    chrmeta = tfwd.chrmeta
    shiftval = int(kw.get('shift', 0))
    if shiftval < 0:  # Determine shift automatically
        shiftval = None
        xcor_lim = 300
        for chrom, v in chrmeta.iteritems():
            chrsize = v['length']
            xcor_lim = min(xcor_lim, 0.01 * chrsize)
            xcor = correlation([tfwd.read(chrom), trev.read(chrom)],
                               regions=(1, chrsize),
                               limits=(-xcor_lim, xcor_lim))
            max_xcor_idx = xcor.argmax()
            if xcor[max_xcor_idx] > 0.2:  # accept only a confident correlation peak
                shiftval = (max_xcor_idx - xcor_lim - 1)/2
                break
        if shiftval is None:  # fixed: 0 is a valid detected shift
            raise ValueError("Unable to detect shift automatically. Must specify a shift value.")
    output = self.temporary_path(fname=tfwd.name + '-' + trev.name + '_merged',
                                 ext=kw.get('format', tfwd.format))
    # Only fields common to both input tracks can be merged.
    outfields = [f for f in tfwd.fields if f in trev.fields]
    tout = track(output, chrmeta=chrmeta, fields=outfields,
                 info={'datatype': 'quantitative', 'shift': shiftval})
    mode = 'write'
    method = kw.get("method", "mean")
    for chrom in chrmeta.keys():
        tout.write(merge_scores([_shift(tfwd.read(selection=chrom), shiftval),
                                 _shift(trev.read(selection=chrom), -shiftval)],
                                method=method),
                   chrom=chrom, mode=mode, clip=True)
        mode = 'append'
    tout.close()
    trev.close()
    tfwd.close()
    self.new_file(output, 'density_merged')
    return self.display_time()
def test_merge_scores(self):
    """merge_scores: arithmetic mean (default), geometric mean, and sum."""
    # Arithmetic mean, with chromosome and name fields carried along;
    # names of overlapping features are joined with '|'.
    fields5 = ['chr', 'start', 'end', 'score', 'name']
    sa = fstream([('chr', 10, 20, 6., 'A')], fields=fields5)
    sb = fstream([('chr', 5, 15, 2., 'B')], fields=fields5)
    self.assertListEqual(list(merge_scores([sa, sb])),
                         [('chr', 5, 10, 1., 'B'),
                          ('chr', 10, 15, 4., 'A|B'),
                          ('chr', 15, 20, 3., 'A')])
    # Geometric mean on minimal (start, end, score) streams.
    fields3 = ['start', 'end', 'score']
    sa = fstream([(10, 20, 6.)], fields=fields3)
    sb = fstream([(5, 15, 2.)], fields=fields3)
    self.assertListEqual(list(merge_scores([sa, sb], method='geometric')),
                         [(5, 10, math.sqrt(2)),
                          (10, 15, math.sqrt(12)),
                          (15, 20, math.sqrt(6))])
    # Plain sum of scores.
    sa = fstream([(10, 20, 6.)], fields=fields3)
    sb = fstream([(5, 15, 2.)], fields=fields3)
    self.assertListEqual(list(merge_scores([sa, sb], method='sum')),
                         [(5, 10, 2.), (10, 15, 8.), (15, 20, 6.)])
def __call__(self, **kw):
    """Merge forward and reverse strand density tracks into one output track,
    shifting each strand by +/- `shift` bp (estimated by cross-correlation
    when the given shift is negative).

    Fix: the "shift not detected" check now tests ``shiftval is None``; the
    old ``if not shiftval`` wrongly raised when a shift of exactly 0 was
    legitimately detected (0 is falsy).
    """
    def _shift(stream, shift):
        # Return a copy of `stream` with 'start' and 'end' moved by `shift` bp.
        istart = stream.fields.index('start')
        iend = stream.fields.index('end')
        i1 = min(istart, iend)
        i2 = max(istart, iend)
        def _apply_shift(x):
            return (x[:i1] + (x[i1] + shift,)
                    + x[i1 + 1:i2] + (x[i2] + shift,) + x[i2 + 1:])
        return FeatureStream((_apply_shift(x) for x in stream), fields=stream.fields)

    assembly = kw.get('assembly') or 'guess'
    tfwd = track(kw.get('forward'), chrmeta=assembly)
    trev = track(kw.get('reverse'), chrmeta=assembly)
    chrmeta = tfwd.chrmeta
    shiftval = int(kw.get('shift', 0))
    if shiftval < 0:  # Determine shift automatically
        shiftval = None
        xcor_lim = 300
        for chrom, v in chrmeta.iteritems():
            chrsize = v['length']
            xcor_lim = min(xcor_lim, 0.01 * chrsize)
            xcor = correlation([tfwd.read(chrom), trev.read(chrom)],
                               regions=(1, chrsize),
                               limits=(-xcor_lim, xcor_lim))
            max_xcor_idx = xcor.argmax()
            if xcor[max_xcor_idx] > 0.2:  # accept only a confident correlation peak
                shiftval = (max_xcor_idx - xcor_lim - 1)/2
                break
        if shiftval is None:  # fixed: 0 is a valid detected shift
            raise ValueError("Unable to detect shift automatically. Must specify a shift value.")
    output = self.temporary_path(fname=tfwd.name + '-' + trev.name + '_merged',
                                 ext=kw.get('format', tfwd.format))
    tout = track(output, chrmeta=chrmeta,
                 info={'datatype': 'quantitative', 'shift': shiftval})
    mode = 'write'
    method = kw.get("method", "mean")
    for chrom in chrmeta.keys():
        tout.write(merge_scores([_shift(tfwd.read(selection=chrom), shiftval),
                                 _shift(trev.read(selection=chrom), -shiftval)],
                                method=method),
                   chrom=chrom, mode=mode, clip=True)
        mode = 'append'
    tout.close()
    trev.close()
    tfwd.close()
    self.new_file(output, 'density_merged')
    return self.display_time()
def __call__(self, **kw):
    """Compute genome-wide read densities from BAM samples via bam_to_density,
    averaging several samples per strand when needed, then register the
    resulting track file(s) (one per strand, or a single merged one).
    """
    b2wargs = []
    control = None
    samples = kw.get('BamMulti', {}).get('sample', [])
    if not isinstance(samples, list):
        samples = [samples]
    # Keep only samples that exist on disk, as absolute paths.
    samples = [os.path.abspath(s) for s in samples if os.path.exists(s)]
    if kw.get('control'):
        control = kw['control']
        b2wargs = ["-c", str(control)]
        assert os.path.exists(str(control)), "Control file not found: '%s'." % control
        control = os.path.abspath(control)
    try:
        nreads = int(kw.get('normalization'))
    except (ValueError, TypeError):
        nreads = -1
    bamfiles = [track(s, format='bam') for s in samples]
    if nreads < 0:
        _nreads = [0] * len(samples)
        if control is not None:
            b2wargs += ["-r"]
    else:
        _nreads = [nreads for s in samples]
    try:
        merge_strands = int(kw.get('merge_strands'))
    except (ValueError, TypeError):
        merge_strands = -1
    try:
        read_extension = int(kw.get('read_extension'))
    except (ValueError, TypeError):
        read_extension = -1
    output = [self.temporary_path(fname=b.name + '_density_') for b in bamfiles]
    out_format = kw.get("format", "sql")
    with execution(None) as ex:
        files = [bam_to_density(ex, sample, output[n], nreads=_nreads[n],
                                merge=merge_strands,
                                read_extension=read_extension,
                                sql=True, args=b2wargs)
                 for n, sample in enumerate(samples)]
    info = {'datatype': 'quantitative', 'read_extension': read_extension}
    if merge_strands >= 0:
        suffixes = ["merged"]
        info['shift'] = merge_strands
    else:
        suffixes = ["fwd", "rev"]
    chrmeta = bamfiles[0].chrmeta
    for suf in suffixes:
        strand_files = [fil for group in files for fil in group
                        if fil.endswith(suf + ".sql")]
        if len(strand_files) > 1:
            # Several samples on this strand: average them into a new track.
            sqlfile = self.temporary_path(fname="Density_average_" + suf + ".sql")
            tsql = track(sqlfile, fields=['start', 'end', 'score'],
                         chrmeta=chrmeta, info={'datatype': 'quantitative'})
            insql = []
            for fil in strand_files:
                tr = track(fil, format='sql', chrmeta=chrmeta)
                tr.save()
                insql.append(tr)
            for c in tsql.chrmeta:
                tsql.write(merge_scores([tr.read(c) for tr in insql]), chrom=c)
        else:
            sqlfile = strand_files[0]
            tsql = track(sqlfile, format='sql', fields=['start', 'end', 'score'],
                         chrmeta=chrmeta, info=info)
            tsql.save()
        if out_format in [None, "sql"]:
            outname = sqlfile
        else:
            outname = os.path.splitext(sqlfile)[0] + "." + out_format
            convert(sqlfile, outname, mode="overwrite")
        self.new_file(outname, 'density_' + suf)
    return self.display_time()
def merge(*args, **kw): if not (kw['forward'] and os.path.exists(kw['forward'])): raise Usage("Specify a valid forward strand density file with -f.") if not (kw['reverse'] and os.path.exists(kw['reverse'])): raise Usage("Specify a valid reverse strand density file with -r.") if not (kw['output']): raise Usage("Specify the output file name.") def _shift(stream, shift): istart = stream.fields.index('start') iend = stream.fields.index('end') i1 = min(istart, iend) i2 = max(istart, iend) def _apply_shift(x): return x[:i1] + (x[i1] + shift, ) + x[i1 + 1:i2] + ( x[i2] + shift, ) + x[i2 + 1:] return track.FeatureStream((_apply_shift(x) for x in stream), fields=stream.fields) fields = ['chr', 'start', 'end', 'score'] chrmeta = _get_chrmeta(**kw) tfwd = track.track(kw['forward'], format=kw['formatf'], chrmeta=chrmeta) trev = track.track(kw['reverse'], format=kw['formatr'], chrmeta=chrmeta) if tfwd.chrmeta: chrmeta = tfwd.chrmeta elif trev.chrmeta: chrmeta = trev.chrmeta else: raise Usage("Specify an assembly with -a.") shiftval = int(kw['shift']) if shiftval < 0: slim = 300 chrsize, chrom = sorted([(v['length'], k) for k, v in chrmeta.iteritems()], reverse=True)[0] xcor = correlation( [tfwd.read(chrom), trev.read(chrom)], (1, chrsize), limits=(-slim, slim)) shiftval = (xcor.argmax() - slim - 1) / 2 print "Autocorrelation shift=%i, correlation is %f." % (shiftval, xcor.max()) tout = track.track(kw['output'], fields=fields, chrmeta=chrmeta, info={'datatype': 'quantitative'}) mode = 'write' method = kw.get("method", "mean") for chrom in chrmeta.keys(): tout.write(merge_scores([ _shift(tfwd.read(chrom), shiftval), _shift(trev.read(chrom), -shiftval) ], method=method), chrom=chrom, mode=mode, clip=True) mode = 'append' tout.close() trev.close() tfwd.close() return 0
def __call__(self, **kw):
    """Write a track of score ratios numerator/denominator, chromosome by
    chromosome, optionally window-smoothed and with an optional sampled
    distribution boxplot.

    Fix: the two bare ``except:`` clauses are narrowed to
    ``(TypeError, ValueError)`` — only a missing or non-numeric option should
    fall back to the default, not e.g. KeyboardInterrupt.
    """
    assembly = kw.get('assembly') or 'guess'
    t1 = track(kw['numerator'], chrmeta=assembly)
    t2 = track(kw['denominator'], chrmeta=assembly)
    format = kw.get('format') or t1.format
    wsize = int(kw.get('window_size') or size_def)
    self.log = kw.get('log', False)
    if isinstance(self.log, basestring):
        self.log = (self.log.lower() in ['1', 'true', 't', 'on'])
    try:
        self.pseudo = float(kw.get('pseudo'))
    except (TypeError, ValueError):  # option absent or not a number
        self.pseudo = pseudo_def
    self.baseline = -log(self.pseudo, 2)
    try:
        self.threshold = float(kw.get('threshold'))
    except (TypeError, ValueError):
        self.threshold = threshold_def
    distribution = kw.get('distribution', False)
    if isinstance(distribution, basestring):
        distribution = (distribution.lower() in ['1', 'true', 't', 'on'])
    if distribution:
        genome_length = sum((v['length'] for v in t1.chrmeta.values()))
        self.shifts = list(poisson(float(genome_length) / float(self.sample_num),
                                   self.sample_num))
        self.ratios = []
    output = self.temporary_path(fname='ratios_%s-%s.%s' % (t1.name, t2.name, format))
    with track(output, chrmeta=t1.chrmeta, fields=t1.fields,
               info={'datatype': 'quantitative', 'log': self.log,
                     'pseudocounts': self.pseudo, 'threshold': self.threshold,
                     'window_size': wsize}) as tout:
        for chrom, vchr in t1.chrmeta.iteritems():
            if wsize > 1:
                s1 = window_smoothing(t1.read(chrom), window_size=wsize,
                                      step_size=1, featurewise=False)
                s2 = window_smoothing(t2.read(chrom), window_size=wsize,
                                      step_size=1, featurewise=False)
            else:
                s1 = t1.read(chrom)
                s2 = t2.read(chrom)
            s3 = merge_scores([s1, s2], method=self._divide)
            if distribution:
                s3 = FeatureStream(self._sample_stream(s3, vchr['length']),
                                   fields=s3.fields)
            tout.write(s3, chrom=chrom, clip=True)
    self.new_file(output, 'ratios')
    if distribution:
        pdf = self.temporary_path(fname='%s-%s_ratios_distribution.pdf'
                                  % (t1.name, t2.name))
        density_boxplot(self.ratios, output=pdf,
                        name="%s/%s (median=%.2f)"
                             % (t1.name, t2.name, median(self.ratios)))
        self.new_file(pdf, 'boxplot')
    return self.display_time()
def __call__(self, **kw):
    """Run bam_to_density on each BAM sample, average the per-strand density
    tracks when there are several samples, and register the output file(s)
    ('fwd'/'rev' tracks, or a single 'merged' one).
    """
    b2wargs = []
    control = None
    samples = kw.get('BamMulti', {}).get('sample', [])
    if not isinstance(samples, list):
        samples = [samples]
    # Resolve to absolute paths, dropping samples that do not exist.
    samples = [os.path.abspath(s) for s in samples if os.path.exists(s)]
    if kw.get('control'):
        control = kw['control']
        b2wargs = ["-c", str(control)]
        assert os.path.exists(str(control)), "Control file not found: '%s'." % control
        control = os.path.abspath(control)
    try:
        nreads = int(kw.get('normalization'))
    except (ValueError, TypeError):
        nreads = -1
    bamfiles = [track(s, format='bam') for s in samples]
    if nreads < 0:
        _nreads = [0] * len(samples)
        if control is not None:
            b2wargs += ["-r"]
    else:
        _nreads = [nreads for s in samples]
    try:
        merge_strands = int(kw.get('merge_strands'))
    except (ValueError, TypeError):
        merge_strands = -1
    try:
        read_extension = int(kw.get('read_extension'))
    except (ValueError, TypeError):
        read_extension = -1
    output = [self.temporary_path(fname=b.name + '_density_') for b in bamfiles]
    out_fmt = kw.get("format", "sql")
    with execution(None) as ex:
        files = [bam_to_density(ex, bam, output[i], nreads=_nreads[i],
                                merge=merge_strands,
                                read_extension=read_extension,
                                sql=True, args=b2wargs)
                 for i, bam in enumerate(samples)]
    info = {'datatype': 'quantitative', 'read_extension': read_extension}
    if merge_strands >= 0:
        suffixes = ["merged"]
        info['shift'] = merge_strands
    else:
        suffixes = ["fwd", "rev"]
    chrmeta = bamfiles[0].chrmeta
    for suf in suffixes:
        all_s_files = [name for produced in files for name in produced
                       if name.endswith(suf + ".sql")]
        if len(all_s_files) > 1:
            # More than one sample: average their densities into a new track.
            sqlout = self.temporary_path(fname="Density_average_" + suf + ".sql")
            tsql = track(sqlout, fields=['start', 'end', 'score'],
                         chrmeta=chrmeta, info={'datatype': 'quantitative'})
            insql = []
            for fname in all_s_files:
                intrack = track(fname, format='sql', chrmeta=chrmeta)
                intrack.save()
                insql.append(intrack)
            for c in tsql.chrmeta:
                tsql.write(merge_scores([intrack.read(c) for intrack in insql]),
                           chrom=c)
        else:
            sqlout = all_s_files[0]
            tsql = track(sqlout, format='sql', fields=['start', 'end', 'score'],
                         chrmeta=chrmeta, info=info)
            tsql.save()
        if out_fmt in [None, "sql"]:
            outname = sqlout
        else:
            outname = os.path.splitext(sqlout)[0] + "." + out_fmt
            convert(sqlout, outname, mode="overwrite")
        self.new_file(outname, 'density_' + suf)
    return self.display_time()