def __call__(self, **kw):
    """Smooth the input track with a sliding window and register the output.

    Options read from ``kw``: ``track`` (input file), ``assembly``,
    ``window_size`` (default ``size_def``), ``window_step`` (default
    ``step_def``) and ``by_feature`` (boolean, or one of the strings
    '1', 'true', 't' in any case).

    The result is written chromosome by chromosome to a temporary sql track
    registered as 'smoothed_track'. Returns 1.
    """
    source = track.track(kw.get('track'), chrmeta=kw.get('assembly') or None)
    window = int(kw.get('window_size', size_def))
    step = int(kw.get('window_step', step_def))

    # The flag may come in as a string (e.g. from a web form).
    by_feature = kw.get('by_feature', False)
    if isinstance(by_feature, basestring):
        by_feature = by_feature.lower() in ['1', 'true', 't']

    output = self.temporary_path(fname='smoothed_track', ext='sql')
    if by_feature:
        out_fields, datatype = source.fields, "qualitative"
    else:
        out_fields, datatype = ["start", "end", "score"], "quantitative"

    target = track.track(output, fields=out_fields, chrmeta=source.chrmeta,
                         info={'datatype': datatype})
    for chrom in target.chrmeta.keys():
        smoothed = gm_stream.window_smoothing(
            source.read(selection=chrom, fields=out_fields),
            window_size=window, step_size=step, featurewise=by_feature)
        target.write(smoothed, chrom=chrom)
    target.close()
    self.new_file(output, 'smoothed_track')
    return 1
def __call__(self, **kw):
    """Combine the signals together with a track spanning every chromosome.

    A temporary track holding one feature per chromosome (from 0 to the
    chromosome length, all other fields set to '0') is written in the
    format of the first signal and prepended to ``kw['signals']``. The
    list is then handed to ``_combine`` with ``self._func``, and the
    result is registered as 'combined'.

    Returns the value of ``self.display_time()``.
    """
    chrmeta = _get_chrmeta(**kw)
    first = track(kw['signals'][0])
    fields = first.fields
    extension = first.format
    has_chr = 'chr' in fields

    coord_fields = ('chr', 'start', 'end') if has_chr else ('start', 'end')
    n_extra = len([f for f in fields if f not in coord_fields])
    padding = ('0',) * n_extra

    # One feature per chromosome, covering it entirely.
    if has_chr:
        rows = [(name, 0, chrmeta[name]['length']) + padding for name in chrmeta]
    else:
        # Make sure 'start' and 'end' come first, matching the tuples built below.
        fields = ['start', 'end'] + [f for f in fields if f not in ['start', 'end']]
        rows = [(0, chrmeta[name]['length']) + padding for name in chrmeta]
    whole_chr = FeatureStream(rows, fields=fields)

    temp = self.temporary_path() + '.' + extension
    with track(temp, fields=fields) as wc:
        wc.write(whole_chr)

    kw['signals'] = [temp] + kw['signals']
    output = _combine(self._func, self.temporary_path(fname='combined.'), **kw)
    self.new_file(output, 'combined')
    return self.display_time()
def __call__(self, **kw):
    """Compute read densities from a bam file and register the sql outputs.

    Options read from ``kw``: ``sample`` (bam file), ``control``,
    ``normalization``, ``merge_strands`` and ``read_extension``. Missing
    or empty numeric options default to -1.

    Each file returned by ``bam_to_density`` is registered as
    'density_merged' when ``merge_strands >= 0``, otherwise as
    'density_fwd' and 'density_rev'. Returns 1.
    """
    control = kw['control'] if kw.get('control') else None
    extra_args = [] if control is None else ["-c", str(control)]

    bamfile = track(kw['sample'], format='bam')
    nreads = int(kw.get('normalization') or -1)
    if nreads < 0 and control is None:
        # No normalization given: count the distinct values of column 4 of the reads.
        nreads = len(set(read[4] for read in bamfile.read()))
    elif nreads < 0:
        extra_args.append("-r")

    merge_strands = int(kw.get('merge_strands') or -1)
    suffixes = ["merged"] if merge_strands >= 0 else ["fwd", "rev"]
    read_extension = int(kw.get('read_extension') or -1)

    output = self.temporary_path(fname='density_')
    with execution(None) as ex:
        files = bam_to_density(ex, kw['sample'], output,
                               nreads=nreads, merge=merge_strands,
                               read_extension=read_extension,
                               sql=True, args=extra_args)
    for idx, path_out in enumerate(files):
        density = track(path_out, format='sql', fields=['start', 'end', 'score'],
                        chrmeta=bamfile.chrmeta, info={'datatype': 'quantitative'})
        density.save()
        self.new_file(path_out, 'density_' + suffixes[idx])
    return 1
def __call__(self, **kw):
    """Merge a forward and a reverse density track after shifting them.

    Options read from ``kw``: ``forward`` and ``reverse`` (track files),
    ``assembly`` and ``shift``. The forward track is shifted by ``shift``
    and the reverse track by ``-shift`` before being merged with
    ``merge_scores``. A negative ``shift`` asks for automatic detection
    from the cross-correlation of both strands.

    The merged track is registered as 'density_merged'. Returns 1.

    Raises:
        ValueError: if no chromosome information is available, or if the
            shift cannot be detected automatically.
    """
    def _shift(stream, shift):
        """Return a stream whose 'start' and 'end' are translated by `shift`."""
        istart = stream.fields.index('start')
        iend = stream.fields.index('end')
        i1 = min(istart, iend)
        i2 = max(istart, iend)

        def _apply_shift(x):
            return x[:i1] + (x[i1] + shift,) + x[i1 + 1:i2] + (x[i2] + shift,) + x[i2 + 1:]
        return track.FeatureStream((_apply_shift(x) for x in stream),
                                   fields=stream.fields)

    tfwd = track.track(kw.get('forward'), chrmeta=kw.get('assembly') or None)
    trev = track.track(kw.get('reverse'), chrmeta=kw.get('assembly') or None)
    if not kw.get('assembly'):
        # btrack does the job, take the max of both chromosome lengths
        chrmeta = tfwd.chrmeta
        for k, v in trev.chrmeta.iteritems():
            chrmeta.setdefault(k, {})['length'] = max(v['length'],
                                                      chrmeta.get(k, {}).get('length', 0))
    elif tfwd.chrmeta:
        # For sql files, btrack doesn't make it,
        # so one can contain the info while the second does not.
        chrmeta = tfwd.chrmeta
    elif trev.chrmeta:
        chrmeta = trev.chrmeta
    else:
        # In case nothing works - should not happen
        raise ValueError("Must specify an assembly.")

    shiftval = int(kw.get('shift', 0))
    if shiftval < 0:
        # Determine shift automatically: use the first chromosome whose
        # strand cross-correlation peaks above 0.2.
        shiftval = None
        xcor_lim = 300
        for chrom, v in chrmeta.iteritems():
            chrsize = v['length']
            xcor_lim = min(xcor_lim, 0.01 * chrsize)
            xcor = correlation([tfwd.read(chrom), trev.read(chrom)],
                               regions=(1, chrsize), limits=(-xcor_lim, xcor_lim))
            max_xcor_idx = xcor.argmax()
            if xcor[max_xcor_idx] > 0.2:
                shiftval = (max_xcor_idx - xcor_lim - 1) / 2
                break
        if not shiftval:
            raise ValueError("Unable to detect shift automatically. Must specify a shift value.")

    output = self.temporary_path(fname='density_merged', ext='sql')
    fields = ['chr', 'start', 'end', 'score']
    tout = track.track(output, format='sql', fields=fields, chrmeta=chrmeta,
                       info={'datatype': 'quantitative'})
    mode = 'write'
    for chrom in chrmeta.keys():
        # `shiftval` is a single number shared by all chromosomes; it must
        # not be indexed by chromosome name.
        tout.write(merge_scores([_shift(tfwd.read(selection=chrom), shiftval),
                                 _shift(trev.read(selection=chrom), -shiftval)]),
                   chrom=chrom, mode=mode, clip=True)
        mode = 'append'
    tout.close()
    trev.close()
    tfwd.close()
    self.new_file(output, 'density_merged')
    return 1
def __call__(self, **kw):
    """Draw a pairs plot (correlations or densities) of several signals.

    Options read from ``kw``: ``feature_type`` (0: gene bodies,
    1: promoters, 2: exons, 3: custom track given by ``features``),
    ``assembly``, ``signals``, ``upstream``/``downstream`` (promoters only)
    and ``mode`` (0: correlation, 1: density).

    The plot is written to a pdf registered as 'plot_pairs'.
    Returns the value of ``self.display_time()``.

    Raises:
        ValueError: if an assembly is needed but missing, if the feature
            type or the mode is unknown, or if there is no data to plot.
    """
    feature_type = int(kw.get('feature_type') or 0)
    assembly_id = kw.get('assembly') or None
    chrmeta = "guess"
    if assembly_id:
        assembly = genrep.Assembly(assembly_id)
        chrmeta = assembly.chrmeta
        genes = assembly.gene_track
        exons = assembly.exon_track
    elif not(feature_type == 3):
        # Only the custom track (type 3) can work without an assembly:
        # genes, promoters and exons are all taken from the assembly.
        raise ValueError("Please specify an assembly")

    signals = kw.get('signals', [])
    if not isinstance(signals, list):
        signals = [signals]
    snames = [os.path.splitext(os.path.basename(sig))[0] for sig in signals]
    signals = [track(sig, chrmeta=chrmeta) for sig in signals]

    if feature_type == 0:  # bodies
        features = genes
    elif feature_type == 1:  # promoters
        prom_pars = {'before_start': int(kw.get('upstream') or prom_up_def),
                     'after_start': int(kw.get('downstream') or prom_down_def),
                     'on_strand': True}
        features = lambda c: neighborhood(genes(c), **prom_pars)
    elif feature_type == 2:  # exons
        features = exons
    elif feature_type == 3:  # custom track
        _t = track(kw.get('features'), chrmeta=chrmeta)
        chrmeta = _t.chrmeta
        features = _t.read
    else:
        raise ValueError("Feature type not known: %i" % feature_type)

    pdf = self.temporary_path(fname='plot_pairs.pdf')
    narr = None
    if int(kw['mode']) == 0:  # correl
        xarr = array(range(-cormax, cormax + 1))
        srtdchrom = sorted(chrmeta.keys())
        # Flatten the features of all chromosomes into (chr, start, end) regions.
        regions = [x[:3] for chrom in srtdchrom
                   for x in sorted_stream(features(chrom))]
        _f = ['chr', 'start', 'end', 'score']
        narr = correlation([s.read(fields=_f) for s in signals],
                           regions, (-cormax, cormax), True)
    elif int(kw['mode']) == 1:  # density
        xarr = None
        for chrom in chrmeta:
            feat = features(chrom)
            means = score_by_feature([s.read(chrom) for s in signals], feat)
            # Keep only the fields added after the feature's own fields.
            mf = means.fields[len(feat.fields):]
            _n, _l = score_array(means, mf)
            if _n.size == 0:
                continue
            if narr is None:
                narr = _n
            else:
                narr = vstack((narr, _n))
    else:
        raise ValueError("Mode not implemented: %s" % kw['mode'])
    if narr is None:
        raise ValueError("No data")
    pairs(narr, xarr, labels=snames, output=pdf)
    self.new_file(pdf, 'plot_pairs')
    return self.display_time()
def __call__(self, **kw):
    """Produce an MA-plot from a scores table or from two groups of signals.

    With ``input_type == 'Table'``, ``kw['table']`` is a text file whose
    header line names one label column followed by the score columns.
    Either there are exactly two score columns, or the score column names
    have the form ``<group_name>.<run_id>`` with exactly two distinct
    group names.

    Otherwise a table is built with ``QuantifyTablePlugin`` from the tracks
    given in ``signals1`` and ``signals2``.

    The plot is registered as 'MA-plot'.
    Returns the value of ``self.display_time()``.

    Raises:
        ValueError: if the table columns cannot be split into two groups.
    """
    if kw.get('input_type') == 'Table':
        table = kw.get('table')
        assert os.path.exists(str(table)), "File not found: '%s'" % table
        with open(table) as t:
            colnames = t.readline()
        _f = colnames.strip().split()
        nscores = len(_f) - 1
        # Distinct prefixes of the score columns, in order of first appearance.
        # The first column holds the row labels and is left out.
        groups = []
        for x in _f[1:]:
            prefix = x.split('.')[0]
            if prefix not in groups:
                groups.append(prefix)
        if nscores == 2:
            # 3 columns, cols 2 and 3 contain the scores
            sample1 = [2]
            sample2 = [3]
        elif len(groups) == 2:
            # more columns, look if there are two groups of prefixes
            # NOTE(review): these are 0-based positions in the header, as in
            # the original `_f.index(x)`, whereas the 3-column case above
            # uses [2] and [3] - confirm which convention MAplot expects.
            sample1 = [i for i, x in enumerate(_f[1:], 1)
                       if x.split('.')[0] == groups[0]]
            sample2 = [i for i, x in enumerate(_f[1:], 1)
                       if x.split('.')[0] == groups[1]]
        else:
            # not implemented yet, ask the user to choose the columns he wants? Checkboxes...
            raise ValueError("For the moment, either have only 2 columns of scores, "
                             "or use names of the form <group_name>.<run_id>")
    else:
        # Use QuantifyTablePlugin to build a table from score tracks
        from QuantifyTable import QuantifyTablePlugin
        # Set QuantifyTablePlugin options
        kw['score_op'] = 'sum'
        kw['format'] = 'txt'
        signals1 = kw.get('signals1', [])
        signals2 = kw.get('signals2', [])
        if not isinstance(signals1, (list, tuple)):
            signals1 = [signals1]
        if not isinstance(signals2, (list, tuple)):
            signals2 = [signals2]
        kw['signals'] = signals1 + signals2
        signals = kw['signals']
        nscores = len(signals)
        qtable = QuantifyTablePlugin().quantify(**kw)
        # Remove useless fields and add header based on file names
        qtable = track(qtable, format='txt',
                       fields=['chr', 'start', 'end', 'name']
                              + ['score' + str(i) for i in range(nscores)])
        table = self.temporary_path('scores_table.txt')
        _f = ['score' + str(i) for i in range(nscores)]
        strack = track(table, fields=['name'] + _f)
        signal_tracks = [track(s) for s in signals]
        signames = [s.info.get('name', os.path.splitext(os.path.basename(s.path))[0])
                    for s in signal_tracks]
        # One header cell per signal, so the row matches the track's fields
        # whatever the number of signals.
        strack.write([tuple(['Name'] + signames)])
        strack.write(qtable.read(fields=strack.fields))
        # NOTE(review): both ranges start at 0 and therefore overlap;
        # kept as in the original - confirm against MAplot's `cols` contract.
        sample1 = range(len(signals1))
        sample2 = range(nscores - len(signals1))
    output_filename = MAplot(table, cols={1: sample1, 2: sample2})
    output = self.temporary_path(fname='maplot.png')
    shutil.move(output_filename, output)
    self.new_file(output, 'MA-plot')
    return self.display_time()
def __call__(self, **kw):
    """Plot the signals over a set of features as heatmaps or line plots.

    Options read from ``kw``: ``features`` (track file), ``signal`` (list
    of track files) and ``mode`` (0: one heatmap per signal, 1: average
    line plot, 2: mosaic of per-feature line plots limited by ``max_pages``).

    The pdf is registered as 'plot_features'. Returns 1.

    Raises:
        ValueError: if there is no data or the mode is unknown.
    """
    features = track(kw.get('features'))
    signal = [track(sig, chrmeta=features.chrmeta) for sig in kw.get('signal', [])]

    # Accumulate labels and data chromosome by chromosome, skipping empty ones.
    labels, data = None, None
    for chrom in features.chrmeta:
        chr_labels, chr_data = feature_matrix([s.read(chrom) for s in signal],
                                              features.read(chrom), segment=True)
        if chr_data.size == 0:
            continue
        if data is None:
            labels, data = chr_labels, chr_data
        else:
            labels = concatenate((labels, chr_labels))
            data = vstack((data, chr_data))

    pdf = self.temporary_path(fname='plot_features', ext='.pdf')
    if data is None:
        raise ValueError("No data")

    kw['mode'] = int(kw.get('mode', 0))
    mode = kw['mode']
    nsignals = data.shape[-1]
    if mode == 0:
        # One heatmap per signal; only the last call closes the pdf.
        new = True
        for n in range(nsignals - 1):
            heatmap(data[:, :, n], output=pdf, new=new, last=False,
                    rows=labels, orderRows=True, orderCols=False)
            new = False
        heatmap(data[:, :, -1], output=pdf, new=new, last=True,
                rows=labels, orderRows=True, orderCols=False)
    elif mode == 1:
        X = range(data.shape[1])
        Y = data.mean(axis=0)
        lineplot(X, [Y[:, n] for n in range(nsignals)],
                 output=pdf, new=True, last=True)
    elif mode == 2:
        X = range(data.shape[1])
        new = True
        mfrow = [4, 3]
        nplot = min(data.shape[0], max_pages * mfrow[0] * mfrow[1])
        for reg in range(nplot - 1):
            lineplot(X, [data[reg, :, n] for n in range(nsignals)],
                     output=pdf, new=new, last=False, mfrow=mfrow)
            # The layout is only passed with the first plot.
            new = False
            mfrow = []
        lineplot(X, [data[nplot - 1, :, n] for n in range(nsignals)],
                 output=pdf, new=new, last=True)
    else:
        raise ValueError("Mode not implemented: %s" % kw['mode'])
    self.new_file(pdf, 'plot_features')
    return 1
def _combine(func, output, **kw):
    """Combine several signal tracks with `func` and write the result.

    `output` is a path prefix to which the value of ``kw['format']``
    (default 'sql') is appended. ``kw['signals']`` is a track file or a
    list of them. For every chromosome the signals are read, passed to
    ``combine(..., fn=func)`` and written to the output track.

    Returns the complete output path.
    """
    chrmeta = _get_chrmeta(**kw)
    output += kw.get('format', 'sql')

    paths = kw.get('signals', [])
    if not isinstance(paths, list):
        paths = [paths]
    sources = [track(p, chrmeta=chrmeta) for p in paths]

    tout = track(output, chrmeta=chrmeta, info={'datatype': 'qualitative'})
    for chrom in chrmeta:
        combined = combine([src.read(chrom) for src in sources], fn=func)
        # The output fields are those of the combined stream.
        tout.fields = combined.fields
        tout.write(combined, chrom=chrom, clip=True)
    tout.close()
    return output
def test_subtract(self):
    """Run ``self.subtract`` on ``self.kw`` and check its first output file."""
    self.subtract(**self.kw)
    result_path = self.subtract.output_files[0][0]
    with track(result_path) as t:
        content = list(t.read(fields=self.fields))
    self.assertListEqual(content, [('chr1', 21, 24, 17.0)])
def test_complement(self):
    """Run ``self.complement`` on ``self.kw`` and check the chr1 features."""
    self.complement(**self.kw)
    result_path = self.complement.output_files[0][0]
    with track(result_path) as t:
        content = list(t.read('chr1', fields=self.fields))
    expected = [
        ('chr1', 0, 8, 0.0),
        ('chr1', 19, 21, 0.0),
        ('chr1', 39, 197195432, 0.0),
    ]
    self.assertListEqual(content, expected)
def test_intersect(self):
    """Run ``self.intersect`` on ``self.kw`` and check its first output file."""
    self.intersect(**self.kw)
    result_path = self.intersect.output_files[0][0]
    with track(result_path) as t:
        content = list(t.read(fields=self.fields))
    expected = [
        ('chr1', 10, 15, 17.0),
        ('chr1', 24, 35, 107.0),
    ]
    self.assertListEqual(content, expected)
def test_quantify_table_text(self):
    """Run the plugin with text output on two bedGraph signals; expect 9 rows."""
    options = {
        'input_type': 'Signal',
        'signals': [path+'KO50.bedGraph', path+'WT50.bedGraph'],
        'features': path+'features.bed',
        'feature_type': 3,
        'assembly': 'mm9',
        'format': 'txt',
    }
    self.plugin(**options)
    columns = ["chr", "start", "end", "name", "score0", "score1"]
    with track(self.plugin.output_files[0][0], fields=columns) as t:
        content = list(t.read())
    self.assertEqual(len(content), 9)
def test_union(self):
    """Run ``self.union`` on ``self.kw`` and check its first output file."""
    self.union(**self.kw)
    result_path = self.union.output_files[0][0]
    with track(result_path) as t:
        content = list(t.read(fields=self.fields))
    expected = [
        ('chr1', 8, 10, 12.0),
        ('chr1', 10, 15, 17.0),
        ('chr1', 15, 19, 12.0),
        ('chr1', 21, 24, 17.0),
        ('chr1', 24, 35, 107.0),
        ('chr1', 35, 39, 90.0),
    ]
    self.assertListEqual(content, expected)
def quantify(self, **kw):
    """Score every feature with each signal and write the table to a track.

    Options read from ``kw``: ``feature_type`` (0: genes, 1: promoters,
    2: exons, 3: custom track given by ``features``), ``score_op``
    (default 'mean'), ``assembly``, ``format`` (default 'sql'),
    ``signals`` and ``upstream``/``downstream`` (promoters only).

    Returns the path of the output track.

    Raises:
        ValueError: if an assembly is needed but missing, or if the feature
            type is unknown.
    """
    feature_type = int(kw.get('feature_type', 0))
    score_fn = str(kw.get('score_op', 'mean'))
    assembly_id = kw.get('assembly')
    out_format = kw.get('format', 'sql')

    chrmeta = "guess"
    if assembly_id:
        assembly = genrep.Assembly(assembly_id)
        chrmeta = assembly.chrmeta
        genes = assembly.gene_track
        exons = assembly.exon_track
    elif feature_type != 3:
        raise ValueError("Please specify an assembly")

    sig_paths = kw.get('signals', [])
    if not isinstance(sig_paths, list):
        sig_paths = [sig_paths]
    signals = [track(p, chrmeta=chrmeta) for p in sig_paths]

    # `features` is a callable returning the features of a given chromosome.
    if feature_type == 0:
        features = genes
    elif feature_type == 1:
        prom_pars = {'before_start': int(kw.get('upstream') or prom_up_def),
                     'after_start': int(kw.get('downstream') or prom_down_def),
                     'on_strand': True}
        features = lambda c: neighborhood(genes(c), **prom_pars)
    elif feature_type == 2:
        features = exons
    elif feature_type == 3:
        assert os.path.exists(str(kw.get('features'))), \
            "Features file not found: '%s'" % kw.get("features")
        custom = track(kw.get('features'), chrmeta=chrmeta)
        chrmeta = custom.chrmeta
        features = custom.read
    else:
        raise ValueError("Take feature_type in %s." % ftypes)

    output = self.temporary_path(fname='features_quantification.' + out_format)
    if len(signals) > 1:
        score_fields = ["score" + str(i) for i in range(len(signals))]
    else:
        score_fields = ["score"]
    tout = track(output, out_format,
                 fields=['chr', 'start', 'end', 'name'] + score_fields,
                 chrmeta=chrmeta, info={'datatype': 'qualitative'})
    for chrom in chrmeta:
        streams = [sig.read(chrom) for sig in signals]
        tout.write(score_by_feature(streams, features(chrom), fn=score_fn),
                   chrom=chrom, clip=True)
    tout.close()
    return output
def __call__(self, **kw):
    """Compute read densities from a bam file, in the requested format.

    Options read from ``kw``: ``sample`` (bam file, must exist),
    ``control`` (must exist when given), ``normalization``,
    ``merge_strands``, ``read_extension`` and ``format`` (default 'sql').

    ``bam_to_density`` produces sql files; each is converted when another
    format is requested, then registered as 'density_merged' when
    ``merge_strands >= 0``, otherwise as 'density_fwd' and 'density_rev'.
    Returns the value of ``self.display_time()``.
    """
    sample = kw.get("sample")
    assert os.path.exists(str(sample)), "Bam file not found: '%s'." % sample

    control = None
    extra_args = []
    if kw.get('control'):
        control = kw['control']
        extra_args = ["-c", str(control)]
        assert os.path.exists(str(control)), "Control file not found: '%s'." % control
        control = os.path.abspath(control)
    sample = os.path.abspath(sample)

    nreads = int(kw.get('normalization') or -1)
    bamfile = track(sample, format='bam')
    if nreads < 0 and control is None:
        # No normalization given: count the distinct values of column 4 of the reads.
        nreads = len(set(read[4] for read in bamfile.read()))
    elif nreads < 0:
        extra_args.append("-r")

    merge_strands = int(kw.get('merge_strands') or -1)
    read_extension = int(kw.get('read_extension') or -1)
    output = self.temporary_path(fname='density_')
    out_format = kw.get("format", "sql")

    with execution(None) as ex:
        files = bam_to_density(ex, sample, output,
                               nreads=nreads, merge=merge_strands,
                               read_extension=read_extension,
                               sql=True, args=extra_args)

    suffixes = ["merged"] if merge_strands >= 0 else ["fwd", "rev"]
    for idx, sql_path in enumerate(files):
        tsql = track(sql_path, format='sql', fields=['start', 'end', 'score'],
                     chrmeta=bamfile.chrmeta, info={'datatype': 'quantitative'})
        tsql.save()
        outname = sql_path
        if out_format != "sql":
            outname = os.path.splitext(sql_path)[0] + "." + out_format
            convert(sql_path, outname, mode="overwrite")
        self.new_file(outname, 'density_' + suffixes[idx])
    return self.display_time()
def __call__(self, **kw):
    """Annotate each feature of a track with its nearest gene.

    Options read from ``kw``: ``assembly``, ``track`` and the thresholds
    ``promoter``, ``intergenic`` and ``UTR`` (defaults ``prom_def``,
    ``inter_def`` and ``utr_def``).

    The output of ``gm_stream.getNearestFeature`` for every chromosome of
    the assembly is written to a text table registered as 'table'.
    Returns 1.
    """
    assembly = genrep.Assembly(kw.get('assembly') or None)
    source = track(kw.get('track'), chrmeta=assembly.chrmeta)
    th_promoter = int(kw.get("promoter", prom_def))
    th_intergenic = int(kw.get('intergenic', inter_def))
    th_utr = int(kw.get('UTR', utr_def))

    output = self.temporary_path(fname='Annotated_table.txt')
    columns = ['chr', 'start', 'end', 'name', 'strand',
               'gene', 'location_type', 'distance']
    tout = track(output, format='txt', fields=columns)

    # The first chromosome creates the file, the following ones append to it.
    mode = 'write'
    for chrom in assembly.chrnames:
        annotated = gm_stream.getNearestFeature(
            source.read(selection=chrom), assembly.gene_track(chrom),
            th_promoter, th_intergenic, th_utr)
        tout.write(annotated, mode=mode)
        mode = 'append'
    tout.close()
    self.new_file(output, 'table')
    return 1
def guess_vizualisations(fileinfo):
    """Extend ``fileinfo.vizualisations`` with the entries mapped to the file.

    For a non-sql file, the lookup key in ``mappings['viz']`` is the file
    extension. For an sql file, it is the lower-cased 'datatype' found in
    the info of the track at ``fileinfo.paths['upload_to']``.

    Returns `fileinfo`. Raises ``Exception`` when an sql file has no
    datatype or one that is not in the mapping.
    """
    debug('guess vizualisation', 3)
    if fileinfo.extension == 'sql':
        dt = btrack.track(fileinfo.paths['upload_to']).info['datatype']
        if dt is None or dt.lower() not in mappings['viz']:
            raise Exception('Cannot guess the vizualisation for file "%s".' % fileinfo.trackname)
        found = mappings['viz'][dt.lower()]
    else:
        found = mappings['viz'][fileinfo.extension]
    fileinfo.vizualisations.extend(found)
    debug(', '.join(fileinfo.vizualisations), 4)
    return fileinfo
def __call__(self, **kw):
    """Score every feature with each signal and register the sql table.

    Options read from ``kw``: ``feature_type`` (0: genes, 1: promoters,
    2: custom track given by ``features``), ``score_op`` (default 'mean'),
    ``assembly``, ``signals`` and ``upstream``/``downstream`` (promoters
    only).

    Returns 1 on success, 2 when the feature type is unknown.

    Raises:
        ValueError: if an assembly is needed but missing.
    """
    feature_type = int(kw.get('feature_type', 0))
    score_fn = str(kw.get('score_op', 'mean'))
    assembly_id = kw.get('assembly') or None

    chrmeta = "guess"
    if assembly_id:
        assembly = genrep.Assembly(assembly_id)
        chrmeta = assembly.chrmeta
        genes = assembly.gene_track
    elif feature_type != 2:
        raise ValueError("Please specify an assembly")

    signals = [track(sig, chrmeta=chrmeta) for sig in kw.get('signals', [])]

    # `features` is a callable returning the features of a given chromosome.
    if feature_type == 0:
        features = genes
    elif feature_type == 1:
        prom_pars = {'before_start': int(kw.get('upstream') or prom_up_def),
                     'after_start': int(kw.get('downstream') or prom_down_def),
                     'on_strand': True}
        features = lambda c: neighborhood(genes(c), **prom_pars)
    elif feature_type == 2:
        custom = track(kw.get('features'), chrmeta=chrmeta)
        chrmeta = custom.chrmeta
        features = custom.read
    else:
        return 2

    output = self.temporary_path(fname='features_quantification.sql')
    if len(signals) > 1:
        score_fields = ["score" + str(i) for i in range(len(signals))]
    else:
        score_fields = ["score"]
    tout = track(output, format='sql',
                 fields=['start', 'end', 'name'] + score_fields,
                 chrmeta=chrmeta, info={'datatype': 'qualitative'})
    for chrom in chrmeta:
        streams = [sig.read(chrom) for sig in signals]
        tout.write(score_by_feature(streams, features(chrom), fn=score_fn),
                   chrom=chrom, clip=True)
    tout.close()
    self.new_file(output, 'features_quantification')
    return 1
def test_smoothing(self):
    """Run the plugin on KO50.bedGraph (mm9) and expect 501 output rows."""
    options = {
        'track': path+'KO50.bedGraph',
        'assembly': 'mm9',
        'format': 'bedGraph',
    }
    self.plugin(**options)
    with track(self.plugin.output_files[0][0]) as t:
        nrows = len(list(t.read()))
    self.assertEqual(nrows, 501)
def __call__(self, **kw):
    """Run a DESeq differential analysis on a count table or on signals.

    With ``input_type == 'Table'``, ``kw['table']`` is a tab-delimited file
    with a header line and row names in the first column; it is loaded
    directly in R. Otherwise a table is computed with
    ``QuantifyTablePlugin`` from the tracks in ``signals1`` and
    ``signals2``, scaled by per-signal normalization factors and rounded
    to integers.

    One result file per pair of distinct column names is written and
    registered as 'differential_expression'. Unless ``kw['complete']`` is
    set, each file is first passed through ``self.clean_deseq_output``.
    Returns the value of ``self.display_time()``.
    """
    if kw.get('input_type') == 'Table':
        filename = kw.get('table')
        assert os.path.exists(str(filename)), "File not found: '%s'" % filename
        # Read the header through a context manager so that the file
        # handle is closed right away.
        with open(filename) as table_file:
            header = table_file.readline()
        colnames = numpy.asarray(header.split()[1:])
        robjects.r.assign('col_names', numpy2ri.numpy2ri(colnames))
        robjects.r("""
            Mdata <- read.table('%s',sep='\t',header=T,row.names=1)
            conds <- unlist(strsplit(col_names,".",fixed=T))
            conds <- colnames(Mdata)
        """ % filename)
    else:
        from QuantifyTable import QuantifyTablePlugin
        assembly = genrep.Assembly(kw.get('assembly'))
        chrmeta = assembly.chrmeta or "guess"
        kw['score_op'] = 'sum'
        signals1 = kw.get('signals1', [])
        signals2 = kw.get('signals2', [])
        if not isinstance(signals1, (list, tuple)):
            signals1 = [signals1]
        if not isinstance(signals2, (list, tuple)):
            signals2 = [signals2]
        kw['signals'] = signals1 + signals2
        signals = kw['signals']
        table = QuantifyTablePlugin().quantify(**kw)

        # One normalization factor per signal, taken from the track info.
        stracks = []
        norm_factors = []
        for sig in signals:
            assert os.path.exists(str(sig)), "Signal file not found: '%s'." % sig
            _t = track(sig, chrmeta=chrmeta)
            if 'normalization' in _t.info:
                print('normalized')
                _nf = float(_t.info['normalization'])
            elif 'nreads' in _t.info:
                print('nreads')
                _nf = float(_t.info['nreads']) * 1e-7 / float(_t.info.get('read_extension', 1))
            else:
                _nf = 1
            stracks.append(_t)
            norm_factors.append(_nf)

        t = track(table, chrmeta=chrmeta)
        _f = [f for f in t.fields if f.startswith('score')]
        de_list = list(t.read(fields=['name'] + _f))
        t.close()
        os.remove(table)
        # Turn all scores into integers
        de_matrix = numpy.asarray(
            [[int(float(s) * norm_factors[k] + .5) for k, s in enumerate(x[1:])]
             for x in de_list], dtype=numpy.float)
        rownames = numpy.asarray([x[0] for x in de_list])
        colnames = numpy.asarray(
            [s.info.get('name', os.path.splitext(os.path.basename(s.path))[0])
             for s in stracks])
        # if all prefixes are identical within a group, keep this prefix as group identifier.
        if len(list(set([x.split('.')[0] for x in colnames[:len(signals1)]]))) == 1 \
                and len(list(set([x.split('.')[0] for x in colnames[len(signals1):]]))) == 1:
            group1 = colnames[0].split('.')[0]
            group2 = colnames[-1].split('.')[0]
        else:
            group1 = "Group1"
            group2 = "Group2"
        conds = [group1] * len(signals1) + [group2] * len(signals2)
        robjects.r.assign('Mdata', numpy2ri.numpy2ri(de_matrix))
        robjects.r.assign('row_names', numpy2ri.numpy2ri(rownames))
        robjects.r.assign('col_names', numpy2ri.numpy2ri(colnames))
        robjects.r.assign('conds', numpy2ri.numpy2ri(conds))
        robjects.r("""
            Mdata <- as.data.frame(Mdata,row.names=row_names)
            conds <- unlist(col_names)
            colnames(Mdata) <- conds
        """)

    robjects.r("""
        ### Still need to check that replicates are not identical - lfproc would fail
        groups <- unique(conds)
        couples <- combn(groups,2)
        if (any(table(conds)>1)){ method = 'normal' # if replicates
        } else { method = 'blind' }
    """)
    robjects.r("""
        library(DESeq)
        cds <- newCountDataSet(Mdata, conds)
        cds <- estimateSizeFactors(cds)
        cds <- estimateVarianceFunctions(cds,method='blind')
    """)

    # One comparison per unordered pair of distinct column names.
    groups = list(set(colnames))
    couples = itertools.combinations(groups, 2)
    output = self.temporary_path(fname='DE')
    for c in couples:
        out = output + '_' + c[0] + '-' + c[1] + '.txt'
        r_cmd = """
            res <- nbinomTest(cds, '%s', '%s')
            res <- res[order(res[,8]),]
            write.table(res, '%s', row.names=F, quote=F, sep='\t')
        """ % (c[0], c[1], out)
        robjects.r(r_cmd)
        if kw.get('complete') is None:
            clean = self.clean_deseq_output(out, c)
            shutil.move(clean, out)
        self.new_file(out, 'differential_expression')
    return self.display_time()
def __call__(self, **kw):
    """Plot the signals over a set of features as heatmaps or line plots.

    Options read from ``kw``: ``features`` (track file), ``signals`` (a
    track file or a list of them) and ``mode`` (0: one heatmap per signal,
    1: average line plot, 2: mosaic of per-feature line plots limited by
    ``max_pages``). Binning and flanks come from the module-level
    ``nbins``, ``upstr`` and ``downstr``.

    The pdf is registered as 'plot_features'.
    Returns the value of ``self.display_time()``.

    Raises:
        ValueError: if there is no data or the mode is unknown.
    """
    chrmeta = "guess"
    features = track(kw.get('features'), chrmeta=chrmeta)
    signals = kw.get('signals', [])
    if not isinstance(signals, list):
        signals = [signals]
    snames = [os.path.splitext(os.path.basename(sig))[0] for sig in signals]
    signals = [track(sig) for sig in signals]

    # Accumulate labels and data chromosome by chromosome, skipping empty ones.
    labels = None
    data = None
    for chrom in features.chrmeta:
        _l, _d = feature_matrix([s.read(chrom) for s in signals],
                                features.read(chrom), segment=True,
                                nbins=nbins, upstream=upstr, downstream=downstr)
        if _d.size == 0:
            continue
        if data is None:
            labels = _l
            data = _d
        else:
            labels = concatenate((labels, _l))
            data = vstack((data, _d))

    pdf = self.temporary_path(fname='plot_features.pdf')
    if data is None:
        raise ValueError("No data")
    kw['mode'] = int(kw.get('mode', 0))
    X = array(range(-upstr[1]+1, nbins+downstr[1]+1))/(1.0*nbins)
    if kw['mode'] == 0:  # heatmap
        new = True
        for n in range(data.shape[-1]-1):
            heatmap(data[:, :, n], output=pdf, new=new, last=False,
                    rows=labels, columns=X, main=snames[n],
                    orderRows=True, orderCols=False)
            new = False
        heatmap(data[:, :, -1], output=pdf, new=new, last=True,
                rows=labels, columns=X, main=snames[-1],
                orderRows=True, orderCols=False)
    elif kw['mode'] == 1:  # average lineplot
        Y = data.mean(axis=0)
        ymin = min([x.min() for x in Y]+[0])
        ymax = max([x.max() for x in Y])
        lineplot(X, [Y[:, n] for n in range(data.shape[-1])],
                 output=pdf, new=True, last=True, legend=snames, ylim=(ymin, ymax))
    elif kw['mode'] == 2:  # mosaic
        new = True
        mfrow = [4, 3]
        nplot = min(data.shape[0], max_pages*mfrow[0]*mfrow[1])
        ymin = min([data.min(), 0])
        ymax = data.max()
        for reg in range(nplot-1):
            lineplot(X, [data[reg, :, n] for n in range(data.shape[-1])],
                     output=pdf, new=new, last=False, mfrow=mfrow,
                     main=labels[reg], ylim=(ymin, ymax))
            # The layout is only passed with the first plot.
            new = False
            mfrow = []
        # The last plot shows row nplot-1, so it takes that row's label;
        # labels[-1] would be wrong whenever max_pages truncates the rows.
        lineplot(X, [data[nplot-1, :, n] for n in range(data.shape[-1])],
                 output=pdf, new=new, last=True, main=labels[nplot-1],
                 legend=snames, ylim=(ymin, ymax))
    else:
        raise ValueError("Mode not implemented: %s" % kw['mode'])
    self.new_file(pdf, 'plot_features')
    return self.display_time()
def motif_scan_to_track(self, fasta, motifName, motif, background, threshold, chrmeta, output=None):
    """Perform a motif scan and write the results to a track.

    It executes motif_scan(fasta, motif, background, threshold) and inserts
    all results back into a track. If `output` is None, a SQL track is
    created at a temporary path.

    Each result line must hold five tab-separated values: the name of the
    FASTA part, the matched sequence, the score, the starting position
    (1 -> first nucleotide) and the strand (+/-). Blank lines are skipped.

    Returns the path of the output track.
    """
    # The buffer size (used to speed up insertion into SQL tracks although
    # it will probably help for most formats)
    COLLECT_SIZE = 1000
    stream_fields = ['chr', 'start', 'end', 'score', 'name', 'strand']

    results = self.motif_scan(fasta, motif, background, threshold)
    if output is None:
        output = self.temporary_path(fname='motif_finder_results', ext='sql')
    track_output = track(output, fields=['start', 'end', 'score', 'name', 'strand'],
                         chrmeta=chrmeta, info={'datatype': 'features'})
    # Sample: "chr1|chr1:1-230207" -> ["chr1", "chr1", "1", "230207"]
    parse_name = re.compile(r"^(.*)\|(.*):(.*)-(.*)$")

    features = []
    for line in results.splitlines():
        if not line.strip():
            # A blank line cannot be unpacked into the five expected values.
            continue
        [name, seq, score, pos, strand] = line.split("\t")
        score = float(score)
        pos = int(pos) - 1
        length = len(seq)
        regionFrom = 0
        regionTo = length
        # Name parsing is a bit more complicated as we need to handle more
        # cases. If the name is in the assembly format, it can be parsed by
        # the parse_name regex. If not, the name is taken as-is and the
        # positions (regionFrom & regionTo) are assumed to be simple
        # (0 -> length).
        fullName = motifName
        parsed = parse_name.match(name)
        if parsed is not None:
            # Sample: ">chr1|chr1:1-230207"
            name, _, regionFrom, regionTo = parsed.groups()
            regionFrom = int(regionFrom)
            regionTo = int(regionTo)
            # Generate a more explicit name
            # NOTE(review): hasattr() is False for a plain dict key, so this
            # only triggers if the chrmeta entries expose `real_name` as an
            # attribute - confirm against the chrmeta type.
            if chrmeta[name] is not None:
                if hasattr(chrmeta[name], 'real_name') and chrmeta[name]['real_name'] is not None:
                    fullName = chrmeta[name]['real_name'] + " - " + motifName
            # Most track formats do not handle the case where to < from:
            # swap the bounds and flip the strand accordingly.
            if regionTo < regionFrom:
                strand = "+" if strand.strip() == "-" else "-"
                regionFrom, regionTo = regionTo, regionFrom
        features.append((name, regionFrom + pos, regionFrom + pos + length,
                         score, fullName, strand))
        if len(features) >= COLLECT_SIZE:
            # Buffer full -> flush
            track_output.write(FeatureStream(features, fields=stream_fields))
            features = []
    if features:
        # Finished -> flush
        track_output.write(FeatureStream(features, fields=stream_fields))
    track_output.close()
    return output
def __call__(self, **kw):
    """Run a DESeq differential analysis of signals over a set of features.

    Options read from ``kw``: ``feature_type`` (0: genes, 1: promoters,
    2: exons, 3: custom track given by ``features``), ``assembly``,
    ``signals`` and ``upstream``/``downstream`` (promoters only).

    For every feature the signals are summed, scaled by a per-signal
    normalization factor and rounded to integers. One result file per pair
    of distinct column names is written and registered as
    'differential_expression'.

    Returns 1 on success, 2 when the feature type is unknown.

    Raises:
        ValueError: if an assembly is needed but missing.
    """
    feature_type = int(kw.get('feature_type', 0))
    assembly_id = kw.get('assembly')
    chrmeta = "guess"
    if assembly_id:
        assembly = genrep.Assembly(assembly_id)
        chrmeta = assembly.chrmeta
        genes = assembly.gene_track
        exons = assembly.exon_track
    elif not(feature_type == 3):
        raise ValueError("Please specify an assembly")

    if feature_type == 0:
        features = genes
    elif feature_type == 1:
        prom_pars = {'before_start': int(kw.get('upstream') or prom_up_def),
                     'after_start': int(kw.get('downstream') or prom_down_def),
                     'on_strand': True}
        features = lambda c: neighborhood(genes(c), **prom_pars)
    elif feature_type == 2:
        features = exons
    elif feature_type == 3:
        assert os.path.exists(kw.get('features'))
        _t = track(kw.get('features'), chrmeta=chrmeta)
        chrmeta = _t.chrmeta
        features = _t.read
    else:
        return 2

    # One normalization factor per signal, taken from the track info.
    signals = []
    norm_factors = []
    for sig in kw.get('signals', []):
        assert os.path.exists(sig), "File not found: %s." % sig
        _t = track(sig, chrmeta=chrmeta)
        if 'normalization' in _t.info:
            _nf = float(_t.info['normalization'])
        elif 'nreads' in _t.info:
            _nf = float(_t.info['nreads']) * 1e-7 / float(_t.info.get('read_extension', 1))
        else:
            _nf = 1
        signals.append(_t)
        norm_factors.append(_nf)

    if len(signals) > 1:
        _f = ["score" + str(i) for i in range(len(signals))]
    else:
        _f = ["score"]
    de_list = []
    for chrom in chrmeta:
        sread = [sig.read(chrom) for sig in signals]
        mread = score_by_feature(sread, features(chrom), fn='sum')
        de_list.extend(list(mread))
    name_idx = mread.fields.index("name")
    # Turn all scores into integers. The factor index must follow the score
    # column (one per signal), not the row number.
    de_matrix = numpy.asarray(
        [[int(s * norm_factors[k] + .5) for k, s in enumerate(x[-len(_f):])]
         for x in de_list], dtype=numpy.float)
    rownames = numpy.asarray([x[name_idx] for x in de_list])
    colnames = numpy.asarray([os.path.splitext(os.path.basename(s.path))[0]
                              for s in signals])
    del de_list

    output = self.temporary_path(fname='DE')
    robjects.r.assign('Mdata', numpy2ri.numpy2ri(de_matrix))
    robjects.r.assign('row_names', numpy2ri.numpy2ri(rownames))
    robjects.r.assign('col_names', numpy2ri.numpy2ri(colnames))
    robjects.r("""
        Mdata <- as.data.frame(Mdata,row.names=row_names)
        conds <- unlist(strsplit(col_names,".",fixed=T))
        colnames(Mdata) <- conds
        groups <- unique(conds)
        couples <- combn(groups,2)
        # Still need to check that replicates are not identical - lfproc would fail
        if (any(table(conds)>1)){ method = 'normal' # if replicates
        } else { method = 'blind' }
        library(DESeq)
        cds <- newCountDataSet(Mdata, conds)
        cds <- estimateSizeFactors(cds)
        cds <- estimateVarianceFunctions(cds,method='blind')
    """)

    # One comparison per unordered pair of distinct column names.
    groups = list(set(colnames))
    couples = itertools.combinations(groups, 2)
    for c in couples:
        out = output + '_' + c[0] + '-' + c[1] + '.txt'
        print(out)
        r_cmd = """
            res <- nbinomTest(cds, '%s', '%s')
            res <- res[order(res[,8]),]
            write.table(res, '%s', row.names=F)
        """ % (c[0], c[1], out)
        robjects.r(r_cmd)
        self.new_file(out, 'differential_expression')
    return 1