def __call__(self, **kw):
    """Write the overlap of a features track with a filter track.

    Keyword arguments read from ``kw``:

    * ``assembly`` -- optional assembly identifier; when given, its
      ``chrmeta`` is used, otherwise the string "guess" is passed on.
    * ``features`` -- path of the features track (required).
    * ``filter``   -- path of the filter track.
    * ``format``   -- output format; defaults to the features track format.

    For every chromosome the result of ``overlap(features, filter)`` is
    written to a new temporary track, which is then registered under the
    key 'filtered'.  Returns ``self.display_time()``.
    """
    chrmeta = "guess"
    assembly_name = kw.get('assembly')
    if assembly_name:
        chrmeta = genrep.Assembly(assembly_name).chrmeta

    # The features track decides which chromosomes are processed below.
    feature_track = track(kw['features'], chrmeta=chrmeta or None)
    chrmeta = feature_track.chrmeta
    filter_track = track(kw.get('filter'), chrmeta=chrmeta or None)

    out_format = kw.get('format', feature_track.format)
    out_name = feature_track.name + '_filtered.' + out_format
    out_path = self.temporary_path(fname=out_name)
    result = track(out_path, out_format, fields=filter_track.fields,
                   chrmeta=chrmeta, info={'datatype': 'qualitative'})
    for chrom in chrmeta:
        overlapping = overlap(feature_track.read(chrom),
                              filter_track.read(chrom))
        result.write(overlapping, chrom=chrom, clip=True)
    result.close()

    self.new_file(out_path, 'filtered')
    return self.display_time()
def test_annotate_snps(self): assembly = genrep.Assembly('sacCer2') filedict = {'chrV':path+'chrV'} self.outall, self.outexons = annotate_snps(filedict, ["s1","s2"], assembly) with open(self.outall,'r') as f: print "\nAll SNPs ('outall'):\n",f.read() with open(self.outexons,'r') as g: print "\nExonic SNPs ('outexons'):\n",g.read() raise IOError("Error raised voluntarily to print test outputs.")
def _get_chrmeta(**kw):
    """Return the chromosome metadata selected by the ``assembly`` keyword.

    When ``assembly`` is missing or empty, the string "guess" is returned;
    otherwise the ``chrmeta`` attribute of ``genrep.Assembly(assembly)``.
    """
    assembly_id = kw.get('assembly')
    if not assembly_id:
        return "guess"
    return genrep.Assembly(assembly_id).chrmeta
def __call__(self, **kw):
    """Run MEME on a fasta file or on regions of an assembly and archive the result.

    Keyword arguments read from ``kw``:

    * ``input_type`` -- selects the branch; compared against the module-level
      ``input_types`` (entry 0: fasta input, entry 1: regions input).
    * ``fastafile``  -- fasta path (fasta input).
    * ``assembly``, ``regions`` -- assembly id and regions track (regions input).
    * ``meme_args``  -- extra MEME arguments (list); never modified in place.
    * ``nmotifs``    -- number of motifs, defaults to the module-level ``_nm``.

    The MEME output directory and the fasta file are packed into a ``.tgz``
    registered under the key 'meme_archive'.  Returns ``self.display_time()``.

    :raises ValueError: unknown input type, or regions file not found.
    """
    input_type = kw.get('input_type', 0)
    # Form values may arrive as strings: convert when they name a known type.
    if str(input_type) in [str(x[0]) for x in input_types]:
        input_type = int(input_type)

    if input_type in input_types[0]:  # fasta
        fasta = kw.get('fastafile')
        name = os.path.splitext(os.path.basename(fasta))[0]
        assembly = genrep.Assembly(fasta=fasta)
        size = None  # computed later from the fasta itself
    elif input_type in input_types[1]:  # regions
        assembly = genrep.Assembly(kw.get('assembly'))
        regions_file = kw.get('regions') or ''
        if not os.path.exists(regions_file):
            raise ValueError("File not found: %s" % regions_file)
        regions = track(regions_file, chrmeta=assembly.chrmeta)
        name = regions.name
        gRef = assembly.fasta_by_chrom
        fasta = self.temporary_path(fname=regions.name + '.fa')
        (fasta, size) = assembly.fasta_from_regions(
            list(regions.read(fields=['chr', 'start', 'end'])),
            out=fasta, path_to_ref=gRef)
    else:
        raise ValueError("Input type not implemented: %s" % input_type)

    fasta = os.path.abspath(fasta)
    background = assembly.statistics(
        self.temporary_path(fname="background"), frequency=True)
    output = self.temporary_path(fname=name + "_meme.tgz")
    outdir = os.path.join(os.path.split(fasta)[0], name + "_meme")

    # Build a new list instead of using "+=", which would silently modify
    # the list object owned by the caller.
    meme_args = kw.get("meme_args") or []
    nmotifs = kw.get('nmotifs') or _nm
    if '-nmotifs' not in meme_args:
        meme_args = meme_args + ['-nmotifs', "%i" % int(nmotifs)]

    with execution(None) as ex:
        if size is None:
            size = sum(fasta_length(ex, fasta).values())
        meme(ex, fasta, outdir, background,
             maxsize=(size * 3) / 2, args=meme_args)

    # NOTE(review): the archive is assumed to be built after the execution
    # block; the collapsed source does not show the nesting -- confirm.
    tarf = tarfile.open(output, "w:gz")
    try:
        tarf.add(outdir, arcname=os.path.basename(outdir))
        tarf.add(fasta, arcname=os.path.basename(fasta))
    finally:
        # Always release the archive handle, even if adding a member fails.
        tarf.close()
    self.new_file(output, 'meme_archive')
    return self.display_time()
def __call__(self, **kw):
    """Draw a genome-wide graph of signal and feature tracks into a PDF.

    Keyword arguments read from ``kw``:

    * ``assembly``  -- assembly name, or nothing (then "guess" is used).
    * ``SigMultiP`` -- dict with key ``signals_plus``  (path or list of paths).
    * ``SigMultiM`` -- dict with key ``signals_minus`` (path or list of paths).
    * ``FeatMulti`` -- dict with key ``features``      (path or list of paths).

    Paths that do not exist on disk are ignored.  The chromosome metadata is
    taken from the first track actually loaded.  The PDF is registered under
    the key 'genome_graph'.  Returns ``self.display_time()``.

    :raises ValueError: if no existing track file was provided.
    """
    def _as_list(value):
        # A single path is accepted in place of a list of paths.
        return value if isinstance(value, list) else [value]

    def _load(paths):
        return [track(p, chrmeta=assembly) for p in paths if os.path.exists(p)]

    assembly = kw.get('assembly') or 'guess'
    signals_plus = _as_list(kw.get('SigMultiP', {}).get('signals_plus', []))
    signals_minus = _as_list(kw.get('SigMultiM', {}).get('signals_minus', []))
    features = _as_list(kw.get('FeatMulti', {}).get('features', []))

    sptracks = _load(signals_plus)
    smtracks = _load(signals_minus)
    ftracks = _load(features)
    snames = [t.name for t in sptracks + smtracks + ftracks]

    # Test the *loaded* tracks: a requested file that does not exist leaves
    # the corresponding list empty, and indexing it would raise IndexError.
    if sptracks:
        chrmeta = sptracks[0].chrmeta
    elif smtracks:
        chrmeta = smtracks[0].chrmeta
    elif ftracks:
        chrmeta = ftracks[0].chrmeta
    else:
        raise ValueError("No data provided")

    if assembly in [x[0] for x in genrep.GenRep().assemblies_available()]:
        chrnames = genrep.Assembly(assembly).chrnames
    else:
        # Unknown assembly: order chromosomes by decreasing length.
        by_length = sorted([(v['length'], c) for c, v in chrmeta.iteritems()],
                           reverse=True)
        chrnames = [x[1] for x in by_length]

    pdf = self.temporary_path(fname='genome_graph.pdf')
    _fs = ['chr', 'start', 'end', 'score']
    _ff = ['chr', 'start', 'end', 'name']
    genomeGraph([(c, chrmeta[c]['length']) for c in chrnames],
                [sig.read(fields=_fs) for sig in sptracks],
                [sig.read(fields=_fs) for sig in smtracks],
                [feat.read(fields=_ff) for feat in ftracks],
                output=pdf, new=True, last=True, legend=snames)
    self.new_file(pdf, 'genome_graph')
    return self.display_time()
def __call__(self, **kw):
    """Scan sequences for motifs and write the hits to a bed track.

    Keyword arguments read from ``kw``:

    * ``fastafile``   -- fasta file to scan.
    * ``background``  -- background frequencies file; when neither this nor
      an assembly is given, one is generated (uniform, or computed from the
      fasta file when there is one).
    * ``assembly``    -- assembly identifier.
    * ``regions``     -- regions file; requires an assembly.
    * ``motifs``      -- "id name" entries: a list, a single entry, or one
      string with entries separated by "|".
    * ``customMotif`` -- path of a user-supplied motif file.
    * ``threshold``   -- score threshold (default 0).

    The result is registered under the key 'motif_track'.
    Returns ``self.display_time()``.

    :raises ValueError: regions given without an assembly, or no motif given.
    """
    fasta_file = kw.get('fastafile')
    background = kw.get('background') or None
    assembly_id = kw.get('assembly') or None
    regions_file = kw.get('regions') or None
    motifs_list = kw.get('motifs')
    motif_add = kw.get('customMotif')
    threshold = float(kw.get('threshold') or 0)

    if motifs_list is None:
        motifs_list = []
    if isinstance(motifs_list, basestring):
        motifs_list = motifs_list.split("|")
    if not isinstance(motifs_list, list):
        motifs_list = [motifs_list]

    if background is None and assembly_id is None:
        background = self.temporary_path(fname='background.txt')
        stats = {'A': 0.25, 'C': 0.25, 'G': 0.25, 'T': 0.25}
        if fasta_file:
            with execution(None) as ex:
                stats = fasta_composition(ex, fasta_file, frequency=True)
        with open(background, "w") as bgr:
            bgr.write(" ".join(["1"] + [str(stats[n]) for n in 'ACGT']))

    if assembly_id is not None:
        assembly = genrep.Assembly(assembly_id)
    else:
        if regions_file is not None:
            raise ValueError("Please specify an assembly if you specify regions.")
        # regions_file is necessarily None here; it used to be passed to
        # os.path.abspath(), which cannot handle None and made every run
        # without an assembly fail.
        assembly = None

    motifs = {}
    if motif_add is not None:
        mname = os.path.basename(os.path.splitext(motif_add)[0])
        if mname:
            motifs[mname] = os.path.abspath(motif_add)
    for mot in motifs_list:
        # Split on the first space only, so that motif names containing
        # spaces do not break the unpacking.
        gid, mname = mot.split(' ', 1)
        pwmfile = self.temporary_path()
        g.get_motif_PWM(int(gid), mname, output=pwmfile)
        motifs[mname] = pwmfile

    if not motifs:
        raise ValueError("Please give at least one motif to scan for")

    track_output = self.temporary_path(fname='motif_scan', ext="bed")
    with execution(None) as ex:
        save_motif_profile(ex, motifs, assembly, regions_file, fasta_file,
                           background=background, threshold=threshold,
                           output=track_output,
                           description=None, via='local')
    self.new_file(track_output, 'motif_track')
    return self.display_time()
def _get_chrmeta(self, chrmeta=None):
    """Resolve *chrmeta* into a chromosome metadata dictionary.

    :param chrmeta: (str or dict) assembly name, or dict of the type
        {chr: {'length': 1234}}.

    A dict is returned unchanged.  A string other than "guess" replaces
    ``self.assembly``.  If ``self.assembly`` is then known to GenRep, it is
    replaced by the corresponding ``genrep.Assembly`` and that assembly's
    ``chrmeta`` is returned; otherwise ``self.assembly`` is reset to None
    and an empty dict is returned.
    """
    if isinstance(chrmeta, dict):
        return chrmeta

    names_assembly = isinstance(chrmeta, basestring) and str(chrmeta) != "guess"
    if names_assembly:
        self.assembly = chrmeta
    if self.assembly is None:
        return {}

    from bbcflib import genrep
    known = genrep.GenRep().assemblies_available(self.assembly)
    if not known:
        self.assembly = None
        return {}
    self.assembly = genrep.Assembly(self.assembly)
    return self.assembly.chrmeta
def test_map_chromosomes(self):
    """Check ``map_chromosomes`` with and without ``keep``.

    Four different spellings of chromosome IV must all map to 'chrIV'.
    The unknown name 'sth' is kept as is when ``keep=True`` and dropped
    when ``keep=False``.
    """
    rows = [('chrIV', 1), ('IV', 2), (2780, 3), ('NC_001136.9', 4), ('sth', 5)]
    mapped = [('chrIV', 1), ('chrIV', 2), ('chrIV', 3), ('chrIV', 4), ('sth', 5)]

    stream = fstream(list(rows), fields=['chr', 'start'])
    assembly = genrep.Assembly('sacCer2')
    kept = list(map_chromosomes(stream, assembly.chromosomes, keep=True))
    self.assertListEqual(kept, mapped)

    # keep=False: the unmapped last row disappears
    stream = fstream(list(rows), fields=['chr', 'start'])
    dropped = list(map_chromosomes(stream, assembly.chromosomes, keep=False))
    self.assertListEqual(dropped, mapped[:-1])
def createLibrary(ex, assembly_or_fasta, params, url=GlobalHtsUrl, via='local'):
    """Main call to create the library.

    :param ex: execution object passed on to every helper.
    :param assembly_or_fasta: a ``genrep.Assembly``, or a value used as the
        ``fasta`` argument to build one.
    :param params: dict read with the keys 'primary', 'secondary', 'length',
        'type' and 'species'.
    :param url: passed to ``getEnzymeSeqId``.
    :param via: passed to the helpers that run programs.
    :returns: ``[libfiles, bedfiles, resfile, infos_lib]``, or a list of
        four ``None`` when the primary enzyme is shorter than 2 characters.
    """
    if len(params['primary']) < 2:
        print('Some parameters are missing, cannot create the library')
        print('primary='+params['primary']+" ; "+'secondary='+params['secondary'])
        return [None, None, None, None]

    assembly = assembly_or_fasta
    if not isinstance(assembly, genrep.Assembly):
        assembly = genrep.Assembly(ex=ex, fasta=assembly)
    chrnames = assembly.chrnames
    # accession -> chromosome name
    chrom_map = {}
    for chr_name, chr_info in assembly.chrmeta.iteritems():
        chrom_map[chr_info['ac']] = chr_name
    fasta_files = assembly.fasta_by_chrom

    # Start one job per chromosome; the results are collected below.
    libfiles = {}
    for chrom, fasta in fasta_files.iteritems():
        libfiles[chrom] = getRestEnzymeOccAndSeq.nonblocking(
            ex, fasta,
            params['primary'], params['secondary'],
            params['length'], params['type'],
            via=via)

    resfile = unique_filename_in()
    os.mkdir(resfile)
    bedfiles = {}
    for chrom, future in libfiles.iteritems():
        finished = future.wait()
        libfiles[chrom] = finished
        frag_file = finished[1]
        if os.path.getsize(frag_file) <= 0:
            # Empty fragment file: wait a minute, then touch it.
            time.sleep(60)
            touch(ex, frag_file)
        bedfiles[chrom] = parse_fragFile(frag_file, chrom_map)

    coverageInRepeats(ex, bedfiles, params['species'], outdir=resfile, via=via)
    merged_bed = resfile + ".bed"
    bedchrom = [os.path.join(resfile, chrom + ".bed") for chrom in chrnames]
    cat(bedchrom, out=merged_bed)
    gzipfile(ex, [merged_bed] + bedchrom)

    enz_list = []
    enzyme1_id = getEnzymeSeqId(params['primary'], True, enz_list, url)
    enzyme2_id = getEnzymeSeqId(params['secondary'], True, enz_list, url)
    infos_lib = {
        'assembly_name': params['species'],
        'enzyme1_id': enzyme1_id,
        'enzyme2_id': enzyme2_id,
        'segment_length': params['length'],
        'type': params['type'],
        'filename': resfile,
    }
    return [libfiles, bedfiles, resfile, infos_lib]
def __call__(self, **kw):
    """Split the selected score columns of a table into one track per column.

    Keyword arguments read from ``kw``:

    * ``assembly``   -- assembly identifier.
    * ``table``      -- tab-delimited file whose first line holds the column
      names (any '#' is removed).
    * ``id_columns`` -- comma-separated, 1-based column numbers.  Columns
      1-3 (chr, start, end) and numbers beyond the header are ignored.
    * ``format``     -- output format, "bedGraph" by default.

    A single output is registered under the key 'output'; several outputs
    are packed into a ``.tgz`` registered as 'output_tar'.
    Returns ``self.display_time()``.

    :raises ValueError: if no usable column was selected.
    """
    assembly = genrep.Assembly(kw.get('assembly'))
    chrmeta = assembly.chrmeta or "guess"
    with open(kw['table'], "rb") as f:
        h = f.readline().strip().replace('#', '').split('\t')

    colnames = []
    for i in kw['id_columns'].split(','):
        indice = int(i) - 1
        # Valid indices are 3 .. len(h)-1: columns 0,1,2 are chr,start,end,
        # and h[len(h)] does not exist (the bound used to be "<=").
        if 2 < indice < len(h):
            colnames.append(h[indice])
    if not colnames:
        raise ValueError("No valid score column selected: %s" % kw['id_columns'])

    t = track(kw['table'], chrmeta=chrmeta, fields=h)
    (filepath, filename) = os.path.split(kw['table'])
    (shortname, extension) = os.path.splitext(filename)
    outfiles = []
    try:
        for _f in colnames:
            output_name = self.temporary_path(fname=shortname + '_' + _f,
                                              ext=kw.get('format', "bedGraph"))
            out_track = track(output_name, chrmeta=chrmeta)
            s = t.read(fields=['chr', 'start', 'end', _f])
            s.fields[3] = "score"  # the selected column becomes the score
            out_track.write(s, mode='write')
            out_track.close()
            outfiles.append(output_name)
    finally:
        # Release the input table as well; it used to be left open.
        t.close()

    if len(outfiles) > 1:
        tar_name = self.temporary_path(fname=shortname + "_out.tgz")
        tar = tarfile.open(tar_name, "w:gz")
        try:
            for path in outfiles:
                tar.add(path, arcname=os.path.basename(path))
        finally:
            tar.close()
        self.new_file(tar_name, 'output_tar')
    else:
        self.new_file(outfiles[0], 'output')
    return self.display_time()
def __call__(self, **kw):
    """Write a track with the coordinates of genes, exons or transcripts.

    Keyword arguments read from ``kw``:

    * ``assembly``     -- assembly identifier.
    * ``format``       -- output format (used as file extension).
    * ``feature_type`` -- 'genes', 'exons' or 'transcripts'.
    * ``ids_list``     -- optional file with one identifier per line.  When
      given, only these identifiers are written; identifiers without a
      mapping produce the placeholder row ('NA', '0', '0', id, 0.0, '0').
      Otherwise every feature of the mapping is written.

    The track is registered under the key 'fulltrack'.
    Returns ``self.display_time()``.

    :raises ValueError: if ``feature_type`` is not one of the three values.
    """
    assembly = genrep.Assembly(kw.get('assembly'))
    out_format = kw['format']
    feature_type = kw['feature_type']
    if feature_type == 'genes':
        mapping = assembly.get_gene_mapping()
        get_info = self.genes_annot
    elif feature_type == 'exons':
        mapping = assembly.get_exon_mapping()
        get_info = self.exons_annot
    elif feature_type == 'transcripts':
        mapping = assembly.get_transcript_mapping()
        get_info = self.trans_annot
    else:
        # Fail early and clearly, instead of later on an unbound name.
        raise ValueError("Feature type not known: %s" % feature_type)

    def _annotate(ids_path):
        # Yield one row per identifier listed in the file.
        with open(ids_path) as ids_file:
            for line in ids_file:
                feature_id = line.strip()
                if mapping.get(feature_id):
                    yield get_info(feature_id, mapping.get(feature_id))
                else:
                    yield ('NA', '0', '0', feature_id, 0.0, '0')

    ids_list = kw.get('ids_list')
    fields = ['chr', 'start', 'end', 'name', 'score', 'strand']
    if ids_list:
        assert os.path.exists(str(ids_list)), "File not found: '%s'" % ids_list
        fulltrack = FeatureStream(_annotate(ids_list), fields=fields)
        fname = os.path.splitext(os.path.basename(ids_list))[0]
    else:
        fulltrack = FeatureStream((get_info(g, mapping[g]) for g in mapping),
                                  fields=fields)
        fname = feature_type

    output = self.temporary_path(fname=fname + '.' + out_format)
    # NOTE(review): the Assembly object itself is passed as chrmeta, as in
    # the original code -- confirm whether ``assembly.chrmeta`` was intended.
    out = track(output, chrmeta=assembly)
    try:
        out.write(fulltrack)
    finally:
        # The output track used to be left open.
        out.close()
    self.new_file(output, 'fulltrack')
    return self.display_time()
def test_intersect(self):
    """Check ``intersect`` on an annotation track and on two small streams."""
    # Case taken from the snp workflow: one SNP against the CDS annotation.
    snp_expected = ('chr', 91143, 91144,
                    ('C', '*A', '0', '|EBMYCG00000002479|Rv0083', 1, 0))
    assembly = genrep.Assembly('mycoTube_H37RV')
    cds = concat_fields(assembly.annot_track('CDS', 'chr'),
                        infields=['name', 'strand', 'frame'], as_tuple=True)
    snp = fstream([('chr', 91143, 91144, ('C', '*A', '0'))],
                  fields=['chr', 'start', 'end', 'rest'])
    annotated = intersect([snp, cds], win_size=10000)
    self.assertEqual(next(annotated), snp_expected)

    # Two overlapping streams carrying name, strand and score.
    columns = ['chr', 'start', 'end', 'name', 'strand', 'score']
    left = fstream([('chr', 0, 20, 'a1', 1, 6.), ('chr', 40, 60, 'b', 1, 3.)],
                   fields=columns)
    right = fstream([('chr', 10, 30, 'a2', 1, 8.), ('chr', 50, 70, 'b', -1, 4.)],
                    fields=columns)
    overlaps = list(intersect([left, right]))
    self.assertListEqual(overlaps, [('chr', 10, 20, 'a1|a2', 1, 14.),
                                    ('chr', 50, 60, 'b|b', 0, 7.)])
def setUp(self):
    """Prepare the 'mm9' assembly, a fake job and the shared argument tuple.

    ``self.args`` holds, in order: "local", the job, the assembly, the two
    condition names, stderr, stdout, "genes", False and the ``stranded`` flag.
    """
    mouse = genrep.Assembly('mm9')
    self.assembly = mouse
    self.job = fakejob(self.assembly)
    stranded = False
    conditions = ["KO.1", "KO.2"]
    self.args = ("local", self.job, self.assembly, conditions,
                 sys.stderr, sys.stdout, "genes", False, stranded)
def setUp(self):
    """Store the 'sacCer2' genrep assembly in ``self.a``."""
    yeast = genrep.Assembly('sacCer2')
    self.a = yeast
# Creates the genrep assembly 'ce6' and stores it in ``self.assembly``.
# NOTE(review): the line ends with an opening triple quote whose closing
# counterpart is not visible in this excerpt -- presumably the start of a
# disabled block of code; confirm against the complete file.
def setUp(self): self.assembly = genrep.Assembly('ce6') """
def main():
    """Command-line entry point: query a GenRep assembly and print the answer.

    Options are declared from the module-level ``opts`` table.  Depending on
    the flags, the function writes to the output file (or stdout): the list
    of assemblies or of chromosomes (``list``), sequences of ``regions``,
    bowtie/bowtie2 index prefixes, fasta and sqlite paths, gene coordinates
    (``genes``), a bed track of all features (``all``), base statistics
    (``stats``).  With ``convert``, it rewrites the header of a BAM/SAM file
    so that accession numbers become chromosome names.

    :returns: 0 on success, 2 after a ``Usage`` error (the message and the
        usage string are printed on stderr).
    """
    name_pattern = re.compile(r'([._\-\w]+)')
    region_pattern = re.compile(r'(\S+):(\d+)\-(\d+)')

    def _safe_name(value, what):
        # Keep only the first run of word characters, dots, dashes and
        # underscores; reject values that contain no such run at all.
        found = name_pattern.search(str(value))
        if found is None:
            raise Usage("Invalid %s: %s." % (what, value))
        return found.groups()[0]

    def _close(handle):
        # Never close the interpreter's stdout, only real output files.
        if handle is not sys.stdout:
            handle.close()

    try:
        # Parse args
        parser = optparse.OptionParser(usage=usage, description=descr)
        for spec in opts:
            parser.add_option(spec[0], spec[1], help=spec[2], **spec[3])

        # Get variables
        (opt, args) = parser.parse_args()
        if opt.assembly:
            assembly_id = _safe_name(opt.assembly, "assembly name")
        genrep_root = os.path.abspath(opt.root)
        genrep_url = normalize_url(opt.url)
        if opt.output:
            fout = open(_safe_name(opt.output, "output file name"), 'w')
        else:
            fout = sys.stdout
        regions = None
        if opt.regions:
            if os.path.exists(opt.regions):
                regions = opt.regions
            else:
                # Not a file: expect "chr:start-end[,chr:start-end...]".
                regions = []
                for x in str(opt.regions).split(","):
                    found = region_pattern.search(x)
                    if found is None:
                        raise Usage("Invalid region: %s." % x)
                    chrom, start, end = found.groups()[0:3]
                    regions.append([chrom, int(start), int(end)])

        # Program body
        g_rep = genrep.GenRep(url=genrep_url, root=genrep_root)
        if opt.assembly:
            assembly = genrep.Assembly(assembly=assembly_id, genrep=g_rep,
                                       intype=opt.intype)
        if opt.list:
            if opt.assembly:
                table = ["\t".join((v['ac'], k, str(v['length'])))
                         for k, v in assembly.chrmeta.iteritems()]
                fout.write("\n".join(table)+"\n")
            else:
                fout.write("\n".join(v[1] for v in g_rep.assemblies_available())+"\n")
            _close(fout)
            return 0
        if not(opt.assembly):
            parser.print_help()
            _close(fout)
            return 0
        if regions:
            assembly.fasta_from_regions(regions=regions, out=fout)
        if opt.bowtie:
            fout.write(">"+str(assembly.id)+":"+assembly.name+" bowtie index prefix\n")
            fout.write(assembly.index_path+"\n")
        if opt.bowtie2:
            fout.write(">"+str(assembly.id)+":"+assembly.name+" bowtie2 index prefix\n")
            fout.write(re.sub(r'bowtie/', 'bowtie2/', assembly.index_path)+"\n")
        if opt.fasta:
            fout.write(">"+str(assembly.id)+":"+assembly.name+" fasta file\n")
            fout.write(assembly.fasta_path()+"\n")
        if opt.db:
            fout.write(">"+str(assembly.id)+":"+assembly.name+" sqlite file\n")
            fout.write(assembly.sqlite_path+"\n")
        if opt.genes:
            if os.path.exists(opt.genes):
                glist = _parse_list(opt.genes)
            else:
                glist = opt.genes.split(",")
            for gcoord in assembly.gene_coordinates(glist):
                fout.write("\t".join([str(x) for x in gcoord])+"\n")
        if opt.all:
            from bbcflib.track import track
            if opt.intype == 1:
                feats = assembly.exon_track()
            elif opt.intype == 2:
                feats = assembly.transcript_track()
            else:
                feats = assembly.gene_track()
            with track(fout, format='bed', fields=['strand']) as _tfeat:
                _tfeat.write(feats)
        if opt.stats:
            stats = assembly.statistics(frequency=True)
            bases = ["A", "C", "G", "T"]
            fout.write("#Assembly: %s\n" % assembly.name)
            for x in bases:
                fout.write("%s\t%s\n" % (x, stats[x]))
            fout.write("#N\t%s\n" % stats["N"])
            for x in bases:
                for y in bases:
                    fout.write("%s\t%s\n" % (x+y, stats[x+y]))
        # The text output is complete; the conversion below writes its own
        # file with pysam.
        _close(fout)
        if opt.convert:
            if not(os.path.exists(opt.convert)):
                raise Usage("No such file: %s." % opt.convert)
            if not(opt.output):
                raise Usage("Need an output file name.")
            import pysam
            infile = pysam.Samfile(opt.convert)
            header = infile.header
            # accession -> chromosome name
            chromosomes = dict((v['ac'], k) for k, v in assembly.chrmeta.iteritems())
            for h in header["SQ"]:
                if h["SN"] in chromosomes:
                    h["SN"] = chromosomes[h["SN"]]
            outfile = pysam.Samfile(_safe_name(opt.output, "output file name"),
                                    'wb', header=header)
            for read in infile:
                outfile.write(read)
            outfile.close()
            infile.close()
        return 0
    except Usage as err:
        print >>sys.stderr, err.msg
        print >>sys.stderr, usage
        return 2
def __call__(self, **kw):
    """Draw a pairs plot (correlations or density) of several signal tracks.

    Keyword arguments read from ``kw``:

    * ``feature_type`` -- 0: gene bodies, 1: promoters, 2: exons,
      3: custom features track (``features``).
    * ``assembly``     -- assembly identifier; mandatory unless
      ``feature_type`` is 3.
    * ``SigMulti``     -- dict with key ``signals`` (path or list of paths).
    * ``upstream`` / ``downstream`` -- promoter extent (feature type 1);
      default to the module-level ``prom_up_def`` / ``prom_down_def``.
    * ``HiMulti``      -- dict with key ``highlights`` (path, list or None).
    * ``mode``         -- 0: correlation, 1: density (required).
    * ``cormax``       -- correlation window (mode 0), default ``_cormax``.

    The PDF is registered under the key 'plot_pairs'.
    Returns ``self.display_time()``.

    :raises ValueError: missing assembly, unknown feature type or mode, or
        no data to plot.
    """
    feature_type = int(kw.get('feature_type') or 0)
    assembly_id = kw.get('assembly') or None
    chrmeta = "guess"
    if assembly_id:
        assembly = genrep.Assembly(assembly_id)
        chrmeta = assembly.chrmeta
        genes = assembly.gene_track
        exons = assembly.exon_track
    elif not (feature_type == 3):
        raise ValueError("Please specify an assembly")

    signals = kw.get('SigMulti', {}).get('signals', [])
    if not isinstance(signals, list):
        signals = [signals]
    signals = [track(sig, chrmeta=chrmeta) for sig in signals]
    snames = [sig.name for sig in signals]

    # ``features`` is a callable returning the features of one chromosome.
    if feature_type == 0:  # bodies
        features = genes
    elif feature_type == 1:  # promoters
        prom_pars = {
            'before_start': int(kw.get('upstream') or prom_up_def),
            'after_start': int(kw.get('downstream') or prom_down_def),
            'on_strand': True
        }
        features = lambda c: neighborhood(genes(c), **prom_pars)
    elif feature_type == 2:  # exons
        features = exons
    elif feature_type == 3:  # custom track
        _t = track(kw.get('features'), chrmeta=chrmeta)
        chrmeta = _t.chrmeta
        features = _t.read
    else:
        raise ValueError("Feature type not known: %i" % feature_type)

    # Normalise None *before* wrapping into a list: the former test ran
    # after the wrapping, was always true, and let track(None) be called.
    highlights = kw.get('HiMulti', {}).get('highlights', [])
    if highlights is None:
        highlights = []
    elif not isinstance(highlights, list):
        highlights = [highlights]
    highlights = [track(hi, chrmeta=chrmeta) for hi in highlights]

    def _scores(stream, chrom):
        # Score every signal over the features of one chromosome; returns
        # the (array, labels) pair produced by ``score_array``.
        if 'name' not in stream.fields:
            stream = add_name_field(stream)
        means = score_by_feature([s.read(chrom) for s in signals], stream)
        return score_array(means, means.fields[len(stream.fields):])

    pdf = self.temporary_path(fname='plot_pairs.pdf')
    narr = None
    set_index = []
    set_labels = []
    if int(kw['mode']) == 0:  # correl
        cormax = int(kw.get('cormax') or _cormax)
        xarr = array(range(-cormax, cormax + 1))
        srtdchrom = sorted(chrmeta.keys())
        features = [
            x[:3] for chrom in srtdchrom
            for x in sorted_stream(features(chrom))
        ]
        _f = ['chr', 'start', 'end', 'score']
        narr = correlation([s.read(fields=_f) for s in signals], features,
                           (-cormax, cormax), True)
    elif int(kw['mode']) == 1:  # density
        xarr = None
        for chrom in chrmeta:
            _n, _l = _scores(features(chrom), chrom)
            if _n.size == 0:
                continue
            narr = _n if narr is None else vstack((narr, _n))
        # Check here: ``narr.shape`` below cannot be read on None.
        if narr is None:
            raise ValueError("No data")
        # Row offsets delimiting the base set and each highlight set.
        set_index = [narr.shape[0]]
        for hitrack in highlights:
            for chrom in chrmeta:
                _n, _l = _scores(hitrack.read(chrom), chrom)
                if _n.size == 0:
                    continue
                narr = vstack((narr, _n))
                set_labels.extend(_l)
            set_index.append(narr.shape[0])
    else:
        raise ValueError("Mode not implemented: %s" % kw['mode'])

    if narr is None:
        raise ValueError("No data")
    pairs(narr, xarr, labels=snames, output=pdf,
          highlights=[set_index, set_labels])
    self.new_file(pdf, 'plot_pairs')
    return self.display_time()
def __call__(self, **kw):
    """Run a DESeq differential expression analysis through rpy2.

    Two input modes, selected by ``kw['input_type']``:

    * 'Table': ``kw['table']`` is read in R with ``read.delim``; the
      conditions are the column-name prefixes before the first ".".
    * otherwise: the signal files of ``kw['Group1']['signals1']`` and
      ``kw['Group2']['signals2']`` are quantified with
      ``QuantifyTablePlugin`` (``score_op`` forced to 'sum'), scaled by a
      per-track factor, rounded, and handed to R as a count matrix.

    One result file per pair of conditions is registered under the key
    'differential_expression'.  Returns ``self.display_time()``.

    NOTE(review): the R snippets below are reproduced exactly as they
    appear in this view, where their line breaks are collapsed into
    spaces.  Since R comments run to the end of the line, the original
    file presumably has one R statement per line -- confirm against the
    original before relying on these strings.
    """
    if kw.get('input_type') == 'Table':
        filename = kw.get('table')
        assert os.path.exists(
            str(filename)), "File not found: '%s'" % filename
        # Load the table in R and derive the condition of each column.
        robjects.r(""" Mdata = read.delim('%s',row.names=1) conds = sapply(strsplit(colnames(Mdata),".",fixed=T),"[[",1) """ % filename)
        conds = robjects.r("conds").rx()
    else:
        from QuantifyTable import QuantifyTablePlugin
        assembly = genrep.Assembly(kw.get('assembly'))
        chrmeta = assembly.chrmeta or "guess"
        kw['score_op'] = 'sum'
        signals1 = kw['Group1']['signals1']
        signals2 = kw['Group2']['signals2']
        # A single path is accepted in place of a list.
        if not isinstance(signals1, (list, tuple)): signals1 = [signals1]
        if not isinstance(signals2, (list, tuple)): signals2 = [signals2]
        signals = signals1 + signals2
        kw['SigMulti'] = { 'signals': signals }  # to pass it to QuantifyTable plugin
        table = QuantifyTablePlugin().quantify(**kw)
        stracks = []
        norm_factors = []
        for sig in signals:
            assert os.path.exists(
                str(sig)), "Signal file not found: '%s'." % sig
            _t = track(sig, chrmeta=chrmeta)
            # Scaling factor: explicit 'normalization', else derived from
            # 'nreads' and 'read_extension', else 1.
            if 'normalization' in _t.info:
                _nf = float(_t.info['normalization'])
            elif 'nreads' in _t.info:
                _nf = float(_t.info['nreads']) * 1e-7 / float(
                    _t.info.get('read_extension', 1))
            else:
                _nf = 1
            stracks.append(_t)
            norm_factors.append(_nf)
        t = track(table,chrmeta=chrmeta,fields=['chr','start','end','name']+ \
                  ['score%d'%x for x in range(len(signals))])
        _f = [f for f in t.fields if f.startswith('score')]
        de_list = list(t.read(fields=['name'] + _f))
        t.close()
        # The quantification table is only an intermediate file.
        os.remove(table)
        # Turn all scores into integers (scaled, then rounded by adding .5
        # and truncating)
        de_matrix = numpy.asarray([[
            int(float(s) * norm_factors[k] + .5)
            for k, s in enumerate(x[1:])
        ] for x in de_list], dtype=numpy.float)
        rownames = numpy.asarray([x[0] for x in de_list])
        colnames = numpy.asarray([s.name for s in stracks])
        # If all name prefixes (text before the first ".") are identical
        # within each group, keep this prefix as the group identifier.
        if len(list(set( [x.split('.')[0] for x in colnames[:len(signals1)]] ))) == 1 \
           and len(list(set( [x.split('.')[0] for x in colnames[len(signals1):]] ))) == 1:
            group1 = colnames[0].split('.')[0]
            group2 = colnames[-1].split('.')[0]
        else:
            group1 = "Group1"
            group2 = "Group2"
        conds = [group1] * len(signals1) + [group2] * len(signals2)
        # Hand the matrix, its row/column names and the conditions to R.
        robjects.r.assign('Mdata', numpy2ri(de_matrix))
        robjects.r.assign('row_names', robjects.StrVector(rownames))
        robjects.r.assign('col_names', robjects.StrVector(colnames))
        robjects.r.assign('conds', robjects.StrVector(conds))
        robjects.r(""" Mdata = as.data.frame(Mdata,row.names=row_names) colnames(Mdata) = col_names """)
    # Build the DESeq count data set and estimate dispersions; the R code
    # falls back to fitType='local' when the parametric fit fails.
    robjects.r(""" library(DESeq) if (all(table(conds)>=3)){ # if >3 replicates in all conditions method = 'per-condition' # for each group estimate the variance from its replicates sharingMode = 'gene-est-only' # use the per-gene variance estimates only } else if (any(table(conds)>1)){ # if few replicates method = 'pooled' # use all groups with replicates to estimate the variance sharingMode = 'maximum' # use the max of the GLM fit and the estimated variance } else { # if no replicates method = 'blind' # pools all groups together to estimate the variance sharingMode='fit-only' # use only the GLM fit across the pooled variance } cds = newCountDataSet(Mdata, conds) cds = estimateSizeFactors(cds) test = try({ cds = estimateDispersions(cds, method=method, fitType='parametric', sharingMode=sharingMode) }) if(class(test) == "try-error") { cds = estimateDispersions(cds, method=method, fitType='local', sharingMode=sharingMode) } """)
    groups = list(set(conds))
    # One comparison per unordered pair of conditions.
    couples = itertools.combinations(groups, 2)
    output = self.temporary_path(fname='DE')
    for c in couples:
        out = "%s_%s-%s.txt" % ((output, ) + tuple(c))
        robjects.r(""" res = nbinomTest(cds, '%s', '%s') write.table(res[order(res[,8]),], '%s', row.names=F, quote=F, sep='\t') """ % (c[0], c[1], out))
        # Unless 'complete' is set, replace the raw output by the cleaned
        # version produced by ``clean_deseq_output``.
        if kw.get('complete') is None:
            clean = self.clean_deseq_output(out, c)
            shutil.move(clean, out)
        self.new_file(out, 'differential_expression')
    return self.display_time()
def __call__(self,opts): self.opts = opts if os.path.exists(self.opts.wdir): os.chdir(self.opts.wdir) else: raise Usage("Working directory '%s' does not exist." %self.opts.wdir) ##### Connect to Minilims, recover global variables, fetch job info self.minilims = os.path.join(self.opts.basepath,self.name+"_minilims") M = MiniLIMS(self.minilims) if not((self.opts.key != None or (self.opts.config and os.path.exists(self.opts.config)))): raise Usage("Need a job key or a configuration file") if self.opts.key: self.globals = use_pickle(M, "global variables") htss = frontend.Frontend( url=self.globals['hts_mapseq']['url'] ) self.job = htss.job( self.opts.key ) [M.delete_execution(x) for x in \ M.search_executions(with_description=self.opts.key,fails=True)] if self.job.options.get("config_file"): if os.path.exists(self.job.options["config_file"]): self.opts.config = os.path.abspath(self.job.options["config_file"]) elif os.path.exists("config.txt"): self.opts.config = os.path.abspath("config.txt") if self.opts.config and os.path.exists(self.opts.config): (self.job,self.globals) = frontend.parseConfig( self.opts.config, self.job, self.globals ) elif os.path.exists(self.opts.config): (self.job,self.globals) = frontend.parseConfig( self.opts.config ) self.opts.key = self.job.description else: raise Usage("Need either a job key (-k) or a configuration file (-c).") ##### Genrep instance if 'fasta_file' in self.job.options: if os.path.exists(self.job.options['fasta_file']): self.job.options['fasta_file'] = os.path.abspath(self.job.options['fasta_path']) else: for ext in (".fa",".fa.gz",".tar.gz"): if os.path.exists("ref_sequence"+ext): self.job.options['fasta_file'] = os.path.abspath("ref_sequence"+ext) if not os.path.exists(self.job.options['fasta_file']): raise Usage("Don't know where to find fasta file %s." 
%self.job.options["fasta_file"]) g_rep = genrep.GenRep( url=self.globals.get("genrep_url"), root=self.globals.get("bwt_root") ) ##### Configure facility LIMS if 'lims' in self.globals: from bbcflib import daflims self.job.dafl = dict((loc,daflims.DAFLIMS( username=self.globals['lims']['user'], password=pwd )) for loc,pwd in self.globals['lims']['passwd'].iteritems()) ######################################################################## ########################## EXECUTION ################################# ######################################################################## ##### Logging logfile_name = os.path.abspath(self.opts.key+".log") debugfile_name = os.path.abspath(self.opts.key+".debug") self.logfile = open(logfile_name,'w') self.debugfile = open(debugfile_name,'w') self.debug_write(json.dumps(self.globals)+"\n") with execution( M, description=self.opts.key, remote_working_directory=self.opts.wdir ) as ex: self.log_write("Enter execution. Current working directory: %s" %ex.working_directory) self.job.assembly = genrep.Assembly( assembly=self.job.assembly_id, genrep=g_rep, fasta=self.job.options.get('fasta_file'), annot=self.job.options.get('annot_file'), intype=self.job.options.get('input_type_id',0), ex=ex, via=self.opts.via, bowtie2=self.job.options.get("bowtie2",True) ) ##### Check all the options if not self.check_options(): raise Usage("Problem with options %s" %self.opts) self.debug_write(json.dumps(self.job.options)) self.init_files( ex ) ##### Run workflow self.log_write("Starting workflow.") self.main_func(ex,**self.main_args) ##### Add logs to the LIMS in admin mode self.logfile.flush() self.debugfile.flush() log_desc = set_file_descr('logfile.txt', step='log', type='txt', view="admin") debug_desc = set_file_descr('debug.txt', step='log', type='txt', view="admin") ex.add(os.path.join(logfile_name), description=log_desc) ex.add(os.path.join(debugfile_name), description=debug_desc) ##### Create GDV project if 
self.job.options['create_gdv_project']: self.gdv_create(ex) ######################################################################## ######################## POSTPROCESSING ############################## ######################################################################## allfiles = get_files( ex.id, M ) if self.job.options['create_gdv_project'] and \ self.job.options['gdv_project'].get('project',{}).get('id',0)>0: allfiles['url'] = self.gdv_upload(allfiles.get('sql',{})) self.logfile.close() self.debugfile.close() print json.dumps(allfiles) with open(self.opts.key+".done",'w') as done: json.dump(allfiles,done) self.send_email() return 0
#!/usr/bin/env python from bbcflib import genrep import os, getopt, sys opts = dict(getopt.getopt(sys.argv[1:], "d:", [])[0]) basepath = opts.get('-d') or "/data/epfl/bbcf/genrep/nr_assemblies" basepath += "/%s" for _a, info in genrep.GenRep().assemblies_available(): for n in range(100): assembly = genrep.Assembly(_a) gtf_path = os.path.join(basepath % "gtf", "%s_%i.gtf.gz" % (assembly.md5, n)) if not (assembly.bbcf_valid and os.path.exists(gtf_path)): break sql_path = os.path.join(basepath % "annot_tracks", "%s_%i.sql" % (assembly.md5, n)) if os.path.exists(sql_path): continue print info, gtf_path, sql_path assembly.gtf_to_sql(gtf_path=gtf_path, sql_path=sql_path)
def run(**kwargs):
    """
    Wrapper function to execute any operation contained in this package,
    directly from file inputs. Arguments are:

    :param operation: (str) the name of the function to be called.
    :param output: (str) a filename or a directory to write the results into.
    :param assembly: (str) a genome assembly identifier if needed.
    :param chromosome: (str) a chromosome name if operation must be
        restricted to a single chromosome.
    :param ...: additional parameters passed to `operation`.
    :returns: the output file name, or a list of file names when the
        operation returns a list of streams.
    :raises ValueError: unknown operation, no track argument, or no
        chromosome to process.

    Example::

        run(operation="score_by_feature", output="score_output.bed",
            chromosome="chr1", trackScores="density_file.sql",
            trackFeatures="genes.sql")
    """
    from bbcflib import genrep

    def _map(fct):
        # Find the module of this package whose class provides `fct`.
        for module in _module_list:
            __import__(_here + module)
            smod = sys.modules[_here + module]
            if hasattr(getattr(smod, module)(), fct):
                return module
        return None

    def _strip_suffix(text, suffix):
        # str.strip(chars) removes a *set of characters* from both ends and
        # so can eat into the file name itself; remove exactly one trailing
        # occurrence of `suffix` instead.
        if suffix and text.endswith(suffix):
            return text[:-len(suffix)]
        return text

    funct = kwargs.pop("operation", 'None')
    module = _map(funct)
    if module is None:
        raise ValueError("No such operation %s." % funct)

    output = kwargs.pop("output", "./") or "./"
    if os.path.isdir(output):
        output = os.path.join(output, unique_filename_in(output) + ".sql")
        out_format = "sql"
    else:
        out_format = os.path.splitext(output)[1][1:] or "sql"
    if out_format in ['gz', 'gzip']:
        # Compressed output: the format is the inner extension + compression.
        inner = os.path.splitext(_strip_suffix(output, "." + out_format))[1][1:]
        out_format = inner + "." + out_format

    smod = sys.modules[_here + module]
    # Names of the arguments of `funct` that must be loaded as tracks.
    loadable = list(getattr(smod, module)().loadable(funct))
    if not loadable:
        raise ValueError("Operation %s takes no track argument." % funct)
    trackSet = {}
    for targ in loadable:
        trackSet[targ] = [track(t) for t in kwargs[targ].split(",")]

    assembly = None
    if 'assembly' in kwargs:
        assembly = kwargs.pop('assembly')
    if assembly:
        chrmeta = genrep.Assembly(assembly).chrmeta
    else:
        # No assembly: use the metadata of the first track of the last
        # loadable argument (same choice as before, made explicit).
        chrmeta = trackSet[loadable[-1]][0].chrmeta
    if 'chromosome' in kwargs:
        chrom = kwargs.pop('chromosome')
        chrmeta = {chrom: chrmeta.get(chrom, {})}
    chroms = list(chrmeta)
    if not chroms:
        raise ValueError("No chromosome to process.")
    info = None
    if 'datatype' in kwargs:
        info = {'datatype': kwargs.pop('datatype')}

    def _apply(chrom_name):
        # Read every input track on one chromosome and call the operation.
        for targ in loadable:
            kwargs[targ] = [t.read(selection=chrom_name) for t in trackSet[targ]]
        return getattr(smod, funct)(**kwargs)

    first, others = chroms[0], chroms[1:]
    funct_output = _apply(first)
    if isinstance(funct_output, list):
        # One output file per stream.  The prefix keeps the dot that
        # precedes the extension, as before.
        prefix = _strip_suffix(output, out_format)
        files = []
        for n, stream in enumerate(funct_output):
            outf = "%s_%i.%s" % (prefix, n, out_format)
            files.append(outf)
            track(outf, chrmeta=chrmeta, fields=stream.fields,
                  info=info).write(stream, chrom=first)
        for chrom_name in others:
            for n, stream in enumerate(_apply(chrom_name)):
                track(files[n], chrmeta=chrmeta).write(stream, chrom=chrom_name,
                                                       mode='append')
    else:
        files = output
        track(files, chrmeta=chrmeta, fields=funct_output.fields,
              info=info).write(funct_output, chrom=first)
        for chrom_name in others:
            track(files, chrmeta=chrmeta).write(_apply(chrom_name),
                                                chrom=chrom_name, mode='append')
    return files