def __init__(self, url=None, config=None, section='frontend'):
    """Store the normalized frontend URL on ``self.url``.

    The URL comes from ``url`` when it is given; otherwise it is read from
    ``config.get(section, 'frontend_url')``. In both cases the value is
    passed through ``normalize_url`` before being stored.

    Raises:
        TypeError: if both ``url`` and ``config`` are None.
    """
    # Identity tests: ``== None`` can be fooled by objects overriding __eq__.
    if url is None and config is None:
        raise TypeError("Must specify a URL or a configuration.")
    if url is not None:
        # An explicit URL takes precedence over the configuration.
        self.url = normalize_url(url)
    else:
        self.url = normalize_url(config.get(section, 'frontend_url'))
def _gdv_request(**kw):
    """Send a POST request to a GDV server and return its response.

    The target URL is built by joining, with ``/``, the values of
    ``serv_url`` (normalized first), ``obj``, ``action`` and ``id`` that are
    present in ``kw``. Only the keywords listed in ``req_keys`` are sent as
    url-encoded form data; ``serv_url`` is required.

    Returns the decoded JSON document when ``return_type == 'json'``,
    otherwise the raw response body.
    """
    req_keys = ['mail', 'key', 'project_key', 'project_id', 'name', 'assembly',
                'url', 'fsys', 'trackname', 'force', 'extension', 'delfile']
    request = dict((k, kw[k]) for k in req_keys if k in kw)
    kw['serv_url'] = normalize_url(kw['serv_url'])
    url = "/".join([kw[k] for k in ['serv_url', 'obj', 'action', 'id'] if k in kw])
    # Passing a data argument makes urlopen issue a POST.
    req = urllib2.urlopen(url, urllib.urlencode(request))
    try:
        if kw.get('return_type', '') == 'json':
            return json.load(req)
        return req.read()
    finally:
        # Release the connection even when reading or JSON decoding fails.
        req.close()
def _gdv_request(**kw):
    """Post a request to a GDV server and hand back what it answers.

    ``kw['serv_url']`` is mandatory and is normalized in place. The request
    URL is ``serv_url[/obj][/action][/id]``, using whichever of those keys
    are present. The form data consists of the subset of ``kw`` whose keys
    appear in ``req_keys``.

    Returns the parsed JSON when ``kw['return_type'] == 'json'``; in every
    other case the response body as read from the connection.
    """
    req_keys = [
        'mail', 'key', 'project_key', 'project_id', 'name', 'assembly',
        'url', 'fsys', 'trackname', 'force', 'extension', 'delfile'
    ]
    request = dict((k, kw[k]) for k in req_keys if k in kw)
    kw['serv_url'] = normalize_url(kw['serv_url'])
    url = "/".join(
        [kw[k] for k in ['serv_url', 'obj', 'action', 'id'] if k in kw])
    req = urllib2.urlopen(url, urllib.urlencode(request))
    try:
        if kw.get('return_type', '') == 'json':
            return json.load(req)
        return req.read()
    finally:
        # The response object was previously left open; always close it.
        req.close()
def gdv_upload(self, files):
    """Register the given sql files as tracks of the job's GDV project.

    ``files`` maps a file key to a description; each key is appended to the
    module's normalized download URL and each description is truncated at
    its first ``.sql`` to give the track name. A failure of the GDV call is
    written to the debug log and otherwise ignored.
    """
    gdv_conf = self.globals['gdv']
    project = self.job.options['gdv_project']['project']
    base_url = normalize_url(self.globals['hts_' + self.name]['download'])

    # url -> track name
    tracks = {}
    for file_key, file_descr in files.iteritems():
        track_url = "%s/%s" % (base_url, file_key)
        tracks[track_url] = re.sub(r'\.sql.*', '', str(file_descr))

    header = "Uploading GDV tracks:\n"
    self.log_write(header + " ".join(tracks.keys()) + "\n" + " ".join(tracks.values()))

    try:
        status = gdv.multiple_tracks(
            mail=gdv_conf['email'],
            key=gdv_conf['key'],
            serv_url=gdv_conf['url'],
            project_id=project['id'],
            extensions=['sql'] * len(tracks),
            urls=tracks.keys(),
            names=tracks.values(),
            force=True,
        )
        report = "\n".join([str(entry) for entry in status])
        self.debug_write("GDV Tracks Status\n" + report)
    except Exception as err:
        # Best effort: the upload must not abort the workflow.
        self.debug_write("GDV Tracks Failed: %s" % err)
def send_email(self):
    """Send the "job finished" notification to the job's e-mail address(es).

    Nothing is sent when the ``noemail`` option is set, when the globals
    have no ``'email'`` section, or when the job has no ``email`` attribute.
    ``self.job.email`` is split on commas to obtain the recipients.
    """
    if self.opts.noemail:
        return
    if 'email' not in self.globals or not hasattr(self.job, "email"):
        return

    from bbcflib import email

    sender = self.globals['email']['sender']
    recipients = str(self.job.email).split(',')
    subject = "%s job %s" % (self.name, str(self.job.description))
    smtp_server = self.globals['email']['smtp']
    report = email.EmailReport(sender=sender, to=recipients,
                               subject=subject, smtp_server=smtp_server)

    # NOTE(review): the literal is reproduced exactly as it appears in the
    # reviewed source, where line breaks may have been collapsed - confirm
    # against the file before changing its layout.
    template = ''' Your %s job has finished. The description was: %s and its unique key is %s You can now retrieve the results at this url: %s/jobs/%s/get_results '''
    job_key = self.opts.key
    results_root = normalize_url(self.globals['hts_' + self.name]['url'])
    report.appendBody(template % (self.name, self.job.description, job_key,
                                  results_root, job_key))
    report.send()
def main():
    """Command-line entry point: query a GenRep assembly and write the results.

    Options are declared in the module-level ``opts`` table and parsed with
    optparse. Depending on the flags, the function lists assemblies or
    chromosomes, extracts fasta regions, prints index/fasta/sqlite paths,
    gene coordinates, annotation tracks or base statistics, and can rewrite
    the chromosome names of a BAM file (``--convert``).

    Returns:
        0 on success, 2 after a usage error (message and usage text are
        written to stderr).
    """
    try:
        # Parse args
        parser = optparse.OptionParser(usage=usage, description=descr)
        for spec in opts:
            parser.add_option(spec[0], spec[1], help=spec[2], **spec[3])

        # Get variables
        (opt, args) = parser.parse_args()
        if opt.assembly:
            # Keep only the first run of "safe" characters of the name.
            assembly_id = re.search(r'([._\-\w]+)', str(opt.assembly)).groups()[0]
        genrep_root = os.path.abspath(opt.root)
        genrep_url = normalize_url(opt.url)
        if opt.output:
            fout = open(re.search(r'([._\-\w]+)', str(opt.output)).groups()[0], 'w')
        else:
            fout = sys.stdout
        regions = None
        if opt.regions:
            if os.path.exists(opt.regions):
                # An existing path is forwarded as is.
                regions = opt.regions
            else:
                regions = []
                for x in str(opt.regions).split(","):
                    found = re.search(r'(\S+):(\d+)\-(\d+)', x)
                    if found is None:
                        # Previously crashed with AttributeError on None.
                        raise Usage("Invalid region '%s', expected chr:start-end." % x)
                    chrom, start, end = found.groups()[0:3]
                    regions.append([chrom, int(start), int(end)])

        # Program body
        g_rep = genrep.GenRep(url=genrep_url, root=genrep_root)
        if opt.assembly:
            assembly = genrep.Assembly(assembly=assembly_id, genrep=g_rep,
                                       intype=opt.intype)
        if opt.list:
            if opt.assembly:
                table = ["\t".join((v['ac'], k, str(v['length'])))
                         for k, v in assembly.chrmeta.iteritems()]
                fout.write("\n".join(table) + "\n")
            else:
                fout.write("\n".join(v[1] for v in g_rep.assemblies_available()) + "\n")
            return 0
        if not(opt.assembly):
            # Everything below needs an assembly.
            parser.print_help()
            return 0
        if regions:
            seq = assembly.fasta_from_regions(regions=regions, out=fout)[0]
        if opt.bowtie:
            fout.write(">" + str(assembly.id) + ":" + assembly.name + " bowtie index prefix\n")
            fout.write(assembly.index_path + "\n")
        if opt.bowtie2:
            fout.write(">" + str(assembly.id) + ":" + assembly.name + " bowtie2 index prefix\n")
            fout.write(re.sub(r'bowtie/', 'bowtie2/', assembly.index_path) + "\n")
        if opt.fasta:
            fout.write(">" + str(assembly.id) + ":" + assembly.name + " fasta file\n")
            fout.write(assembly.fasta_path() + "\n")
        if opt.db:
            fout.write(">" + str(assembly.id) + ":" + assembly.name + " sqlite file\n")
            fout.write(assembly.sqlite_path + "\n")
        if opt.genes:
            # Either a file parsed by _parse_list or a comma-separated list.
            if os.path.exists(opt.genes):
                glist = _parse_list(opt.genes)
            else:
                glist = opt.genes.split(",")
            for gcoord in assembly.gene_coordinates(glist):
                fout.write("\t".join([str(x) for x in gcoord]) + "\n")
        if opt.all:
            from bbcflib.track import track
            if opt.intype == 1:
                feats = assembly.exon_track()
            elif opt.intype == 2:
                feats = assembly.transcript_track()
            else:
                feats = assembly.gene_track()
            with track(fout, format='bed', fields=['strand']) as _tfeat:
                _tfeat.write(feats)
        if opt.stats:
            stats = assembly.statistics(frequency=True)
            bases = ["A", "C", "G", "T"]
            fout.write("#Assembly: %s\n" % assembly.name)
            # Single-base entries, then N, then the 16 dinucleotides.
            for x in bases:
                fout.write("%s\t%s\n" % (x, stats[x]))
            fout.write("#N\t%s\n" % stats["N"])
            for x in bases:
                for y in bases:
                    fout.write("%s\t%s\n" % (x + y, stats[x + y]))
        # Closed before --convert, which reopens the same output name.
        fout.close()
        if opt.convert:
            if not(os.path.exists(opt.convert)):
                raise Usage("No such file: %s." % opt.convert)
            if not(opt.output):
                raise Usage("Need an output file name.")
            import pysam
            infile = pysam.Samfile(opt.convert)
            header = infile.header
            # Map the 'ac' value of each chromosome to its chrmeta key.
            chromosomes = dict((v['ac'], k) for k, v in assembly.chrmeta.iteritems())
            for h in header["SQ"]:
                if h["SN"] in chromosomes:
                    h["SN"] = chromosomes[h["SN"]]
            outfile = pysam.Samfile(re.search(r'([._\-\w]+)', str(opt.output)).groups()[0],
                                    'wb', header=header)
            for read in infile:
                outfile.write(read)
            outfile.close()
            infile.close()
        return 0
    except Usage as err:
        sys.stderr.write("%s\n" % (err.msg,))
        sys.stderr.write("%s\n" % (usage,))
        return 2
class Workflow(object):
    """Generic driver that runs one bbcflib analysis module as a job.

    An instance is built once with the module description (``__init__``) and
    then called with the parsed command-line options (``__call__``), which
    loads the job, runs ``<module>_workflow`` inside a MiniLIMS execution and
    post-processes the resulting files.
    """

    def __init__(self, **kw):
        """Declare the command-line options and locate the workflow function.

        Recognised keywords: ``module`` (default ``'HTSstation'``), ``name``
        (defaults to ``module``), ``opts`` (extra option tuples appended to
        the defaults), ``usage`` (text appended to the base usage string)
        and ``desc`` (description, defaults to the module-level one).
        """
        self.module = kw.get('module', 'HTSstation')
        #### name can be different from module: e.g. c4seq is the python module,
        #### but the name 4cseq is used in outputs and in variable names
        self.name = kw.get('name', self.module)
        # Option table; __call__ later replaces self.opts with the parsed options.
        # Note that the last two entries carry no short flag (3-tuples).
        self.opts = (
            ("-v", "--via", "Run executions locally or remotely (can be 'local' or 'lsf')",
             {'default': _via}),
            ("-k", "--key", "Alphanumeric key of the %s job" % self.name,
             {'default': None}),
            ("-w", "--working-directory", "Directory to run execution in",
             {'default': os.getcwd(), 'dest': "wdir"}),
            ("-c", "--config", "Configuration file",
             {'default': None}),
            ("--no-email", "Do not send email",
             {'action': "store_true", 'dest': "noemail"}),
            ("--basepath", "HTS data basepath (%s MiniLIMS)" % self.name,
             {'default': _basepath}))
        self.opts += kw.get("opts", ())
        self.usage = _usage + str(kw.get('usage', ''))
        self.desc = kw.get('desc', _description)
        #### By default the workflow will execute the call
        #### X_workflow(ex,**self.main_args) from bbcflib.X where X is the module name
        #### Can be overloaded in derived classes
        __import__('bbcflib.' + self.module)
        self.sysmod = getattr(sys.modules['bbcflib'], self.module)
        self.main_func = getattr(sys.modules["bbcflib." + self.module],
                                 self.module + "_workflow")
        self.main_args = {}
        self.logfile = None
        self.debugfile = None

    def __call__(self, opts):
        """Run the whole job described by the parsed options ``opts``.

        Raises ``Usage`` when the working directory does not exist, when
        neither a job key nor a readable configuration file is available,
        when a declared fasta file cannot be found, or when
        ``check_options`` reports a problem. Returns 0 on success.
        """
        self.opts = opts
        if os.path.exists(self.opts.wdir):
            os.chdir(self.opts.wdir)
        else:
            raise Usage("Working directory '%s' does not exist." % self.opts.wdir)

        ##### Connect to Minilims, recover global variables, fetch job info
        self.minilims = os.path.join(self.opts.basepath, self.name + "_minilims")
        M = MiniLIMS(self.minilims)
        if self.opts.key is None and \
                not (self.opts.config and os.path.exists(self.opts.config)):
            raise Usage("Need a job key or a configuration file")
        if self.opts.key:
            self.globals = use_pickle(M, "global variables")
            htss = frontend.Frontend(url=self.globals['hts_mapseq']['url'])
            self.job = htss.job(self.opts.key)
            # Remove failed executions previously recorded for this key.
            for failed in M.search_executions(with_description=self.opts.key, fails=True):
                M.delete_execution(failed)
            if self.job.options.get("config_file"):
                if os.path.exists(self.job.options["config_file"]):
                    self.opts.config = os.path.abspath(self.job.options["config_file"])
                elif os.path.exists("config.txt"):
                    self.opts.config = os.path.abspath("config.txt")
            if self.opts.config and os.path.exists(self.opts.config):
                (self.job, self.globals) = frontend.parseConfig(
                    self.opts.config, self.job, self.globals)
        elif self.opts.config and os.path.exists(self.opts.config):
            # Guarding on self.opts.config avoids os.path.exists(None) when the
            # key is empty but not None.
            (self.job, self.globals) = frontend.parseConfig(self.opts.config)
            self.opts.key = self.job.description
        else:
            raise Usage("Need either a job key (-k) or a configuration file (-c).")

        ##### Genrep instance
        if 'fasta_file' in self.job.options:
            if os.path.exists(self.job.options['fasta_file']):
                # Was reading the 'fasta_path' key, which nothing here sets.
                self.job.options['fasta_file'] = os.path.abspath(
                    self.job.options['fasta_file'])
            else:
                # Fall back on ref_sequence.* in the working directory; when
                # several exist, the last matching extension wins.
                for ext in (".fa", ".fa.gz", ".tar.gz"):
                    if os.path.exists("ref_sequence" + ext):
                        self.job.options['fasta_file'] = os.path.abspath("ref_sequence" + ext)
                if not os.path.exists(self.job.options['fasta_file']):
                    raise Usage("Don't know where to find fasta file %s."
                                % self.job.options["fasta_file"])
        g_rep = genrep.GenRep(url=self.globals.get("genrep_url"),
                              root=self.globals.get("bwt_root"))

        ##### Configure facility LIMS
        if 'lims' in self.globals:
            from bbcflib import daflims
            # One DAFLIMS connection per entry of globals['lims']['passwd'].
            self.job.dafl = dict(
                (loc, daflims.DAFLIMS(username=self.globals['lims']['user'], password=pwd))
                for loc, pwd in self.globals['lims']['passwd'].iteritems())

        ########################################################################
        ##########################  EXECUTION  #################################
        ########################################################################
        ##### Logging
        logfile_name = os.path.abspath(self.opts.key + ".log")
        debugfile_name = os.path.abspath(self.opts.key + ".debug")
        self.logfile = open(logfile_name, 'w')
        self.debugfile = open(debugfile_name, 'w')
        try:
            self.debug_write(json.dumps(self.globals) + "\n")
            with execution(M, description=self.opts.key,
                           remote_working_directory=self.opts.wdir) as ex:
                self.log_write("Enter execution. Current working directory: %s"
                               % ex.working_directory)
                self.job.assembly = genrep.Assembly(
                    assembly=self.job.assembly_id,
                    genrep=g_rep,
                    fasta=self.job.options.get('fasta_file'),
                    annot=self.job.options.get('annot_file'),
                    intype=self.job.options.get('input_type_id', 0),
                    ex=ex, via=self.opts.via,
                    bowtie2=self.job.options.get("bowtie2", True))
                ##### Check all the options
                if not self.check_options():
                    raise Usage("Problem with options %s" % self.opts)
                self.debug_write(json.dumps(self.job.options))
                self.init_files(ex)
                ##### Run workflow
                self.log_write("Starting workflow.")
                self.main_func(ex, **self.main_args)
                ##### Add logs to the LIMS in admin mode
                self.logfile.flush()
                self.debugfile.flush()
                log_desc = set_file_descr('logfile.txt', step='log', type='txt', view="admin")
                debug_desc = set_file_descr('debug.txt', step='log', type='txt', view="admin")
                ex.add(os.path.join(logfile_name), description=log_desc)
                ex.add(os.path.join(debugfile_name), description=debug_desc)
                ##### Create GDV project
                if self.job.options['create_gdv_project']:
                    self.gdv_create(ex)

            ####################################################################
            ######################  POSTPROCESSING  ############################
            ####################################################################
            allfiles = get_files(ex.id, M)
            if self.job.options['create_gdv_project'] and \
                    self.job.options['gdv_project'].get('project', {}).get('id', 0) > 0:
                allfiles['url'] = self.gdv_upload(allfiles.get('sql', {}))
        finally:
            # Close the log files even when the execution fails.
            self.logfile.close()
            self.debugfile.close()
        print(json.dumps(allfiles))
        with open(self.opts.key + ".done", 'w') as done:
            json.dump(allfiles, done)
        self.send_email()
        return 0

    ########################################################################
    #############################  DONE!  ##################################
    ########################################################################
    def log_write(self, mesg):
        """Write ``mesg`` plus a newline to the log file and flush it."""
        self.logfile.write(mesg + "\n")
        self.logfile.flush()

    def debug_write(self, mesg):
        """Write ``mesg`` plus a newline to the debug file and flush it."""
        self.debugfile.write(mesg + "\n")
        self.debugfile.flush()

    def check_options(self, more_defs=None):
        """Fill in default job options and normalise them to booleans.

        ``more_defs`` optionally adds or overrides entries of the defaults
        table, using the same ``(default_value, condition, ...)`` layout.
        String option values are true only for '1', 'true' or 't' (any
        case); every option is then and-ed with all of its extra conditions.
        Also sets ``gdv_project``, ``gdv_key``, ``self.mapseq_minilims`` and
        ``self.suffix``.

        Returns True, which the caller tests before running the workflow.
        """
        try:
            string_types = basestring
        except NameError:  # no basestring on Python 3
            string_types = str
        ### { option_name: (default_value, additional_conditions, ...) }
        defaults = {'compute_densities': (True,),
                    'ucsc_bigwig': (True, (self.job.assembly.intype == 0)),
                    'create_gdv_project': (False,)}
        if more_defs:
            defaults.update(more_defs)
        for op, val in defaults.items():
            self.job.options.setdefault(op, val[0])
            if isinstance(self.job.options[op], string_types):
                self.job.options[op] = self.job.options[op].lower() in ['1', 'true', 't']
            self.job.options[op] &= all(val[1:])
        self.job.options['gdv_project'] = {
            'project': {'id': int(self.job.options.get('gdv_project_id', 0))}}
        self.job.options.setdefault('gdv_key', "")
        if hasattr(self.opts, "mapseq_minilims") and self.opts.mapseq_minilims:
            self.mapseq_minilims = self.opts.mapseq_minilims
        else:
            self.mapseq_minilims = os.path.join(self.opts.basepath, "mapseq_minilims")
        self.suffix = ['fwd', 'rev']
        # A bare ``return`` made the caller's ``if not self.check_options()``
        # raise Usage on every run.
        return True

    def init_files(self, ex):
        """ Default behaviour for most modules: get bam files from mapping."""
        from bbcflib.mapseq import get_bam_wig_files
        msurl = self.globals.get('hts_mapseq', {}).get('url', '')
        scpath = self.globals.get('script_path', '')
        self.job = get_bam_wig_files(ex, self.job,
                                     minilims=self.mapseq_minilims,
                                     hts_url=msurl,
                                     suffix=self.suffix,
                                     script_path=scpath,
                                     fetch_unmapped=self.main_args.get('unmapped', False),
                                     via=self.opts.via)
        return True

    def gdv_create(self, ex):
        """Fetch the job's GDV project, creating it when the lookup fails.

        The project description is logged, pickled into the execution and
        stored in ``self.job.options['gdv_project']``. Returns True.
        """
        from bbcflib import gdv
        project = gdv.get_project(mail=self.globals['gdv']['email'],
                                  key=self.globals['gdv']['key'],
                                  project_key=self.job.options['gdv_key'])
        if 'error' in project:
            self.log_write("Creating GDV project.")
            project = gdv.new_project(self.globals['gdv']['email'],
                                      self.globals['gdv']['key'],
                                      self.job.description,
                                      self.job.assembly.id,
                                      self.globals['gdv']['url'])
        self.debug_write("\nGDV project: " + json.dumps(project))
        add_pickle(ex, project,
                   description=set_file_descr("gdv_json", step='gdv', type='py', view='admin'))
        self.job.options['gdv_project'] = project
        return True

    def gdv_upload(self, files):
        """Upload sql files as GDV tracks and return the project link.

        ``files`` maps a file key to a description. Each key is appended to
        the module's download URL; each description, cut at its first
        ``.sql``, becomes the track name. A failing upload is only written
        to the debug log.

        Returns a one-entry dict ``{project_url: 'GDV view'}``.
        """
        # Imported locally, as gdv_create does, so the name is always bound.
        from bbcflib import gdv
        glg = self.globals['gdv']
        project = self.job.options['gdv_project']['project']
        download_url = normalize_url(self.globals['hts_' + self.name]['download'])
        urls_names = dict(("%s/%s" % (download_url, k), re.sub(r'\.sql.*', '', str(f)))
                          for k, f in files.iteritems())
        self.log_write("Uploading GDV tracks:\n" + " ".join(urls_names.keys())
                       + "\n" + " ".join(urls_names.values()))
        try:
            tr = gdv.multiple_tracks(mail=glg['email'], key=glg['key'],
                                     serv_url=glg['url'],
                                     project_id=project['id'],
                                     extensions=['sql'] * len(urls_names),
                                     urls=urls_names.keys(),
                                     names=urls_names.values(),
                                     force=True)
            self.debug_write("GDV Tracks Status\n" + "\n".join([str(v) for v in tr]))
        except Exception as err:
            # Best effort: a failed upload must not abort the job.
            self.debug_write("GDV Tracks Failed: %s" % err)
        gdv_project_url = "%s/public/project?k=%s&id=%s" % (
            normalize_url(glg['url']), project['download_key'], project['id'])
        return {gdv_project_url: 'GDV view'}
def main(argv = None): """ Entry point when program start """ genrep = None assembly = None lims = None job = None config = None config_file = None background = "" matrix = "" original_sql_data = "" random_sql_data = "" track_filtered = "" track_scanned = "" project = "" username = "" identity_file = "" host = "" website = "" remote_path = "" result_path = "" track_regions_path = "" via = "" limspath = "" fdr = 0 runs = {} logging.basicConfig(filename='run_scanning.log',level=logging.INFO) if argv is None: argv = sys.argv try: try: opts, args = getopt.getopt ( argv[1:],"hu:c:" , [ "help", "via = ", "host = " , "remote_path = " , "website = " , "minilims = ","config = " , "matrix = ", "username = "******"identity_file = ", "project = " ] ) except getopt.error, msg: raise Usage(msg) for option, value in opts: if option in ("-h", "--help"): print __doc__ print USAGE sys.exit(0) elif option == "--via": if value == "local": via = "local" elif value == "lsf": via = "lsf" else: raise Usage("Via (-u) can only be \"local\" or \"lsf\", got %s." 
% (value,)) elif option == "--website": website = normalize_url(value) elif option == "--minilims": limspath = normcase(expanduser(value)) elif option == "--host": host = value elif option == "--identity_file": identity_file = value elif option == "--remote_path": remote_path = normcase(expanduser(value)) if not remote_path.endswith(sep): remote_path += sep elif option == "--matrix": matrix = {basename(value):normcase(expanduser(value))} elif option == "--username": username = value elif option == "--project": project = value elif option in ("-c", "--config"): config_file = normcase(expanduser(value)) else: raise Usage("Unhandled option: " + option) # read config file if config_file is None or not exists(config_file) or not isfile(config_file): raise Usage("Config file missing") else: job, config = parseConfig(normcase(expanduser(config_file))) if project == "": project = job.description if matrix == "": if "matrix" in job.options: path = normcase(expanduser(job.options["matrix"])) matrix = {basename(path): path} else: raise Usage("You need give value matrix file ") if limspath == "": if "minilims" in job.options: limspath = job.options["minilims"] else: raise Usage("You need give value minilims path/name") if via == "": if "via" in job.options: via = job.options["via"] else: via = "lsf" if host == "" and "host" in job.options: host = job.options["host"] if identity_file == "" and "identity_file" in job.options: identity_file = job.options["identity_file"] if remote_path == "" and "remote_path" in job.options: remote_path = job.options["remote_path"] if username == "" and "username" in job.options: username = job.options["username"] if website == "" and "website" in job.options: website = job.options["website"] genrep = GenRep(config = config) assembly = genrep.assembly(job.assembly_id) lims = MiniLIMS(limspath) json = create_gdv_project( config["gdv"]["key"], config["gdv"]["email"], project, assembly.nr_assembly_id, config["gdv"]["url"], public = True ) project_id 
= get_project_id( json ) # compute false discovery rate with execution(lims, description = job.description) as ex: background = genrep.statistics ( assembly, output = unique_filename_in(), frequency = True, matrix_format = True ) if len(job.groups) >2: raise ValueError("They are more than 2 group in config file") for group_number in job.groups: group = job.groups[group_number] for run_number in group["runs"]: run_iter = job.groups[group_number]["runs"][run_number] if "url" in run_iter: url = run_iter["url"] uri = "" if run_iter["run"] not in runs: runs[run_iter["run"]] = {"name":None, "control":None, "experimental":None} if url.startswith("http") or url.startswith("www."): url = normalize_url(url) # download data data = urllib2.urlopen(url) uri = unique_filename_in() with open(uri, "w") as opening_file: opening_file.write(data.read()) else: uri = normcase(expanduser(url)) if group["control"]: runs[run_iter["run"]]["control"] = uri runs[run_iter["run"]]["name"] = basename(uri) else: runs[run_iter["run"]]["experimental"] = uri for run in runs: current_run = runs[run] original_sql_data = unique_filename_in() random_sql_data = unique_filename_in() track_filtered = unique_filename_in() logging.info( "[%s]" % job.description ) logging.info( "alias %s => %s" % (current_run["experimental"], track_filtered) ) # convert data to sql with Track(current_run["experimental"], chrmeta = assembly.chromosomes) as track: # Get sqlite file if is not arleady in this format if track.format != "sql" or track.format != "db" or track.format != "sqlite": track.convert(original_sql_data, format = "sql") else: original_sql_data = current_run["experimental"] # Generate a random population from orginal if it is not give from config file if current_run["control"] is None: # create random track track.shuffle_track(random_sql_data, repeat_number = 5) else: with Track(current_run["control"], chrmeta = assembly.chromosomes) as track_random: # Get sqlite file if is not arleady in this format if 
track_random.format != "sql" or \ track_random.format != "db" or \ track_random.format != "sqlite": track_random.convert(random_sql_data, format = "sql") else: random_sql_data = current_run["control"] track_scanned, fdr, p_value = sqlite_to_false_discovery_rate( ex, matrix, background, genrep, assembly.chromosomes, original_sql_data, random_sql_data, threshold = -100, via = via, keep_max_only = False, alpha = 0.05, nb_sample = 5.0 ) # filter track with fdr as treshold with new(track_filtered, format = "sql", datatype = "qualitative") as track_out: chromosome_used = {} track_out.meta_track = {"source": basename(current_run["experimental"])} track_out.meta_track.update({"k":"v"}) with Track(track_scanned, format = "sql", chrmeta = assembly.chromosomes) as track_in: meta = dict([(v["name"], dict([("length", v["length"])])) for v in track_in.chrmeta.values()]) for chromosome in track_in.all_chrs: data_list = [] for data in track_in.read ( {"chr": chromosome, "score": (fdr, sys.maxsize)}, fields = Track.qualitative_fields ): data_list.append(data) chromosome_used[chromosome] = meta[chromosome] if len(data_list) > 0: track_out.write(chromosome, data_list) track_out.chrmeta = chromosome_used ex.add(track_filtered, "sql: filtred %s" % track_filtered) logging.info( "scanned: %s" % track_scanned ) logging.info( "score selected: %f with p: %.3f" % (fdr, p_value) ) logging.info( "filtred: %s" % track_filtered ) # fix track track_scanned_signal = fix_sqlite_db(track_scanned) logging.info( "scanned signal: %s" % track_scanned_signal ) ex.add(track_scanned_signal, description="%s: sql track signal %s" % (job.description, track_scanned_signal)) # send filtred track and scanned track to remote if host != "" and remote_path != "" and username != "": args = [] if identity_file != "": args = ["-i", normcase(expanduser(identity_file)), "-C" ] source_filtred = normcase(expanduser(track_filtered)) source_scanned = normcase(expanduser(track_scanned_signal)) result_destination = 
"%s@%s:%s%s%s.db" % (username, host, remote_path, sep, track_filtered) result_path = "%s%s%s.db" % (website, sep, track_filtered) track_regions_destination = "%s@%s:%s%s%s.db" % (username, host, remote_path, sep, track_scanned_signal) track_regions_path = "%s%s%s.db" % (website, sep, track_scanned_signal) scp(ex, source_filtred, result_destination, args = args) scp(ex, source_scanned, track_regions_destination, args = args) else: result_path = track_filtered # Send to GDV filtred track add_gdv_track ( config["gdv"]["key"], config["gdv"]["email"], project_id, result_path, name = "filtred_%s" % (splitext( basename( current_run["experimental"] ) )[0]), gdv_url = config["gdv"]["url"] ) # Send to GDV scanned track add_gdv_track ( config["gdv"]["key"], config["gdv"]["email"], project_id, track_regions_path, name = "regions_%s" % (splitext( basename( current_run["experimental"] ) )[0]), gdv_url = config["gdv"]["url"] ) logging.info( "++++++++++++") logging.info( "-------------------END--------------------")