def setUp(self):
    """Set up the paired-end readset fixture and a 'test_newstep' pipeline.

    Points ``templated.template_dir`` at the fixture templates, initializes
    RnaseqGlobals in testing mode, and keeps the first readset loaded from
    ``paired1.syml`` together with a pipeline built on it.
    """
    fixture_templates = os.path.abspath(__file__) + "/../../fixtures/templates"
    templated.template_dir = os.path.normpath(fixture_templates)
    RnaseqGlobals.initialize(__file__, testing=True)

    readset_file = RnaseqGlobals.root_dir() + '/t/fixtures/readsets/paired1.syml'
    loaded = Readset.load(filename=readset_file)
    self.readset = loaded[0]
    self.pipeline = Pipeline(name='test_newstep', readset=self.readset)
def get_pipeline(self, **kwargs):
    """Return the pipeline called kwargs['name'], attached to kwargs['readset'].

    A pipeline already stored in the database is reused unless the
    'use_template' config value is set; otherwise a new pipeline is built
    via ``Pipeline(...).load()`` and stored (replacing the stored one when
    it exists).  ``set_defaults()`` is called on the result before it is
    returned.
    """
    session = RnaseqGlobals.get_session()
    use_template = RnaseqGlobals.conf_value('use_template')

    name = kwargs['name']
    assert(name)
    readset = kwargs['readset']
    assert(readset)

    db_pipeline = session.query(Pipeline).filter_by(name=name).first()
    found = db_pipeline != None

    if found and not use_template:
        # reuse the stored pipeline, just attach the requested readset
        pipeline = db_pipeline
        pipeline.readset = readset
    else:
        # build pipeline using template
        t_pipeline = Pipeline(name=name, readset=readset).load()
        if found:
            # replace existing pipeline with the newly generated one:
            assert(db_pipeline.id != None)
            session.delete(db_pipeline)
            session.commit()
        else:
            t_pipeline.template_file()  # sets pipeline.path
        session.add(t_pipeline)
        session.commit()
        pipeline = t_pipeline

    assert(hasattr(pipeline, 'readset'))
    pipeline.set_defaults()
    return pipeline
def setUp(self):
    """Load the 'readset' object from the test database.

    Raises ProgrammerGoof when the readset's table does not exist.
    """
    RnaseqGlobals.initialize("")
    root_dir = RnaseqGlobals.conf_value('rnaseq', 'root_dir')
    test_db = RnaseqGlobals.conf_value('testing', 'test_db')
    self.db_file = os.path.join(root_dir, test_db)

    self.readset = Readset(name='readset', db_file=self.db_file).load()
    if self.readset.table_exists():
        return
    raise ProgrammerGoof("table %s doesn't exist" % self.readset.tablename())
def setUp(self):
    """Load the 'readset' object from the test database and execute ``sql``.

    NOTE(review): ``sql`` is not assigned anywhere in this method;
    presumably it is a module-level name -- confirm, otherwise the
    ``execute`` call raises NameError.
    """
    RnaseqGlobals.initialize("testing: " + __file__)
    root_dir = RnaseqGlobals.conf_value('rnaseq', 'root_dir')
    test_db = RnaseqGlobals.conf_value('testing', 'test_db')
    self.db_file = os.path.join(root_dir, test_db)

    self.readset = Readset(name='readset', db_file=self.db_file).load()
    self.readset.execute(sql)
    print("table %s created" % self.readset.tablename())
def setUp(self):
    """Set up the readset1 fixture, a 'juan' pipeline, and an empty pipeline table.

    After loading the fixtures, every Pipeline row found in the session is
    deleted and the deletion committed.
    """
    fixture_templates = os.path.abspath(__file__) + "/../../fixtures/templates"
    templated.template_dir = os.path.normpath(fixture_templates)
    RnaseqGlobals.initialize(__file__, testing=True)

    readset_file = os.path.join(
        RnaseqGlobals.root_dir(), 't', 'fixtures', 'readsets', 'readset1.syml')
    self.readset = Readset.load(readset_file)[0]
    self.pipeline = Pipeline(name='juan', readset=self.readset).load_steps()

    session = RnaseqGlobals.get_session()
    for stale in session.query(Pipeline).all():
        session.delete(stale)
    session.commit()
def make_run_objects(self, session):
    """Create and store the run-tracking records for one pipeline run.

    Builds a PipelineRun for this pipeline's readset, merges the pipeline
    into ``session`` and commits, then builds one StepRun (with a
    FileOutput per step output) for every step that is not a provenance
    step.  Steps flagged ``skip`` are recorded as 'skipped' and successful.

    Returns:
        (pipeline_run, step_runs), where step_runs maps step name -> StepRun.

    Raises:
        UserError: no label in the config and none on the readset.
        ProgrammerGoof: the pipeline run has no id after the commit.
    """
    # Debug output is switched on by a non-empty DEBUG environment variable.
    # (Previously 'debug' was only bound when DEBUG was *unset*, so the
    # 'if debug' test below raised NameError whenever DEBUG was set.)
    debug = os.environ.get('DEBUG', False)

    # create the pipeline_run object:
    try:
        label = RnaseqGlobals.conf_value('label') or self.readset.label
    except AttributeError:
        raise UserError("No label defined. Please specify a label for the pipeline run, either in the readset or using the '--label' command line option")

    pipeline_run = PipelineRun(status='standby',
                               input_file=', '.join(self.readset.reads_files),
                               user=RnaseqGlobals.conf_value('user'),
                               label=label,
                               working_dir=self.readset.working_dir)
    self.pipeline_runs.append(pipeline_run)

    # merge() hands back the session-attached pipeline; everything below
    # works on that instance.
    self = session.merge(self)
    session.commit()
    if pipeline_run.id is None:
        raise ProgrammerGoof("no id in %s" % pipeline_run)

    self.context.pipeline_run_id = pipeline_run.id
    RnaseqGlobals.set_conf_value('pipeline_run_id', pipeline_run.id)

    # create step_run objects:
    step_runs = {}
    for step in self.steps:
        if step.is_prov_step:
            continue

        step_run = StepRun(step_name=step.name, status='standby')
        for output in step.output_list():
            output = evoque_template(output, step, self.readset)
            step_run.file_outputs.append(FileOutput(path=output))

        if step.skip:  # as set by self.set_steps_current()
            if debug:
                print("step %s is current, skipping" % step.name)
            step_run.status = 'skipped'
            step_run.success = True

        # Append exactly once (the same object used to be appended twice);
        # maintains list in db as well.
        pipeline_run.step_runs.append(step_run)
        session.commit()

        step_runs[step.name] = step_run
        self.context.step_runs[step.name] = step_run

    session.commit()
    return (pipeline_run, step_runs)
def verify_exes(self):
    """Check that every step's executable can be found.

    Calls ``verify_exe()`` on each step in ``self.steps``.

    Returns:
        A list of error strings: one per step whose executable is missing,
        followed by a single hint line when at least one is missing.  The
        list is empty when all steps verify.
    """
    # The search path is computed inside step.verify_exe(); the list that
    # used to be built here was never used, so it has been removed.
    errors = ["Missing executable in step %s: %s" % (step.name, step.exe)
              for step in self.steps
              if not step.verify_exe()]
    if errors:
        errors.append("Please link these executables from the %s/programs directory, or make sure they are on the path defined in the config file."
                      % RnaseqGlobals.conf_value('rnaseq', 'root_dir'))
    return errors
def setUp(self):
    """Load the 'readset' object from the test database and recreate its table.

    The readset's table is dropped if present and created again with the
    columns id, name and description.
    """
    RnaseqGlobals.initialize("")
    root_dir = RnaseqGlobals.conf_value('rnaseq', 'root_dir')
    test_db = RnaseqGlobals.conf_value('testing', 'test_db')
    self.db_file = os.path.join(root_dir, test_db)

    self.readset = Readset(name='readset', db_file=self.db_file).load()
    statements = (
        "DROP TABLE IF EXISTS %s",
        "CREATE TABLE %s (id INTEGER PRIMARY KEY AUTOINCREMENT, name VARCHAR[255], description TEXT)",
    )
    for statement in statements:
        self.readset.execute(statement % self.readset.tablename())
def __init__(self, **args):
    """Initialize the dict_like base, pick the database file, and connect.

    ``db_file`` is taken from the keyword arguments when given; otherwise
    it is the 'db'/'db_name' config value joined onto 'rnaseq'/'root_dir'.

    Raises ProgrammerGoof when either config value is None.
    """
    dict_like.__init__(self, **args)

    if 'db_file' in args:
        self.db_file = args['db_file']
    else:
        root_dir = RnaseqGlobals.conf_value('rnaseq', 'root_dir')
        db_name = RnaseqGlobals.conf_value('db', 'db_name')
        if root_dir == None or db_name == None:
            raise ProgrammerGoof("RnaseqGlobals not initialized")
        self.db_file = os.path.join(root_dir, db_name)

    self.connect()  # should this really be called in the constructor?
    self.cursor = self.dbh.cursor()
    assert(self.columns)
def setUp(self):
    """Set up the readset1 fixture, a 'juan' pipeline, and an empty pipeline table."""
    RnaseqGlobals.initialize(__file__, testing=True)
    templated.template_dir = RnaseqGlobals.root_dir() + "/t/fixtures/templates"

    readset_file = RnaseqGlobals.root_dir() + "/t/fixtures/readsets/readset1.syml"
    self.readset = Readset.load(readset_file)[0]
    self.pipeline = Pipeline(name='juan', readset=self.readset).load_steps()

    # delete all pre-existing pipeline objects from the db:
    session = RnaseqGlobals.get_session()
    for stale in session.query(Pipeline):
        session.delete(stale)
    session.commit()
def verify_exe(self):
    """Report whether this step's executable can be located.

    Returns True when the step has no ``exe`` attribute, or when ``exe`` is
    found as an executable in the configured path or the programs
    directory.  If it is not, but the step has an ``interpreter``, the
    result is whether the interpreter is executable and ``exe`` is
    readable.  Otherwise returns False.
    """
    if not hasattr(self, 'exe'):
        return True

    search_dirs = RnaseqGlobals.conf_value('rnaseq', 'path').split(":")
    search_dirs.append(os.path.join(RnaseqGlobals.root_dir(), 'programs'))

    if exists_on_path(self.exe, search_dirs, os.X_OK):
        return True

    if not hasattr(self, 'interpreter'):
        # couldn't find self.exe, no self.interpreter:
        return False

    # didn't find executable directly, but there's an interpreter:
    return (exists_on_path(self.interpreter, search_dirs, os.X_OK)
            and exists_on_path(self.exe, search_dirs, os.R_OK))
def usage(self, context):
    """Return the shell-script template text for the configured aligner.

    For ``self.aligner == 'bowtie'`` the template exports BOWTIE_INDEXES
    (from the 'rnaseq'/'bowtie_indexes' config value) and runs bowtie, with
    a two-input form for paired-end data and a one-input form otherwise.
    ``context`` is not used here.

    Raises:
        ProgrammerGoof: aligner is 'blat' (not yet implemented).
        ConfigError: aligner is anything else.

    NOTE(review): the triple-quoted templates below are shown on a single
    line; they presumably contained line breaks originally (the export and
    the bowtie command read as separate shell lines) -- confirm against the
    original file before relying on them.
    """
    if self.aligner=='bowtie':
        bowtie_index=RnaseqGlobals.conf_value('rnaseq','bowtie_indexes')
        if self.paired_end():
            # paired-end: both inputs are passed via -1/-2
            script=''' export BOWTIE_INDEXES=%(bowtie_index)s bowtie ${ewbt} -1 ${inputs[0]} -2 ${inputs[1]} ${args} | perl -lane 'print unless($$F[1] == 4)' > $${ID}.${name}_BAD.$${format} ''' % {'bowtie_index': bowtie_index}
        else:
            # single-end: one input after the args
            script=''' export BOWTIE_INDEXES=%(bowtie_index)s bowtie ${ewbt} ${args} ${inputs[0]} | perl -lane 'print unless($$F[1] == 4)' > $${ID}.${name}_BAD.$${format} ''' % {'bowtie_index': bowtie_index}
        # assigned but never read within this method
        restore_indent=True
    elif self.aligner=='blat':
        # fixme: need to implement this (NYI)
        raise ProgrammerGoof("step %s doesn't work for aligner==blat yet (NYI)" % self.name)
    else:
        raise ConfigError("Unknown alignment program '%s'" % self.aligner)
    return script
def sh_cmdline(self):
    """Return this step's command line: ``self['usage']`` %-formatted with self.

    A missing or None 'usage' is treated as the empty string.  When the
    step has an 'exe' that is not an absolute path, ``self['exe']`` is
    rewritten in place to live under ``<root_dir>/programs`` before
    formatting.

    Raises:
        ConfigError: the usage string refers to a missing value, is
            malformed, or cannot be formatted with this step's values.
    """
    try:
        usage = self['usage']
    except KeyError:
        usage = ''
    if usage is None:
        usage = ''

    # look for exe in the programs directory, unless exe is an absolute path
    try:
        if os.path.abspath(self['exe']) != self['exe']:
            self['exe'] = os.path.join(RnaseqGlobals.conf_value('rnaseq', 'root_dir'),
                                       'programs', self['exe'])
    except KeyError:
        # not all steps have self['exe']; eg header, footer
        pass

    try:
        return usage % self
    except (KeyError, AttributeError) as e:
        raise ConfigError("Missing value %s in\n%s" % (e.args, self.name))
    except ValueError as e:
        warn(e)
        warn("%s.usage: %s" % (self.name, usage))
        # A bare string cannot be raised; report through ConfigError like
        # the other handlers do.
        raise ConfigError("%s.keys(): %s" % (self.name, ", ".join(self.__dict__.keys())))
    except TypeError as te:
        raise ConfigError("step %s: usage='%s': %s" % (self.name, usage, te))
def sh_script(self, **kwargs):
    """Render this object's 'sh_template' and return the resulting script.

    Returns None when ``self.dict`` has no 'sh_template' entry.  The
    template variables are this object's items, ``self.dict``, a handful of
    pipeline/readset/config values, and finally ``kwargs`` (which win).

    Raises ConfigError when rendering hits a NameError.
    """
    if 'sh_template' not in self.dict:
        return None

    template_dir = os.path.join(RnaseqGlobals.conf_value('rnaseq', 'root_dir'),
                                "templates", "sh_template")
    domain = Domain(template_dir, errors=4)
    template = domain.get_template(self['sh_template'])

    tmpl_vars = {}
    tmpl_vars.update(self)
    tmpl_vars.update(self.dict)
    tmpl_vars['readset'] = self.pipeline.readset  # fixme: really?
    tmpl_vars['sh_cmd'] = self.sh_cmdline()
    tmpl_vars['config'] = RnaseqGlobals.config
    tmpl_vars['pipeline'] = self.pipeline
    tmpl_vars['ID'] = self.pipeline.ID()
    tmpl_vars.update(kwargs)

    try:
        return template.evoque(tmpl_vars)
    except NameError as ne:
        raise ConfigError("%s while processing step '%s'" % (ne, self.name))
def is_current(self):
    """Tell whether this step's outputs are newer than everything they depend on.

    Returns False when ``self.force`` is set, or when any input or output
    cannot be stat'ed.  Otherwise returns True only if the newest
    modification time among the inputs and the step's executable (under
    ``<root_dir>/programs``) is older than the oldest output.

    Raises ConfigError when the executable cannot be stat'ed.
    """
    if self.force:
        return False

    newest_input = 0
    oldest_output = time.time()

    for in_path in self.inputs():
        try:
            in_mtime = os.stat(in_path).st_mtime
        except OSError:
            return False  # missing/unaccessible inputs constitute not being current
        newest_input = max(newest_input, in_mtime)

    try:
        exe_file = os.path.join(RnaseqGlobals.conf_value('rnaseq', 'root_dir'),
                                'programs', self['exe'])
        newest_input = max(newest_input, os.stat(exe_file).st_mtime)
    except OSError as oe:
        raise ConfigError("%s: %s" % (exe_file, oe))

    for out_path in self.outputs():
        try:
            out_mtime = os.stat(out_path).st_mtime
        except OSError:
            return False  # missing/unaccessible outputs definitely constitute not being current
        oldest_output = min(oldest_output, out_mtime)

    return newest_input < oldest_output
def sh_script(self, context, **args):
    """Build the shell-script fragment for this step.

    The step's usage text (from ``self.usage(context)``) is run through
    ``evoque_template`` with a variable dict assembled from the step, its
    pipeline, the readset and the global config.  When ``args['echo_name']``
    is truthy, a comment line and an ``echo`` naming the step are put in
    front of it.

    Returns:
        The echo part and the rendered usage joined by a newline.

    Raises:
        ConfigError: ``usage()`` raised KeyError, required attributes are
            missing, or rendering the template failed.
    """
    if 'echo_name' in args and args['echo_name']:
        echo_part="\n# step %s:\n" % self.name
        echo_part+="echo step %s 1>&2" % self.name
    else:
        echo_part=''

    try:
        usage=self.usage(context)
    except KeyError as ke:
        raise ConfigError("missing config item '%s'" % ke)

    # check for missing attrs after calling step.usage()
    missing_attrs=self.missing_required_attrs()
    if len(missing_attrs) > 0:
        raise ConfigError("step %s: missing attributes: %s" % (self.name, ', '.join(missing_attrs)))

    # Later entries override earlier ones: step attributes, then readset
    # items, then the pipeline's entry for this step, then the fixed keys.
    vars={}
    vars.update(self.__dict__)
    vars.update(self.pipeline.readset)
    # NOTE(review): only this update is taken to be conditional on
    # is_prov_step; confirm that inputs/outputs below are meant to be set
    # for provenance steps too.
    if not self.is_prov_step:
        vars.update(self.pipeline[self.name])
    vars['inputs']=context.inputs[self.name]
    vars['outputs']=context.outputs[self.name]
    vars['pipeline']=self.pipeline
    vars['pipeline_run_id']=context.pipeline_run_id
    #vars['step_run_id']=context.step_runs[self.name].id
    #vars['next_step_run_id']=context.step_runs[self.pipeline.step_after(step.name)].id
    vars['config']=RnaseqGlobals.config
    vars['readset']=self.pipeline.readset

    # need to add shell variables for 'set': (in cufflinks.s_?.sh scripts)
    # currently root_dir, programs, reads_file, ID, format, readlen
    # but really, the pipeline should specify these?
    # or only things that are truly universal
    vars['root_dir']=RnaseqGlobals.root_dir()

    # add readset exports: an export the readset lacks becomes ''
    readset=self.pipeline.readset
    for attr in readset.exports:
        try:
            vars[attr]=getattr(readset, attr)
        except AttributeError:
            vars[attr]=''
            #warn("%s.sh_script: no '%s' readset attribute!" % (self.name, attr))

    # add self.exports: (a step without an exports attribute exports nothing)
    try:
        export_list=self.exports
    except:
        export_list=[]
    for attr in export_list:
        vars[attr]=getattr(self,attr)

    try:
        script_part=evoque_template(usage, vars)
    except Exception as e:
        raise ConfigError("step %s: %s" % (self.name, e))

    script="\n".join([echo_part,script_part]) # tried using echo_part+sh_script, got weird '>' -> '>' substitutions
    return script
def check_label_unique(self, session, label):
    """Make sure no stored PipelineRun already uses ``label``.

    When a run with the same label exists and the 'force' config value is
    set, that run is deleted and the deletion committed.

    Raises:
        UserError: the label is in use and 'force' is not set.
    """
    # filter_by is the SQLAlchemy spelling (as used elsewhere in this
    # file); 'filterBy' raised AttributeError on every call.
    other_pr = session.query(PipelineRun).filter_by(label=label).first()
    if not other_pr:
        return

    if RnaseqGlobals.conf_value('force'):
        session.delete(other_pr)  # delete existing run, will get over written
        session.commit()
    else:
        raise UserError("The label '%s' is already in use.\n Please provide a new label (either in the readset or by use of the '--label' command line option), or use the '--force' option to fully override the old pipeline run. \n This will cause all steps to be run, also." % label)
def setUp(self):
    """Set up a 'filter' pipeline on the readset1 fixture with force and silent on."""
    fixture_templates = os.path.abspath(__file__) + "/../../fixtures/templates"
    templated.template_dir = os.path.normpath(fixture_templates)
    RnaseqGlobals.initialize(__file__, testing=True)
    for option in ('force', 'silent'):
        RnaseqGlobals.set_conf_value(option, True)

    readset_file = RnaseqGlobals.root_dir() + '/t/fixtures/readsets/readset1.syml'
    first_readset = Readset.load(readset_file)[0]
    self.pipeline = Pipeline(name='filter', readset=first_readset)
def set_ID(self, *ID):
    """Set ``self.ID`` (a path prefix) and ``self.id`` (its basename).

    The ID comes from, in order of preference:
      * the first positional argument, if given;
      * an already existing ``self.ID``;
      * the reads file name(s): for a single file, its basename with
        everything from the first dot removed; for two paired-end files,
        the part of the first file's basename before ``_1.<ext>`` /
        ``_2.<ext>``.
    A relative ID is placed under ``self.working_dir``.

    Returns:
        self.  When the ID cannot be derived (other file counts), self is
        returned without ``ID``/``id`` being set.

    Raises:
        ConfigError: paired-end file name not of the form '_[12].<ext>'.
    """
    # assign self.ID from ID[0], which might not be there:
    if ID:
        self.ID = ID[0]

    try:
        # an absolute self.ID is left alone; a relative one goes under working_dir
        if not os.path.isabs(self.ID):
            self.ID = os.path.join(self.working_dir, self.ID)
    except AttributeError:
        # self.ID didn't exist: derive it from the reads file name(s)
        if len(self.reads_files) == 1:
            # Strip the extension(s) from the file name only.  Running the
            # substitution over the joined path cut the ID at the first
            # dot of working_dir (e.g. './work' or '/data/v1.2').
            stem = re.sub(r'\..*$', '', os.path.basename(self.reads_file))
            self.ID = os.path.join(self.working_dir, stem)

        elif len(self.reads_files) == 2 and self.paired_end:
            # check that file names are of proper form (works off self.reads_files[0]):
            mg = re.search(r'^(.*)_[12]\.[\w_]+$', os.path.basename(self.reads_files[0]))
            error_msg = "'%s' isn't a well-formed filename for paired_end data: must match '_[12].<ext>'" % self.reads_files[0]
            try:
                self.ID = os.path.join(self.working_dir, mg.groups()[0])
            except (IndexError, AttributeError):
                # mg is None when the name didn't match
                raise ConfigError(error_msg)

        else:
            if RnaseqGlobals.conf_value('verbose') or RnaseqGlobals.conf_value('debug'):
                message = "Cannot set ID: too many files (%d), paired_end=%s" % (len(self.reads_files), self.paired_end)
                sys.stderr.write(message + "\n")
            return self

    self.id = os.path.basename(self.ID)
    self['id'] = self.id
    return self
def write_sh_script(self, **kwargs):
    """Write the pipeline's shell script into the readset's working directory.

    The script text comes from ``self.sh_script(**kwargs)`` and the file
    name from ``self.scriptname()``.  The working directory is created if
    needed.  Returns the path of the file written.
    """
    script_text = self.sh_script(**kwargs)
    script_filename = os.path.join(self.readset.working_dir, self.scriptname())

    try:
        os.makedirs(self.readset.working_dir)
    except OSError:
        # already exists, that's ok (fixme: could be permissions error)
        pass

    with open(script_filename, "w") as script_file:
        script_file.write(script_text)

    if RnaseqGlobals.conf_value('verbose'):
        print("%s written" % script_filename)
    return script_filename
def test_list(self):
    """readset12.syml loads as a list of six readsets with the expected fields."""
    readset_file = RnaseqGlobals.root_dir() + '/t/fixtures/readsets/readset12.syml'
    rlist = Readset.load(readset_file)

    self.assertEqual(type(rlist), type([]))
    self.assertEqual(len(rlist), 6)

    for rs in rlist:
        self.assertEqual(rs.org, 'mouse')
        self.assertEqual(rs.readlen, 75)
        self.assertEqual(os.path.dirname(rs.reads_file), self.readset_dir)

        reads_name = os.path.basename(rs.reads_file)
        self.assertTrue(re.match('s_\d\d?_\d_sequence.txt', reads_name))
        # the label is expected to occur in the reads file name
        self.assertTrue(re.search(rs.label, os.path.basename(rs.reads_file)))
        self.assertEqual(rs.label, rs.description)
def test_query(self):
    """A stored pipeline can be queried back by name, with id 1."""
    session = RnaseqGlobals.get_session()
    readset = self.readset
    session.add(self.pipeline)
    session.commit()

    expected = self.pipeline
    matches = session.query(Pipeline).filter_by(name=expected.name).all()
    self.assertEqual(len(matches), 1)

    stored = matches[0]
    for field in ['name', 'description']:
        self.assertEqual(getattr(stored, field), getattr(expected, field))
    self.assertEqual(stored.id, 1)
def evoque_fields(self):
    """Expand '${' templates found in this object's string attributes.

    Every attribute whose name does not start with '__', whose type is
    exactly ``str`` and whose value contains '${' is replaced by the result
    of ``evoque_template``; attributes whose expansion raises NameError are
    left as they are.  Returns self.
    """
    # This is the instance's own __dict__, not a copy: the update below
    # also stores the 'rnaseq' config values on the instance.
    tmpl_vars = self.__dict__
    tmpl_vars.update(RnaseqGlobals.conf_value('rnaseq'))

    for attr_name in dir(self):
        if attr_name.startswith('__'):
            continue
        value = getattr(self, attr_name)
        if type(value) == type('') and re.search('\$\{', value):
            try:
                setattr(self, attr_name, evoque_template(value, tmpl_vars))
            except NameError:
                pass
    return self
def run(self, *argv, **args):
    """Create the database table of every class in ``self.classes``.

    Each class's ``create_table`` is called with the global metadata and
    engine, and a line is printed per table.

    Raises UserError for a class that does not define ``create_table``.
    """
    session = RnaseqGlobals.get_session()
    for klass in self.classes:
        # Drop the table first. Don't try to use Table.drop(), it's a pain if the table doesn't already exist.
        # engine.execute("DROP TABLE IF EXISTS %s" % tablename)
        try:
            create_table = klass.create_table
        except AttributeError:
            raise UserError("%s doesn't define 'create_table'" % klass.__name__)
        create_table(RnaseqGlobals.metadata, RnaseqGlobals.engine)
        print("%s created" % klass.__tablename__)
def test_glob_rel(self):
    """A readset with a relative glob expands to the expected export files."""
    readset_file = RnaseqGlobals.root_dir() + '/t/fixtures/readsets/readset_rel_glob.syml'
    rlist = Readset.load(readset_file)
    self.assertEqual(len(rlist), 1)

    filelist = rlist[0].reads_files
    self.assertEqual(len(filelist), 3)

    # NOTE(review): range(1,3) looks for files 1 and 2 only, although three
    # files are asserted above -- confirm whether file 3 should be checked.
    for i in range(1, 3):
        filename = "s_%d_export.txt" % i
        found = any(re.search(filename, f) for f in filelist)
        self.assertTrue(found)
def test_pipeline_id(self):
    """A pipeline gets id 1 once committed, and keeps it through merge()."""
    pipeline = self.pipeline
    try:
        pipeline.id
    except AttributeError:
        # having no id before being stored is acceptable
        self.assertTrue(True)

    session = RnaseqGlobals.get_session()
    session.add(pipeline)
    session.commit()
    session.flush()
    self.assertEqual(pipeline.id, 1)

    merged = session.merge(pipeline)
    self.assertEqual(merged.id, 1)
def store_db(self):
    """Store this pipeline unless one with the same name is already stored.

    Returns self after adding and committing it, or -- when a pipeline of
    the same name is found -- the stored pipeline (after copying its id
    onto self).
    """
    session = RnaseqGlobals.get_session()
    stored = session.query(Pipeline).filter_by(name=self.name).first()

    if stored == None:
        session.add(self)
        session.commit()
        return self

    self.id = stored.id
    return stored
def test_dict(self):
    """A loaded readset supports dict-style reads and update()."""
    fixture_dir = RnaseqGlobals.root_dir() + '/t/fixtures/readsets'
    os.chdir(fixture_dir)

    rlist = Readset.load(os.path.join(fixture_dir, 'readset_rel_glob.syml'))
    readset = rlist[0]

    self.assertRegexpMatches(readset['reads_file'], fixture_dir + '/s_\d_export.txt')
    expected_items = [
        ('description', 'this is a sample readset (fixture)'),
        ('org', 'mouse'),
        ('readlen', 75),
        ('working_dir', os.path.join(fixture_dir, 'rnaseq_wf')),
    ]
    for key, expected in expected_items:
        self.assertEqual(readset[key], expected)

    readset.update({'this': 'that'})
    self.assertEqual(readset['this'], 'that')
def run(self, *argv, **args):
    """Instantiate the class named on the command line and store the object.

    ``argv[0]`` is the argument list; its third element names the class
    (looked up in this module's globals), and every element starting with
    ``key=value`` contributes an entry to the dict passed to the class.

    Raises UserError when the class name is missing or unknown.
    """
    try:
        classname = argv[0][2]  # [0] is script name, [1] is command
    except IndexError:
        raise UserError(self.usage())

    try:
        klass = globals()[classname]
    except KeyError:
        raise UserError("%s: unknown class" % classname)

    obj_hash = {}
    for pair in [p for p in argv[0] if re.match("\w+=\w+", p)]:
        key, value = re.split("=", pair)
        obj_hash[key] = value

    new_object = klass(obj_hash)
    session = RnaseqGlobals.get_session()
    session.add(new_object)
    session.commit()
def qsub_script(self, script_filename, out_filename=None, err_filename=None):
    """Render the 'qsub' template for ``script_filename`` and write it to disk.

    Args:
        script_filename: path of the script the qsub job runs (template
            variable 'cmd').
        out_filename: job output file; defaults to ``self.out_filename()``.
        err_filename: job error file; defaults to ``self.err_filename()``.

    Returns:
        The sanitized path of the written file,
        ``<working_dir>/<name>.<label>.qsub``.
    """
    if out_filename is None:
        out_filename = self.out_filename()
    if err_filename is None:
        err_filename = self.err_filename()

    qsub = templated(name='qsub', type='sh_template', suffix='tmpl')

    tmpl_vars = {}
    tmpl_vars.update(self.__dict__)
    tmpl_vars['name'] = path_helpers.sanitize(self.name)
    tmpl_vars['cmd'] = script_filename
    tmpl_vars['out_filename'] = out_filename
    tmpl_vars['err_filename'] = err_filename
    qsub_script = qsub.eval_tmpl(vars=tmpl_vars)

    qsub_script_file = path_helpers.sanitize(
        os.path.join(self.readset.working_dir,
                     "%s.%s.qsub" % (self.name, self.readset.label)))
    # 'with' closes the file even if the write fails
    with open(qsub_script_file, "w") as f:
        f.write(qsub_script)

    if RnaseqGlobals.conf_value('verbose'):
        print("%s written" % qsub_script_file)
    return qsub_script_file