def get_unbound_inputs(cls, cfg):
    """ Get the unbound inputs """
    cfg = cls.load_cfg(cfg)
    dag = cls.create_dag(cfg)
    # Step parameters
    uinputs = defaultdict(dict)
    for stepname, classname in cfg['dag']['nodes'].iteritems():
        step = Step.create(classname)
        input_keys = step.keys('inputs', req_only=True)
        if input_keys:
            for pred in dag.predecessors(stepname):
                # Remove any key that is already bound
                for binding in dag[pred][stepname].get('bindings', []):
                    key = binding.split('.')[1]
                    # it may have already been removed
                    if key in input_keys:
                        input_keys.remove(key)
            if input_keys:
                uinputs[stepname] = input_keys
    return uinputs
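# A minimal, hypothetical sketch of the key-removal logic used above, on plain
# dicts instead of the real config/DAG objects. The step names and keys below
# are invented for illustration; the actual shapes come from load_cfg,
# create_dag and Step.keys. A required input key is "unbound" when no incoming
# edge binding of the form '<step>.<input_key>' covers it.
def _example_unbound_inputs():
    # required input keys per step, as Step.keys('inputs', req_only=True)
    # might report them
    required = {'align': ['fastq_files', 'reference'], 'call': ['bam_files']}
    # incoming edge bindings per step, keyed like '<step>.<input_key>'
    incoming = {'align': {}, 'call': {'call.bam_files': 'align.output_bam'}}
    unbound = {}
    for stepname, keys in required.items():
        bound = [b.split('.')[1] for b in incoming[stepname]]
        remaining = [k for k in keys if k not in bound]
        if remaining:
            unbound[stepname] = remaining
    return unbound  # -> {'align': ['fastq_files', 'reference']}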
def create_steps(cfg):
    """ Instantiate a Step object for every node of the pipeline DAG """
    stepobjs = {}
    if 'sys_path' in cfg:
        sys.path.insert(0, cfg['sys_path'])
    for stepname, classname in cfg['dag']['nodes'].iteritems():
        stepobjs[stepname] = Step.create(classname)
    if 'sys_path' in cfg:
        del sys.path[0]
    return stepobjs
def get_metainfo(self, step_name):
    """ Return a dictionary with generic information about pipeline and step """
    info = {}
    info['pipeline'] = {
        'name': self.name,
        'version': self.__version__
    }
    info['user'] = {
        'login': self.user,
        'fullname': pwd.getpwnam(self.user).pw_gecos
    }
    step_class = self.dag.node[step_name]['class_name']
    stepobj = Step.create(step_class)
    info['step'] = {
        'name': step_name,
        'class': step_class,
        'version': stepobj.__version__
    }
    return info
def get_refgenomes(cls, cfg, unbound=None):
    """
    Return a two-level dictionary containing the paths of the reference
    genomes, grouped by label. A label is a combination of species, version
    and variation:

        {
            "label1": {
                "stepname1": {"input_key1": "/path1"},
                "stepname2": {"input_key1": "/path2"}
            },
            "label2": {
                "stepname1": {"input_key1": "/path3"},
                "stepname2": {"input_key1": "/path4"}
            }
        }

    The "unbound" dictionary contains the steps that have unbound inputs:
    if set, only those steps will be considered.
    """
    refs = defaultdict(dict)
    tools = set()
    # Collect all tools that require a reference genome
    for stepname, classname in cfg['dag']['nodes'].iteritems():
        if unbound is None or stepname in unbound:
            step = Step.create(classname)
            for ref in step.get_refgenome_tools():
                tools.add(ref['tool'])
                refs[stepname][ref['name']] = ref['tool']
    # Get the corresponding reference genomes
    refs_by_label = {}
    for ref in mongo.get_refgenomes(tools):
        label = "%s %s" % (ref['_id']['species'], ref['_id']['version'])
        if 'variation' in ref['_id']:
            label += " (%s)" % ref['_id']['variation']
        for stepname in refs:
            if label not in refs_by_label:
                refs_by_label[label] = {}
            refs_by_label[label][stepname] = {}
            for param_key in refs[stepname]:
                tool = refs[stepname][param_key]
                if tool in ref['paths']:
                    refs_by_label[label][stepname][param_key] = ref['paths'][tool]
    return refs_by_label
def get(self, run_id):
    """ Return the output files of each step of the given run """
    pipeline = db.pipelines.find_one({'run_id': run_id},
                                     {'config': 1, 'file_registry': 1})
    file_registry = pipeline.get('file_registry', [])
    if file_registry:
        file_registry = json.loads(file_registry)
    conf_str = json.loads(pipeline['config'])
    config = Pipeline.load_cfg(conf_str)
    result_steps = config.get('config', {}).get('pipeline', {}).get('results', [])
    delete_steps = config.get('config', {}).get('pipeline', {}).get('delete', [])
    delete_steps.append('finalize')
    delete_steps.append('inputs')
    steps = list(db.steps.find(
        {"run_id": run_id,
         "name": {"$nin": delete_steps},
         "jobs": {"$elemMatch": {"outputs": {"$exists": True}}}},
        {"name": 1, "jobs": 1, "outputs.output_dir": 1, "step_config": 1}))
    outputs = {}
    for step in steps:
        if step.get('step_config', {}):
            s = Step.load_step(step['step_config'])
            output_files = []
            for job_id, job in enumerate(step['jobs']):
                for key in job['outputs']:
                    if key in s.keys(key_groups='outputs',
                                     key_filter={'type': 'file'}):
                        for i, filename in enumerate(job['outputs'][key]):
                            output = {'path': filename}
                            if not isinstance(filename, list):
                                output['archived'] = (filename in file_registry)
                            else:
                                output['archived'] = False
                            output_files.append(output)
            if output_files:
                outputs[step['name']] = defaultdict(list)
                outputs[step['name']]['archive'] = step['name'] in result_steps
                outputs[step['name']]['dir'] = step.get('outputs', {}).get('output_dir')
                outputs[step['name']]['files'] = copy.deepcopy(output_files)
    return outputs
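# Hypothetical example of the structure returned above (step name and paths
# are invented for illustration):
#
#   {
#       'align': {
#           'archive': True,
#           'dir': '/scratch/.../align',
#           'files': [{'path': '/scratch/.../align/sample1.bam',
#                      'archived': False}]
#       }
#   }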
def run_step(self, step_name):
    """ Configure and run a job for the given step """
    # Skip the input step
    if step_name == 'inputs':
        self.completed.append(step_name)
        self.outputs[step_name] = self.cfg['config']['steps'].get(step_name, {})
        self.outputs[step_name]['output_dir'] = ''
        self.db.update_step_status(step_name, JOB_STATUS.RUNNING)
        self.db.update_step_status(step_name, JOB_STATUS.SUCCEEDED)
        self.db.set_step_outputs(step_name, self.outputs[step_name])
    else:
        if self.one_step:
            step_config = self.cfg
            step_config['sys_path'] = self.sys_path
            step_config['output_dir'] = self.output_dir
            step_config['meta'] = {'pipeline': {}, 'step': {}, 'job': {}}
            ut.dict_update(step_config['meta']['pipeline'], self.meta['pipeline'])
        elif step_name == FINAL_STEP:
            step_config = {'meta': {'pipeline': {}, 'step': {}, 'job': {}}}
            ut.dict_update(step_config['meta']['pipeline'], self.meta['pipeline'])
            step_config['name'] = FINAL_STEP
            step_config['step_class'] = self.dag.node[step_name]['class_name']
            step_config['target_dir'] = self.output_dir
            step_config['source_dir'] = self.work_dir
            step_config['output_dir'] = os.path.join(self.work_dir, step_name)
            self.configure_finalstep(step_config)
        else:
            step_config = {'meta': {'pipeline': {}, 'step': {}, 'job': {}}}
            ut.dict_update(step_config['meta']['pipeline'], self.meta['pipeline'])
            step_class = self.dag.node[step_name]['class_name']
            step_config['name'] = step_name
            step_config['sys_path'] = self.sys_path
            step_config['step_class'] = step_class
            step_config['output_dir'] = os.path.join(self.work_dir, step_name)

        # 1. Form input keys
        # Remember: edges are labelled by 'from' keys
        for pred in self.dag.predecessors(step_name):
            edge = self.dag[pred][step_name]
            # Not an actual loop: just get key/value
            for bind_to, bind_from in edge.get('bindings', {}).iteritems():
                to_key = bind_to.split('.')[1]
                if hasattr(bind_from, '__iter__'):
                    for from_key in bind_from:
                        key = from_key.split('.')[1]
                        out = self.outputs[pred][key]
                        if to_key in step_config:
                            if isinstance(step_config[to_key], basestring):
                                step_config[to_key] = [step_config[to_key]]
                            step_config[to_key].extend(out)
                        else:
                            step_config[to_key] = out
                else:
                    from_key = bind_from.split('.')[1]
                    out = self.outputs[pred][from_key]
                    if to_key in step_config:
                        if isinstance(step_config[to_key], basestring):
                            step_config[to_key] = [step_config[to_key]]
                        step_config[to_key].extend(out)
                    else:
                        step_config[to_key] = out
            # Transfer metadata of the previous step to the next step
            for key in self.meta['steps'].get(pred, {}):
                step_config['meta'][key] = self.meta['steps'][pred][key]

        # 2. Form step config.
        if not self.one_step:
            ut.dict_update(step_config,
                           self.cfg['config']['steps'].get(step_name, {}),
                           replace=False)
        if step_name == FINAL_STEP:
            # Final step: pass full pipeline metadata
            step_config['meta'].update(self.meta)
        else:
            self.update_metadata(step_name, step_config[KEY_META])

        # 3. Submit step
        self.log.info('Executing step %s' % str(step_name))
        self.log.debug(' step configuration:\n %s' %
                       ut.format_dict(step_config, indent=4))
        self.log.info(' step %s queued ' % str(step_name))
        self.running[step_name] = Step.load_step(step_config)
        job_counter = self.running[step_name].distribute()
        self.db.start_step(step_name, step_config, job_counter)
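# A minimal, hypothetical sketch of the binding resolution performed in
# "1. Form input keys" above, on plain dicts (names and paths are invented;
# the real values come from the DAG edges and self.outputs). Each binding maps
# '<this_step>.<input_key>' to one or more '<pred_step>.<output_key>' entries,
# and predecessor outputs are copied, or appended, into the step config.
def _example_resolve_bindings():
    pred_outputs = {'align': {'output_bam': ['/tmp/a.bam', '/tmp/b.bam']}}
    bindings = {'call.bam_files': 'align.output_bam'}
    step_config = {}
    for bind_to, bind_from in bindings.items():
        to_key = bind_to.split('.')[1]
        # a binding value may be a single source or a list of sources
        sources = bind_from if isinstance(bind_from, list) else [bind_from]
        for src in sources:
            pred, from_key = src.split('.')
            out = pred_outputs[pred][from_key]
            if to_key in step_config:
                step_config[to_key].extend(out)
            else:
                step_config[to_key] = list(out)
    return step_config  # -> {'bam_files': ['/tmp/a.bam', '/tmp/b.bam']}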
def validate_config(cls, cfg, user):
    """ Check if all the config params are ok """
    retval = defaultdict(dict)
    s_errors = defaultdict(dict)
    cfg = cls.load_cfg(cfg)
    params = cls.get_params(cfg)
    unb_inputs = cls.get_unbound_inputs(cfg)
    # Validate the step section
    for stepname in params['steps']:
        if stepname != 'inputs':
            classname = cfg['dag']['nodes'][stepname]
            stepobj = Step.create(classname)
            if stepname in cfg['config']['steps']:
                required_keys = []
                required_keys.extend(unb_inputs.get(stepname, []))
                required_keys.extend(stepobj.keys(['params'], req_only=True))
                stepcfg = cfg['config']['steps'][stepname]
                for key in required_keys:
                    if key in stepcfg:
                        param_spec = stepobj.key_spec(key)
                        error_msg = stepobj.validate_value(stepcfg[key],
                                                           param_spec['type'],
                                                           param_spec['name'])
                        if error_msg:
                            s_errors[stepname][key] = error_msg
                    else:
                        s_errors[stepname][key] = 'missing value'
            else:
                for key in stepobj.keys(['params'], req_only=True):
                    s_errors[stepname][key] = 'missing value'
                if stepname in unb_inputs:
                    for key in unb_inputs[stepname]:
                        s_errors[stepname][key] = 'missing value'
    # Validate the pipeline section
    p_errors = {}
    if not cfg['config']['pipeline']['project_name']:
        p_errors['project_name'] = 'missing value'
    if not cfg['config']['pipeline']['description']:
        p_errors['description'] = 'missing value'
    if not cfg['config']['pipeline']['output_dir']:
        p_errors['output_dir'] = 'missing value'
    else:
        output_dir = cfg['config']['pipeline']['output_dir']
        # Check the type before calling string methods on the value
        if not isinstance(output_dir, basestring):
            p_errors['output_dir'] = '%s : invalid type, found %s, expected %s' % (
                output_dir, type(output_dir), 'str')
        elif not output_dir.startswith('/'):
            p_errors['output_dir'] = '%s : not an absolute path' % output_dir
        #elif not ut.has_write_access(output_dir):
        #    p_errors['output_dir'] = '%s : not writable by user' % (output_dir)
    if s_errors:
        retval['steps'] = s_errors
    if p_errors:
        retval['pipeline'] = p_errors
    return retval
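# Hypothetical example of the structure validate_config returns (step and key
# names are invented); an empty dict means the configuration passed validation:
#
#   {
#       'steps': {'align': {'reference': 'missing value'}},
#       'pipeline': {'output_dir': 'missing value'}
#   }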
def post(self, run_id):
    """ Push files into iRODS """
    data = request.get_json(force=True)
    runmeta = data.get('meta')
    selection = data.get('selection')
    user = auth_get_username(request.authorization, data.get('user'))
    npdis = dbmodel.get_npdi_projects()
    npdi = runmeta.get('Project NPDI ID', '')
    study_nickname = runmeta.get('Study nickname', 'Required field missing')
    if (npdi + study_nickname) not in npdis:
        return {'pipeline': {'Project': '%s (%s)' % (npdi, study_nickname)}}, 400
    run = db.pipelines.find_one({'run_id': run_id}, {'meta': 1, 'run_id': 1})
    steps_names = selection.keys()
    steps = list(db.steps.find(
        {"run_id": run_id,
         "name": {'$in': steps_names},
         "jobs": {"$elemMatch": {"outputs": {"$exists": True}}}},
        {"name": 1, "jobs": 1, "outputs.output_dir": 1, "step_config": 1}))
    outputs = {}
    for step in steps:
        if step.get('step_config', {}):
            s = Step.load_step(step['step_config'])
            output_files = {}
            for job_id, job in enumerate(step['jobs']):
                for key in job['outputs']:
                    if key in s.keys(key_groups='outputs',
                                     key_filter={'type': 'file'}):
                        for i, filename in enumerate(job['outputs'][key]):
                            filemeta = {'step': step['name'], 'job_id': job_id}
                            ext = os.path.splitext(filename)[1][1:].upper()
                            # Attach the job metadata to the file
                            for meta_key in job.get('meta', {}):
                                meta = job['meta'][meta_key]
                                if meta_key == 'sample_id':
                                    okey = 'Operational sample accession'
                                else:
                                    okey = meta_key
                                if isinstance(meta, list):
                                    filemeta[okey] = meta[i]
                                else:
                                    filemeta[okey] = meta
                            filemeta['File type'] = 'Processed data file'
                            filemeta['File format'] = ext
                            output_files[filename] = filemeta
            if output_files:
                outputs[step['name']] = output_files
    input_files = []
    meta_data = []
    for step_name, step_selection in selection.iteritems():
        for filepath in step_selection:
            input_files.append(filepath)
            filemeta = outputs[step_name][filepath]
            filemeta.update(runmeta)
            meta_data.append(filemeta)
    cfg = Pipeline.load_cfg(pipeline_specs['irods_lz'])
    cfg['config']['steps']['irods_mvtolz'] = {
        'input_files': input_files,
        'meta_data': meta_data
    }
    cfg['config']['steps']['irods_monitorlz'] = {'prun_id': run['run_id']}
    cfg['config']['pipeline']['project_name'] = run['meta']['project_name']
    cfg['config']['pipeline']['description'] = 'Archive data for run %s' % run['run_id']
    cfg['config']['pipeline']['output_dir'] = '/scratch/cgi/irods'
    # Get id from DB
    db_info = dbmodel.PipelineDb(cfg['name'], cfg, Pipeline.ordered_steps(cfg), user)
    cfg['run_id'] = db_info.run_id
    ut.pretty_print("Submitting pipeline %s (ID %d) for user %s" %
                    (cfg['label'], cfg['run_id'], user))
    return pm.add_pipeline(cfg, user)
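# Hypothetical example of the JSON payload this endpoint expects (values are
# invented): 'meta' carries the NPDI project fields checked above and
# 'selection' maps step names to the output file paths to push into iRODS:
#
#   {
#       "meta": {"Project NPDI ID": "NPDI-0001", "Study nickname": "demo"},
#       "selection": {"align": ["/scratch/.../align/sample1.bam"]}
#   }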
def __init__(self, cfg, user='******', db=True, schedname="SCHED_CONDOR"):
    """ Read in the pipeline graph and load the configuration. """
    self.all_ok = True
    self.user = user
    self.status = JOB_STATUS.QUEUED
    self.lock = ''
    self.completed = []
    self.running = {}
    self.outputs = {}
    self.schedname = schedname
    db_model_name = "MONGO_DB" if db else "STUB_DB"
    # Load configuration
    self.one_step = False
    try:
        self.cfg = Pipeline.load_cfg(cfg)
    except Exception as e1:
        print('Failed to load config as pipeline (error=%s). Trying as step' % e1)
        try:
            self.cfg = Step.load_cfg(cfg)
            self.step = Step.load_step(self.cfg)
            self.one_step = True
        except Exception as e2:
            raise Exception("Unable to load config file %s:\n"
                            "pipeline load: %s\n"
                            "step load: %s" % (cfg, e1, e2))
    # Set all additional information
    self.run_id = self.cfg.get('run_id')
    if self.one_step:
        self.name = self.step.name
        self.label = self.step.name
        self.project_name = self.cfg.get('project_name', '')
        self.description = self.cfg.get('description', '')
        self.output_dir = self.step.output_dir
        self.ordered = [self.step.name]
    else:
        self.name = self.cfg['name']
        self.label = self.cfg['label']
        self.project_name = self.cfg['config']['pipeline'].get('project_name', '')
        self.description = self.cfg['config']['pipeline'].get('description', '')
        self.output_dir = self.cfg['config']['pipeline']['output_dir']
        if not self.output_dir.startswith('/scratch'):
            # TODO: make it work for one_step as well
            self.cfg['dag']['nodes'][FINAL_STEP] = 'utils.Finalize'
        self.ordered = Pipeline.ordered_steps(self.cfg)
    self.sys_path = self.cfg.get('sys_path')
    if self.sys_path:
        sys.path.insert(0, self.sys_path)
    self.dag = self.create_dag(self.cfg, one_step=self.one_step)
    self.meta = {
        'pipeline': {
            'label': self.label,
            'project_name': self.project_name,
            'descr': self.description,
            'run_id': self.run_id
        },
        'steps': {},
        'job': {}
    }
    self.db = db_models[db_model_name](self.name, self.cfg, self.ordered,
                                       self.user, output_dir=self.output_dir)
    if hasattr(self.db, 'run_id'):
        self.run_id = self.db.run_id
        self.cfg['run_id'] = self.run_id
    # Define the output directories
    if not os.path.exists(self.output_dir):
        os.makedirs(self.output_dir, 0775)
    # Use the default output dir under /scratch/cgi/nespipe (linked to the
    # user-defined dir) if: a) this run is using the db (so we have a run ID);
    # b) it is not a demultiplexing run; and c) the user-defined directory is
    # not already under /scratch
    if self.run_id and not (self.name == 'demultiplexing'):
        dirname = '%s_%d' % (self.name, self.db.run_id)
        self.output_dir = os.path.join(self.output_dir, dirname)
        if not os.path.exists(self.output_dir):
            os.makedirs(self.output_dir, 0775)
        # In case of /scratch, do not create an additional sub-directory
        if self.output_dir.startswith('/scratch'):
            self.work_dir = self.output_dir
        else:
            self.work_dir = os.path.join(WORK_DIR, self.user, dirname)
            if not os.path.exists(self.work_dir):
                os.makedirs(self.work_dir, 0775)
            symlink = os.path.join(self.output_dir, 'work_area')
            if not os.path.exists(symlink):
                os.symlink(self.work_dir, symlink)
    else:
        self.work_dir = self.output_dir
    ut.pretty_print('Output directories: output_dir=%s, work_dir=%s' %
                    (self.output_dir, self.work_dir))
    self.db.update_pipeline(self.run_id, {'output_dir': self.output_dir,
                                          'work_dir': self.work_dir})