def update_metadata(self, step_name, step_meta):
    """Store a step's metadata and pull global metadata out of it.

    Parameters:
        step_name: name of the step the metadata belongs to.
        step_meta: metadata dict reported by the step; may contain a
            'pipeline' key (global metadata to merge) and must contain
            a 'step' key (per-step metadata persisted to the DB).

    Side effects: updates self.meta and persists both the pipeline and
    the step metadata through self.db.
    """
    self.meta['steps'][step_name] = step_meta
    if 'pipeline' in step_meta:
        # Fold step-reported global metadata into the pipeline metadata.
        ut.dict_update(self.meta['pipeline'], step_meta['pipeline'])
    # Deep-copy so later in-memory mutations do not leak into the DB layer.
    self.db.update_pipeline_metadata(copy.deepcopy(self.meta['pipeline']))
    self.db.update_step_metadata(
        step_name, copy.deepcopy(self.meta['steps'][step_name]['step']))
def update_metadata(self, step_name, step_meta):
    """Store a step's metadata and pull global metadata out of it.

    Parameters:
        step_name: name of the step the metadata belongs to.
        step_meta: metadata dict reported by the step; may contain a
            'pipeline' key (global metadata to merge) and must contain
            a 'step' key (per-step metadata persisted to the DB).

    Side effects: updates self.meta and persists both the pipeline and
    the step metadata through self.db.
    """
    self.meta['steps'][step_name] = step_meta
    if 'pipeline' in step_meta:
        # Fold step-reported global metadata into the pipeline metadata.
        ut.dict_update(self.meta['pipeline'], step_meta['pipeline'])
    # Deep-copy so later in-memory mutations do not leak into the DB layer.
    self.db.update_pipeline_metadata(copy.deepcopy(self.meta['pipeline']))
    self.db.update_step_metadata(
        step_name, copy.deepcopy(self.meta['steps'][step_name]['step']))
def __init__(self):
    """Initialise a step instance from its class-level spec.

    Reads self.spec (declared on the concrete step class) to derive the
    step name, version, scheduler, argument attributes and resource
    requirements. Side effects: configures stdout logging and selects
    either the local scheduler or the module-level `scheduler`.
    """
    self.bootstrap = STARTUP_CYCLE
    self.status = JOB_STATUS.QUEUED
    self.meta = {'pipeline': {}, 'step': {}, 'job': {}}
    self.requirements = {'memory': '1', 'cpus': '1'}
    self.output_dir = '.'
    self.jobs = OrderedDict()
    self.cmd_count = 0
    logger.set_stdout_level(logger.DEBUG)
    self.log = logger.get_log()

    # Derive the step name from the defining module, e.g.
    # 'nespipe.steps.foo.bar' -> 'bar'.
    self.spec["name"] = self.__module__.replace('nespipe.steps.',
                                                '').split('.')[-1]
    self.name = self.spec["name"]
    self.__version__ = self.spec['version']

    # Local steps bypass the cluster scheduler.  NOTE: the module-level
    # `scheduler` is only read here, so no `global` declaration is needed.
    self.local_step = self.spec.get('local', False)
    if self.local_step:
        self.scheduler = get_scheduler("SCHED_LOCAL")
    else:
        self.scheduler = scheduler

    # Expose every declared argument as an instance attribute; only the
    # parameter lists are needed, not the group keys.
    for params in self.spec["args"].values():
        for param in params:
            if param.get('name', None):
                setattr(self, param['name'], param.get('value', []))

    ut.dict_update(
        self.requirements,
        self.spec.get('requirements', {'memory': '1', 'cpus': '1'}))
    for req, val in self.requirements.items():
        setattr(self, req, int(val))

    # Reserve ~90% of the requested memory for the JVM, with a floor of 1.
    if 'memory' in self.requirements:
        self.jvm_memory = int(int(self.requirements['memory']) * 0.9) or 1
def __init__(self):
    """Initialise a step instance from its class-level spec.

    Reads self.spec (declared on the concrete step class) to derive the
    step name, version, scheduler, argument attributes and resource
    requirements. Side effects: configures stdout logging and selects
    either the local scheduler or the module-level `scheduler`.
    """
    self.bootstrap = STARTUP_CYCLE
    self.status = JOB_STATUS.QUEUED
    self.meta = {'pipeline': {}, 'step': {}, 'job': {}}
    self.requirements = {'memory': '1', 'cpus': '1'}
    self.output_dir = '.'
    self.jobs = OrderedDict()
    self.cmd_count = 0
    logger.set_stdout_level(logger.DEBUG)
    self.log = logger.get_log()

    # Derive the step name from the defining module, e.g.
    # 'nespipe.steps.foo.bar' -> 'bar'.
    self.spec["name"] = self.__module__.replace('nespipe.steps.',
                                                '').split('.')[-1]
    self.name = self.spec["name"]
    self.__version__ = self.spec['version']

    # Local steps bypass the cluster scheduler.  NOTE: the module-level
    # `scheduler` is only read here, so no `global` declaration is needed.
    self.local_step = self.spec.get('local', False)
    if self.local_step:
        self.scheduler = get_scheduler("SCHED_LOCAL")
    else:
        self.scheduler = scheduler

    # Expose every declared argument as an instance attribute; only the
    # parameter lists are needed, not the group keys.
    for params in self.spec["args"].values():
        for param in params:
            if param.get('name', None):
                setattr(self, param['name'], param.get('value', []))

    ut.dict_update(
        self.requirements,
        self.spec.get('requirements', {'memory': '1', 'cpus': '1'}))
    for req, val in self.requirements.items():
        setattr(self, req, int(val))

    # Reserve ~90% of the requested memory for the JVM, with a floor of 1.
    if 'memory' in self.requirements:
        self.jvm_memory = int(int(self.requirements['memory']) * 0.9) or 1
def run_step(self, step_name):
    """Configure and submit a job for the given step.

    The pseudo-step 'inputs' is never executed: its configuration is
    recorded directly as its outputs. For every other step this builds
    the step config (binding predecessor outputs and metadata), records
    the metadata, and submits the step via Step.load_step().

    Parameters:
        step_name: name of the DAG node to run.
    """
    if step_name == 'inputs':
        # The input step never runs: record its config as its outputs
        # and mark it RUNNING then SUCCEEDED immediately.
        self.completed.append(step_name)
        self.outputs[step_name] = self.cfg['config']['steps'].get(
            step_name, {})
        self.outputs[step_name]['output_dir'] = ''
        self.db.update_step_status(step_name, JOB_STATUS.RUNNING)
        self.db.update_step_status(step_name, JOB_STATUS.SUCCEEDED)
        self.db.set_step_outputs(step_name, self.outputs[step_name])
        return

    def _merge_output(cfg, key, out):
        # Bind an upstream output into cfg[key]; extend (promoting a
        # bare string to a list) when the key is already bound.
        if key in cfg:
            if isinstance(cfg[key], basestring):
                cfg[key] = [cfg[key]]
            cfg[key].extend(out)
        else:
            cfg[key] = out

    empty_meta = {'pipeline': {}, 'step': {}, 'job': {}}
    if self.one_step:
        step_config = self.cfg
        step_config['sys_path'] = self.sys_path
        step_config['output_dir'] = self.output_dir
        # BUG FIX: the meta dict was previously nested one level too deep
        # ({'meta': {'meta': {...}}}), so the dict_update below raised
        # KeyError on 'pipeline'.
        step_config['meta'] = empty_meta
        ut.dict_update(step_config['meta']['pipeline'],
                       self.meta['pipeline'])
    elif step_name == FINAL_STEP:
        step_config = {'meta': empty_meta}
        ut.dict_update(step_config['meta']['pipeline'],
                       self.meta['pipeline'])
        step_config['name'] = FINAL_STEP
        step_config['step_class'] = self.dag.node[step_name]['class_name']
        step_config['target_dir'] = self.output_dir
        step_config['source_dir'] = self.work_dir
        step_config['output_dir'] = os.path.join(self.work_dir, step_name)
        self.configure_finalstep(step_config)
    else:
        step_config = {'meta': empty_meta}
        ut.dict_update(step_config['meta']['pipeline'],
                       self.meta['pipeline'])
        step_config['name'] = step_name
        step_config['sys_path'] = self.sys_path
        step_config['step_class'] = self.dag.node[step_name]['class_name']
        step_config['output_dir'] = os.path.join(self.work_dir, step_name)

    # 1. Form input keys.  Remember: edges are labelled by 'from' keys.
    for pred in self.dag.predecessors(step_name):
        edge = self.dag[pred][step_name]
        # Not an actual loop: just get key/value pairs of the bindings.
        for bind_to, bind_from in edge.get('bindings', {}).iteritems():
            to_key = bind_to.split('.')[1]
            # A list of 'from' keys binds several outputs to one input.
            if hasattr(bind_from, '__iter__'):
                for from_key in bind_from:
                    _merge_output(step_config, to_key,
                                  self.outputs[pred][from_key.split('.')[1]])
            else:
                _merge_output(step_config, to_key,
                              self.outputs[pred][bind_from.split('.')[1]])
        # Transfer metadata of the previous step to the next step.
        for key in self.meta['steps'].get(pred, {}):
            step_config['meta'][key] = self.meta['steps'][pred][key]

    # 2. Form step config.
    if not self.one_step:
        ut.dict_update(step_config,
                       self.cfg['config']['steps'].get(step_name, {}),
                       replace=False)
    if step_name == FINAL_STEP:
        # Final step: pass the full pipeline metadata.
        step_config['meta'].update(self.meta)
    else:
        self.update_metadata(step_name, step_config[KEY_META])

    # 3. Submit step.
    self.log.info('Executing step %s' % str(step_name))
    self.log.debug(' step configuration:\n %s' %
                   ut.format_dict(step_config, indent=4))
    self.log.info(' step %s queued ' % str(step_name))
    self.running[step_name] = Step.load_step(step_config)
    job_counter = self.running[step_name].distribute()
    self.db.start_step(step_name, step_config, job_counter)
def load_cfg(cls, cfg):
    """Load a pipeline configuration and return it as a dict.

    *cfg* may be a dict, a path to a JSON file, or a JSON string.
    The returned dict has defaults filled in, the referenced pipeline
    spec loaded (when the dag has a 'load' entry), and reference-genome
    paths expanded into the per-step config.

    Raises Exception when the configuration cannot be parsed or a
    referenced pipeline / reference genome cannot be resolved.
    """
    cfg_load = None
    try:
        if isinstance(cfg, dict):
            cfg_load = copy.deepcopy(cfg)
        elif isinstance(cfg, basestring):
            if os.path.exists(cfg):
                with open(cfg) as fh:
                    cfg_load = json.load(fh)
                if 'sys_path' not in cfg_load:
                    # Remember where the config lives so relative modules
                    # can be imported later.
                    cfg_load['sys_path'] = os.path.dirname(
                        os.path.realpath(cfg))
            else:
                # BUG FIX: json.load expects a file object; json.loads is
                # the call for raw JSON text.
                cfg_load = json.loads(cfg)
    except Exception as e:
        raise Exception("Unable to load config file %s: %s" % (cfg, e))

    # Merge the loaded config on top of the defaults.
    cfg_data = {
        'config': {
            'steps': {},
            'pipeline': {
                'project_name': '',
                'description': '',
                'output_dir': ''
            }
        }
    }
    ut.dict_update(cfg_data, cfg_load)

    # BUG FIX: remember whether sys.path was extended, because cfg_data
    # may be replaced by the loaded spec below; the old code could then
    # delete a sys.path entry it never inserted, or leak the inserted one.
    path_inserted = 'sys_path' in cfg_data
    if path_inserted:
        sys.path.insert(0, cfg_data['sys_path'])

    pipeline_to_load = cfg_data['dag'].pop('load', None)
    if pipeline_to_load:
        if os.path.exists(pipeline_to_load):
            spec_file = pipeline_to_load
        elif pipeline_to_load in pipeline_names:
            spec_file = pipeline_names[pipeline_to_load]
        else:
            raise Exception(
                "Pipeline %s not found in list of pipelines: [%s]" %
                (pipeline_to_load, ','.join(pipeline_names)))
        with open(spec_file) as fh:
            ut.pretty_print("Loading pipeline spec from %s" % spec_file)
            spec = json.load(fh)

        # Seed per-step defaults (params/requirements) from the step
        # classes declared by the spec.
        stepobjs = Pipeline.create_steps(spec)
        steps_defaults = {}
        for step in stepobjs:
            step_default = stepobjs[step].keys_values(
                ['params', 'requirements'])
            if step_default:
                steps_defaults[step] = step_default
        spec.setdefault('config', {})
        spec['config'].setdefault('pipeline', {})
        spec['config'].setdefault('steps', {})
        ut.dict_update(spec['config']['steps'], steps_defaults,
                       replace=False)
        # User-supplied config wins over the spec defaults.
        ut.dict_update(spec['config'], cfg_data.get('config', {}))
        cfg_data = spec

    if cfg_data.get('config', {}).get('pipeline', {}).get('refgenome', {}):
        key_refgenome = cfg_data['config']['pipeline'].pop('refgenome')
        ref_genomes = Pipeline.get_refgenomes(cfg_data)
        if key_refgenome not in ref_genomes:
            raise Exception("unable to load ref genome paths for %s " %
                            key_refgenome)
        # Set refgenome parameters in each step (update the config if it
        # already exists).
        for step in ref_genomes[key_refgenome]:
            if step in cfg_data['config']['steps']:
                cfg_data['config']['steps'][step].update(
                    ref_genomes[key_refgenome][step])
            else:
                cfg_data['config']['steps'][step] = \
                    ref_genomes[key_refgenome][step]

    if path_inserted:
        # Undo the sys.path insertion above.
        del sys.path[0]
    return cfg_data
def run_step(self, step_name):
    """Configure and submit a job for the given step.

    The pseudo-step 'inputs' is never executed: its configuration is
    recorded directly as its outputs. For every other step this builds
    the step config (binding predecessor outputs and metadata), records
    the metadata, and submits the step via Step.load_step().

    Parameters:
        step_name: name of the DAG node to run.
    """
    if step_name == 'inputs':
        # The input step never runs: record its config as its outputs
        # and mark it RUNNING then SUCCEEDED immediately.
        self.completed.append(step_name)
        self.outputs[step_name] = self.cfg['config']['steps'].get(
            step_name, {})
        self.outputs[step_name]['output_dir'] = ''
        self.db.update_step_status(step_name, JOB_STATUS.RUNNING)
        self.db.update_step_status(step_name, JOB_STATUS.SUCCEEDED)
        self.db.set_step_outputs(step_name, self.outputs[step_name])
        return

    def _merge_output(cfg, key, out):
        # Bind an upstream output into cfg[key]; extend (promoting a
        # bare string to a list) when the key is already bound.
        if key in cfg:
            if isinstance(cfg[key], basestring):
                cfg[key] = [cfg[key]]
            cfg[key].extend(out)
        else:
            cfg[key] = out

    empty_meta = {'pipeline': {}, 'step': {}, 'job': {}}
    if self.one_step:
        step_config = self.cfg
        step_config['sys_path'] = self.sys_path
        step_config['output_dir'] = self.output_dir
        # BUG FIX: the meta dict was previously nested one level too deep
        # ({'meta': {'meta': {...}}}), so the dict_update below raised
        # KeyError on 'pipeline'.
        step_config['meta'] = empty_meta
        ut.dict_update(step_config['meta']['pipeline'],
                       self.meta['pipeline'])
    elif step_name == FINAL_STEP:
        step_config = {'meta': empty_meta}
        ut.dict_update(step_config['meta']['pipeline'],
                       self.meta['pipeline'])
        step_config['name'] = FINAL_STEP
        step_config['step_class'] = self.dag.node[step_name]['class_name']
        step_config['target_dir'] = self.output_dir
        step_config['source_dir'] = self.work_dir
        step_config['output_dir'] = os.path.join(self.work_dir, step_name)
        self.configure_finalstep(step_config)
    else:
        step_config = {'meta': empty_meta}
        ut.dict_update(step_config['meta']['pipeline'],
                       self.meta['pipeline'])
        step_config['name'] = step_name
        step_config['sys_path'] = self.sys_path
        step_config['step_class'] = self.dag.node[step_name]['class_name']
        step_config['output_dir'] = os.path.join(self.work_dir, step_name)

    # 1. Form input keys.  Remember: edges are labelled by 'from' keys.
    for pred in self.dag.predecessors(step_name):
        edge = self.dag[pred][step_name]
        # Not an actual loop: just get key/value pairs of the bindings.
        for bind_to, bind_from in edge.get('bindings', {}).iteritems():
            to_key = bind_to.split('.')[1]
            # A list of 'from' keys binds several outputs to one input.
            if hasattr(bind_from, '__iter__'):
                for from_key in bind_from:
                    _merge_output(step_config, to_key,
                                  self.outputs[pred][from_key.split('.')[1]])
            else:
                _merge_output(step_config, to_key,
                              self.outputs[pred][bind_from.split('.')[1]])
        # Transfer metadata of the previous step to the next step.
        for key in self.meta['steps'].get(pred, {}):
            step_config['meta'][key] = self.meta['steps'][pred][key]

    # 2. Form step config.
    if not self.one_step:
        ut.dict_update(step_config,
                       self.cfg['config']['steps'].get(step_name, {}),
                       replace=False)
    if step_name == FINAL_STEP:
        # Final step: pass the full pipeline metadata.
        step_config['meta'].update(self.meta)
    else:
        self.update_metadata(step_name, step_config[KEY_META])

    # 3. Submit step.
    self.log.info('Executing step %s' % str(step_name))
    self.log.debug(' step configuration:\n %s' %
                   ut.format_dict(step_config, indent=4))
    self.log.info(' step %s queued ' % str(step_name))
    self.running[step_name] = Step.load_step(step_config)
    job_counter = self.running[step_name].distribute()
    self.db.start_step(step_name, step_config, job_counter)
def load_cfg(cls, cfg):
    """Load a pipeline configuration and return it as a dict.

    *cfg* may be a dict, a path to a JSON file, or a JSON string.
    The returned dict has defaults filled in, the referenced pipeline
    spec loaded (when the dag has a 'load' entry), and reference-genome
    paths expanded into the per-step config.

    Raises Exception when the configuration cannot be parsed or a
    referenced pipeline / reference genome cannot be resolved.
    """
    cfg_load = None
    try:
        if isinstance(cfg, dict):
            cfg_load = copy.deepcopy(cfg)
        elif isinstance(cfg, basestring):
            if os.path.exists(cfg):
                with open(cfg) as fh:
                    cfg_load = json.load(fh)
                if 'sys_path' not in cfg_load:
                    # Remember where the config lives so relative modules
                    # can be imported later.
                    cfg_load['sys_path'] = os.path.dirname(
                        os.path.realpath(cfg))
            else:
                # BUG FIX: json.load expects a file object; json.loads is
                # the call for raw JSON text.
                cfg_load = json.loads(cfg)
    except Exception as e:
        raise Exception("Unable to load config file %s: %s" % (cfg, e))

    # Merge the loaded config on top of the defaults.
    cfg_data = {
        'config': {
            'steps': {},
            'pipeline': {
                'project_name': '',
                'description': '',
                'output_dir': ''
            }
        }
    }
    ut.dict_update(cfg_data, cfg_load)

    # BUG FIX: remember whether sys.path was extended, because cfg_data
    # may be replaced by the loaded spec below; the old code could then
    # delete a sys.path entry it never inserted, or leak the inserted one.
    path_inserted = 'sys_path' in cfg_data
    if path_inserted:
        sys.path.insert(0, cfg_data['sys_path'])

    pipeline_to_load = cfg_data['dag'].pop('load', None)
    if pipeline_to_load:
        if os.path.exists(pipeline_to_load):
            spec_file = pipeline_to_load
        elif pipeline_to_load in pipeline_names:
            spec_file = pipeline_names[pipeline_to_load]
        else:
            raise Exception(
                "Pipeline %s not found in list of pipelines: [%s]" %
                (pipeline_to_load, ','.join(pipeline_names)))
        with open(spec_file) as fh:
            ut.pretty_print("Loading pipeline spec from %s" % spec_file)
            spec = json.load(fh)

        # Seed per-step defaults (params/requirements) from the step
        # classes declared by the spec.
        stepobjs = Pipeline.create_steps(spec)
        steps_defaults = {}
        for step in stepobjs:
            step_default = stepobjs[step].keys_values(
                ['params', 'requirements'])
            if step_default:
                steps_defaults[step] = step_default
        spec.setdefault('config', {})
        spec['config'].setdefault('pipeline', {})
        spec['config'].setdefault('steps', {})
        ut.dict_update(spec['config']['steps'], steps_defaults,
                       replace=False)
        # User-supplied config wins over the spec defaults.
        ut.dict_update(spec['config'], cfg_data.get('config', {}))
        cfg_data = spec

    if cfg_data.get('config', {}).get('pipeline', {}).get('refgenome', {}):
        key_refgenome = cfg_data['config']['pipeline'].pop('refgenome')
        ref_genomes = Pipeline.get_refgenomes(cfg_data)
        if key_refgenome not in ref_genomes:
            raise Exception("unable to load ref genome paths for %s " %
                            key_refgenome)
        # Set refgenome parameters in each step (update the config if it
        # already exists).
        for step in ref_genomes[key_refgenome]:
            if step in cfg_data['config']['steps']:
                cfg_data['config']['steps'][step].update(
                    ref_genomes[key_refgenome][step])
            else:
                cfg_data['config']['steps'][step] = \
                    ref_genomes[key_refgenome][step]

    if path_inserted:
        # Undo the sys.path insertion above.
        del sys.path[0]
    return cfg_data