Example #1
    def get_unbound_inputs(cls, cfg):
        """
        Get the unbound inputs
        """

        cfg = cls.load_cfg(cfg)
        dag = cls.create_dag(cfg)

        # Step parameters
        uinputs = defaultdict(dict)
        for stepname, classname in cfg['dag']['nodes'].iteritems():
            step = Step.create(classname)
            input_keys = step.keys('inputs', req_only=True)
            if input_keys:
                for pred in dag.predecessors(stepname):
                    # Remove any key that is already bound
                    for binding in dag[pred][stepname].get('bindings', []):
                        key = binding.split('.')[1]
                        # the key may already have been removed
                        if key in input_keys:
                            input_keys.remove(key)

                if input_keys:
                    uinputs[stepname] = input_keys

        return uinputs
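The function above is essentially set subtraction over the DAG's edge 'bindings'. A minimal standalone sketch of the same idea, using networkx and invented step/key names, behaves like this:

    from collections import defaultdict
    import networkx as nx

    # Hypothetical DAG: each edge carries 'bindings' of the form 'step.key'
    dag = nx.DiGraph()
    dag.add_edge('align', 'sort', bindings=['sort.input_bam'])

    # Assumed required input keys per step (what step.keys(..., req_only=True)
    # would return)
    required = {'sort': ['input_bam', 'reference']}

    unbound = defaultdict(list)
    for stepname, input_keys in required.items():
        keys = list(input_keys)
        for pred in dag.predecessors(stepname):
            for binding in dag[pred][stepname].get('bindings', []):
                key = binding.split('.')[1]
                if key in keys:  # the key may already have been removed
                    keys.remove(key)
        if keys:
            unbound[stepname] = keys

    print(dict(unbound))  # {'sort': ['reference']}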
Example #2
    def create_steps(cfg):
        stepobjs = {}
        # Temporarily prepend the user-supplied module path so custom
        # step classes can be imported
        if 'sys_path' in cfg:
            sys.path.insert(0, cfg['sys_path'])
        for stepname, classname in cfg['dag']['nodes'].iteritems():
            stepobjs[stepname] = Step.create(classname)
        # Restore sys.path to its original state
        if 'sys_path' in cfg:
            del sys.path[0]

        return stepobjs
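One caveat with this example: the insert/delete pair is not exception-safe, so a failing Step.create leaves the extra path on sys.path. A self-contained sketch of such a class factory with a try/finally guard (importlib-based; the Step framework itself is not required):

    import importlib
    import sys

    def load_class(dotted_name, extra_path=None):
        # Import 'package.module.ClassName', optionally from an extra search
        # path; finally guarantees sys.path is restored even on ImportError
        if extra_path:
            sys.path.insert(0, extra_path)
        try:
            module_name, class_name = dotted_name.rsplit('.', 1)
            return getattr(importlib.import_module(module_name), class_name)
        finally:
            if extra_path:
                sys.path.remove(extra_path)

    print(load_class('collections.OrderedDict'))  # <class 'collections.OrderedDict'>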
Example #3
    def get_metainfo(self, step_name):
        """
        Return a dictionary with generic information about pipeline and step
        """
        info = {}
        info['pipeline'] = { 'name':    self.name,
                             'version': self.__version__ }
        info['user'] = { 'login':    self.user,
                         'fullname': pwd.getpwnam(self.user).pw_gecos }
        step_class = self.dag.node[step_name]['class_name']
        stepobj = Step.create(step_class)
        info['step'] = { 'name': step_name,
                         'class': step_class,
                         'version': stepobj.__version__ }
        return info
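For reference, pwd.getpwnam resolves the full name from the local password database (Unix only; the GECOS field may be empty or comma-delimited). A quick standalone check:

    import getpass
    import pwd

    login = getpass.getuser()
    print(pwd.getpwnam(login).pw_gecos)  # e.g. 'Jane Doe,,,' on many Linux systems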
Example #4
    def get_refgenomes(cls, cfg, unbound=None):
        """
        Return a 2 level dictionary containing the path of the reference
        genome grouped by labels.
        A label is a combination of species, version and variation
            {
                "label1": {
                    "stepname1" : { "input_key1" : "/path1"},
                    "stepname2" : { "input_key1" : "/path2"}
                }
                "label2": {
                    "stepname1" : { "input_key1  : "/path3"},
                    "stepname2" : { "input_key1" : "/path4"}
                }
            }
        The "unbound" dictionary contains the steps that have unbound inputs:
        if set, only those steps will be considered
        """

        refs = defaultdict(dict)
        tools = set()

        # Collect all tools that require a ref. genome
        for stepname, classname in cfg['dag']['nodes'].iteritems():
            if unbound is None or stepname in unbound:
                step = Step.create(classname)
                for ref in step.get_refgenome_tools():
                    tools.add(ref['tool'])
                    refs[stepname][ref['name']] = ref['tool']

        # Get corresponding ref genomes
        refs_by_label = {}
        for ref in mongo.get_refgenomes(tools):
            label = "%s %s" % (ref['_id']['species'], ref['_id']['version'])
            if 'variation' in ref['_id']:
                label += " (%s)" % ref['_id']['variation']
            for stepname in refs:
                if label not in refs_by_label:
                    refs_by_label[label] = {}
                refs_by_label[label][stepname] = {}
                for param_key in refs[stepname]:
                    tool = refs[stepname][param_key]
                    if tool in ref['paths']:
                        refs_by_label[label][stepname][param_key] = ref['paths'][tool]

        return refs_by_label
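The grouping logic can be exercised without MongoDB by feeding it documents of the shape the code assumes ('_id' holding species/version/variation, 'paths' keyed by tool). All names and paths below are invented:

    refs = {'align': {'genome_fasta': 'bwa'}}  # step -> input key -> tool
    genome_docs = [  # stand-ins for mongo.get_refgenomes() results
        {'_id': {'species': 'Homo sapiens', 'version': 'GRCh38'},
         'paths': {'bwa': '/refs/GRCh38/bwa'}},
        {'_id': {'species': 'Mus musculus', 'version': 'GRCm39', 'variation': 'snp'},
         'paths': {'bwa': '/refs/GRCm39/bwa'}},
    ]

    refs_by_label = {}
    for ref in genome_docs:
        label = '%s %s' % (ref['_id']['species'], ref['_id']['version'])
        if 'variation' in ref['_id']:
            label += ' (%s)' % ref['_id']['variation']
        for stepname, keys in refs.items():
            refs_by_label.setdefault(label, {})[stepname] = {
                k: ref['paths'][t] for k, t in keys.items() if t in ref['paths']
            }

    print(sorted(refs_by_label))  # ['Homo sapiens GRCh38', 'Mus musculus GRCm39 (snp)']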
Example #5
        def get(self, run_id):
            """
            Return the dag of the given run
            """

            pipeline = db.pipelines.find_one({'run_id': run_id}, {'config': 1, 'file_registry':1})
            file_registry = pipeline.get('file_registry', [])
            if file_registry:
                file_registry = json.loads(file_registry)

            conf_str = json.loads(pipeline['config'])
            config = Pipeline.load_cfg(conf_str)
            result_steps = config.get('config', {}).get('pipeline', {}).get('results', [])
            delete_steps = config.get('config', {}).get('pipeline', {}).get('delete', [])
            delete_steps.append('finalize')
            delete_steps.append('inputs')

            steps = list(db.steps.find(
                {"run_id":run_id, "name": {"$nin": delete_steps}, "jobs": {"$elemMatch": {"outputs": {"$exists": True}}}},
                {"name":1, "jobs":1, "outputs.output_dir": 1, "step_config": 1}))

            outputs = {}
            for step in steps:
                if step.get('step_config', {}):
                    s = Step.load_step(step['step_config'])
                    output_files = []
                    for job_id, job in enumerate(step['jobs']):
                        for key in job['outputs']:
                            if key in s.keys(key_groups='outputs', key_filter={'type':'file'}):
                                for i, filename in enumerate(job['outputs'][key]):
                                    output = { 'path': filename }

                                    if not isinstance(filename, list):
                                        output['archived'] = (filename in file_registry)
                                    else:
                                        output['archived'] = False
                                    output_files.append(output)

                    if output_files:
                        outputs[step['name']] = defaultdict(list)
                        outputs[step['name']]['archive'] = step['name'] in result_steps

                        outputs[step['name']]['dir'] = step.get('outputs', {}).get('output_dir')
                        outputs[step['name']]['files'] = copy.deepcopy(output_files)


            return outputs
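One performance note: 'filename in file_registry' is a linear scan repeated per output file. If the registry can grow large, converting it to a set once is the usual remedy (a sketch, assuming the registry is stored as a JSON list of paths):

    import json

    file_registry = set(json.loads('["/data/a.bam", "/data/b.bam"]'))
    print('/data/a.bam' in file_registry)  # True, now an O(1) membership test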
Example #6
    def run_step(self, step_name):
        """
        Configure and run a job for the given step
        """

        #skip the input step
        if step_name == 'inputs':
            self.completed.append(step_name)
            self.outputs[step_name] = self.cfg['config']['steps'].get(step_name, {})
            self.outputs[step_name]['output_dir'] = ''
            self.db.update_step_status(step_name, JOB_STATUS.RUNNING)
            self.db.update_step_status(step_name, JOB_STATUS.SUCCEEDED)
            self.db.set_step_outputs(step_name, self.outputs[step_name])
        else:
            if self.one_step:
                step_config = self.cfg
                step_config['sys_path'] = self.sys_path
                step_config['output_dir'] = self.output_dir
                # Match the shape used by the other branches; with the extra
                # 'meta' nesting the dict_update below raised a KeyError
                step_config['meta'] = { 'pipeline':{}, 'step':{}, 'job':{} }
                ut.dict_update(step_config['meta']['pipeline'], self.meta['pipeline'])
            elif step_name == FINAL_STEP:
                step_config = { 'meta' : { 'pipeline':{}, 'step':{}, 'job':{} } }
                ut.dict_update(step_config['meta']['pipeline'], self.meta['pipeline'])
                step_config['name'] = FINAL_STEP
                step_config['step_class'] = self.dag.node[step_name]['class_name']
                step_config['target_dir'] = self.output_dir
                step_config['source_dir'] = self.work_dir
                step_config['output_dir'] = os.path.join(self.work_dir, step_name)
                self.configure_finalstep(step_config)
            else:
                step_config = { 'meta' : { 'pipeline':{}, 'step':{}, 'job':{} } }
                ut.dict_update(step_config['meta']['pipeline'], self.meta['pipeline'])
                step_class = self.dag.node[step_name]['class_name']
                step_config['name'] = step_name
                step_config['sys_path'] = self.sys_path
                step_config['step_class'] = step_class
                step_config['output_dir'] = os.path.join(self.work_dir, step_name)

                # 1. Form input keys
                # Remember: edges are labelled by 'from' keys
                for pred in self.dag.predecessors(step_name):
                    edge = self.dag[pred][step_name]
                    # Usually a single binding: iterate just to unpack key/value
                    for bind_to, bind_from in edge.get('bindings', {}).iteritems():
                        to_key = bind_to.split('.')[1]
                        if hasattr(bind_from, '__iter__'):
                            for from_key in bind_from:
                                key = from_key.split('.')[1]
                                out = self.outputs[pred][key]
                                if to_key in step_config:
                                    if isinstance(step_config[to_key], basestring):
                                        step_config[to_key] = [step_config[to_key]]
                                    step_config[to_key].extend(out)
                                else:
                                    step_config[to_key] = out
                        else:
                            from_key = bind_from.split('.')[1]
                            out = self.outputs[pred][from_key]
                            if to_key in step_config:
                                if isinstance(step_config[to_key], basestring):
                                    step_config[to_key] = [step_config[to_key]]
                                step_config[to_key].extend(out)
                            else:
                                step_config[to_key] = out

                    # Transfer metadata of previous step to next step
                    for key in self.meta['steps'].get(pred, {}):
                        step_config['meta'][key] = self.meta['steps'][pred][key]

            # 2. Form step config.
            if not self.one_step:
                ut.dict_update(step_config, self.cfg['config']['steps'].get(step_name, {}), replace=False)
                if step_name == FINAL_STEP:
                    # final step: pass full pipeline metadata
                    step_config['meta'].update(self.meta)
                else:
                    self.update_metadata(step_name, step_config[KEY_META])

            # 3. Submit step
            self.log.info('Executing step %s' % str(step_name))
            self.log.debug('  step configuration:\n %s' % ut.format_dict(step_config, indent=4))
            self.log.info('  step %s queued ' % str(step_name))

            self.running[step_name] = Step.load_step(step_config)
            job_counter = self.running[step_name].distribute()
            self.db.start_step(step_name, step_config, job_counter)
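Both binding branches in step 1 apply the same accumulation rule: promote a lone string to a list, then extend. Isolated below, with str standing in for Python 2's basestring and invented values:

    def merge_binding(config, to_key, out):
        # Accumulate bound outputs under to_key, promoting a scalar to a list
        if to_key in config:
            if isinstance(config[to_key], str):
                config[to_key] = [config[to_key]]
            config[to_key].extend(out)
        else:
            config[to_key] = out

    step_config = {}
    merge_binding(step_config, 'input_files', ['a.bam'])
    merge_binding(step_config, 'input_files', ['b.bam'])
    print(step_config)  # {'input_files': ['a.bam', 'b.bam']}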
Example #7
    def validate_config(cls, cfg, user):
        """
        Check if all the config params are ok
        """

        retval = defaultdict(dict)
        s_errors = defaultdict(dict)

        cfg = cls.load_cfg(cfg)
        params = cls.get_params(cfg)
        unb_inputs = cls.get_unbound_inputs(cfg)

        #validate step section
        for stepname in params['steps']:
            if stepname != 'inputs':
                classname = cfg['dag']['nodes'][stepname]
                stepobj = Step.create(classname)
                if stepname in cfg['config']['steps']:
                    required_keys = []
                    required_keys.extend(unb_inputs.get(stepname, []))
                    required_keys.extend(stepobj.keys(['params'], req_only=True))
                    stepcfg = cfg['config']['steps'][stepname]
                    for key in required_keys:
                        if key in stepcfg:
                            param_spec = stepobj.key_spec(key)
                            error_msg = stepobj.validate_value(stepcfg[key], param_spec['type'], param_spec['name'])
                            if error_msg:
                                s_errors[stepname][key] = error_msg
                        else:
                            s_errors[stepname][key] = 'missing value'
                else:
                    for key in stepobj.keys(['params'], req_only=True):
                        s_errors[stepname][key] = 'missing value'
                    if stepname in unb_inputs:
                        for key in unb_inputs[stepname]:
                            s_errors[stepname][key] = 'missing value'


        #validate pipeline section
        p_errors = {}
        if not cfg['config']['pipeline']['project_name']:
            p_errors['project_name'] = 'missing value'

        if not cfg['config']['pipeline']['description']:
            p_errors['description'] = 'missing value'

        if not cfg['config']['pipeline']['output_dir']:
            p_errors['output_dir'] = 'missing value'
        else:
            output_dir = cfg['config']['pipeline']['output_dir']
            # Check the type first: calling startswith on a non-string
            # would raise an AttributeError
            if not isinstance(output_dir, basestring):
                p_errors['output_dir'] = '%s : invalid type, found %s, expected %s' % (output_dir, type(output_dir), 'str')
            elif not output_dir.startswith('/'):
                p_errors['output_dir'] = '%s : not an absolute path' % output_dir
            #elif not ut.has_write_access(output_dir):
            #    p_errors['output_dir'] = '%s : not writable by user' % (output_dir)

        if s_errors:
            retval['steps'] = s_errors

        if p_errors:
            retval['pipeline'] = p_errors

        return retval
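A side note on the step-name check: the original read "stepname is not 'inputs'", which tests object identity and only matches by accident of string interning; equality is what is meant, as this snippet shows:

    a = ''.join(['in', 'puts'])  # equal to 'inputs', but a distinct object
    b = 'inputs'
    print(a == b)  # True  -> the comparison validate_config needs
    print(a is b)  # False -> what 'is not' actually tested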
Example #8
        def post(self, run_id):
            """
            Pushes files into iRODS
            """

            data = request.get_json(force=True)

            runmeta = data.get('meta')
            selection = data.get('selection')
            user = auth_get_username(request.authorization, data.get('user'))

            npdis = dbmodel.get_npdi_projects()
            npdi = runmeta.get('Project NPDI ID', '')
            study_nickname = runmeta.get('Study nickname',
                                         'Required field missing')
            if (npdi + study_nickname) not in npdis:
                return {
                    'pipeline': {
                        'Project': '%s (%s)' % (npdi, study_nickname)
                    }
                }, 400

            run = db.pipelines.find_one({'run_id': run_id}, {
                'meta': 1,
                'run_id': 1
            })

            steps_names = selection.keys()
            steps = list(
                db.steps.find(
                    {
                        "run_id": run_id,
                        "name": {
                            '$in': steps_names
                        },
                        "jobs": {
                            "$elemMatch": {
                                "outputs": {
                                    "$exists": True
                                }
                            }
                        }
                    }, {
                        "name": 1,
                        "jobs": 1,
                        "outputs.output_dir": 1,
                        "step_config": 1
                    }))

            outputs = {}
            for step in steps:
                if step.get('step_config', {}):
                    s = Step.load_step(step['step_config'])
                    output_files = {}
                    for job_id, job in enumerate(step['jobs']):
                        for key in job['outputs']:
                            if key in s.keys(key_groups='outputs',
                                             key_filter={'type': 'file'}):
                                for i, filename in enumerate(
                                        job['outputs'][key]):
                                    filemeta = {
                                        'step': step['name'],
                                        'job_id': job_id
                                    }
                                    ext = os.path.splitext(
                                        filename)[1][1:].upper()
                                    # 'meta_key' avoids shadowing the output
                                    # key of the enclosing loop
                                    for meta_key in job.get('meta', {}):
                                        meta = job['meta'][meta_key]
                                        if meta_key == 'sample_id':
                                            okey = 'Operational sample accession'
                                        else:
                                            okey = meta_key

                                        # A list holds one value per output
                                        # file; a scalar applies to all files
                                        if isinstance(meta, list):
                                            filemeta[okey] = meta[i]
                                        else:
                                            filemeta[okey] = meta

                                    filemeta['File type'] = 'Processed data file'
                                    filemeta['File format'] = ext

                                    output_files[filename] = filemeta

                    if output_files:
                        outputs[step['name']] = output_files

            input_files = []
            meta_data = []
            for step_name, step_selection in selection.iteritems():
                for filepath in step_selection:
                    input_files.append(filepath)

                    filemeta = outputs[step_name][filepath]
                    filemeta.update(runmeta)
                    meta_data.append(filemeta)

            cfg = Pipeline.load_cfg(pipeline_specs['irods_lz'])
            cfg['config']['steps']['irods_mvtolz'] = {
                'input_files': input_files,
                'meta_data': meta_data
            }
            cfg['config']['steps']['irods_monitorlz'] = {
                'prun_id': run['run_id']
            }

            cfg['config']['pipeline']['project_name'] = run['meta'][
                'project_name']
            cfg['config']['pipeline'][
                'description'] = 'Archive data for run %s' % run['run_id']
            cfg['config']['pipeline']['output_dir'] = '/scratch/cgi/irods'

            # Get id from DB
            db_info = dbmodel.PipelineDb(cfg['name'], cfg,
                                         Pipeline.ordered_steps(cfg), user)
            cfg['run_id'] = db_info.run_id

            ut.pretty_print("Submitting pipeline %s (ID %d) for user %s" %
                            (cfg['label'], cfg['run_id'], user))
            return pm.add_pipeline(cfg, user)
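The per-file metadata fan-out in the loop above follows one rule: a list holds one value per output file, a scalar applies to every file. In isolation, with an invented job layout:

    job_meta = {'sample_id': ['S1', 'S2'], 'lane': 3}  # hypothetical job['meta']
    output_files = ['a.vcf', 'b.vcf']

    for i, filename in enumerate(output_files):
        filemeta = {}
        for meta_key, meta in job_meta.items():
            okey = 'Operational sample accession' if meta_key == 'sample_id' else meta_key
            filemeta[okey] = meta[i] if isinstance(meta, list) else meta
        print('%s %s' % (filename, filemeta))
    # a.vcf {'Operational sample accession': 'S1', 'lane': 3}
    # b.vcf {'Operational sample accession': 'S2', 'lane': 3}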
Example #9
    def __init__(self, cfg, user='******', db=True, schedname="SCHED_CONDOR"):
        """
        Read in the pipeline graph and load the configuration.
        """
        self.all_ok = True
        self.user = user
        self.status = JOB_STATUS.QUEUED
        self.lock = ''

        self.completed = []
        self.running = {}
        self.outputs = {}
        self.schedname = schedname
        db_model_name = "MONGO_DB" if db else "STUB_DB"

        # Load configuration
        self.one_step = False
        try:
            self.cfg = Pipeline.load_cfg(cfg)
        except Exception as e1:
            print('Failed to load config as pipeline (error=%s). Trying as step' % e1)
            try:
                self.cfg = Step.load_cfg(cfg)
                self.step = Step.load_step(self.cfg)
                self.one_step = True
            except Exception as e2:
                # The original built the exception without raising it,
                # silently swallowing the failure
                raise Exception("Unable to load config file %s:\n" \
                                "pipeline load: %s\n" \
                                "step load: %s" % (cfg, e1, e2))

        # Set all additional information
        self.run_id = self.cfg.get('run_id')
        if self.one_step:
            self.name  = self.step.name
            self.label = self.step.name
            self.project_name = self.cfg.get('project_name', '')
            self.description  = self.cfg.get('description', '')
            self.output_dir   = self.step.output_dir
            self.ordered      = [self.step.name]
        else:
            self.name  = self.cfg['name']
            self.label = self.cfg['label']
            self.project_name = self.cfg['config']['pipeline'].get('project_name', '')
            self.description  = self.cfg['config']['pipeline'].get('description', '')
            self.output_dir   = self.cfg['config']['pipeline']['output_dir']
            if not self.output_dir.startswith('/scratch'):
                self.cfg['dag']['nodes'][FINAL_STEP] = 'utils.Finalize' #TODO: Make it work for one_step as well
            self.ordered      = Pipeline.ordered_steps(self.cfg)


        self.sys_path = self.cfg.get('sys_path')
        if self.sys_path:
            sys.path.insert(0, self.sys_path)

        self.dag = self.create_dag(self.cfg, one_step=self.one_step)

        self.meta = {
            'pipeline': {
                'label': self.label,
                'project_name': self.project_name,
                'descr': self.description,
                'run_id': self.run_id
            },
            'steps': {},
            'job' : {}
        }

        self.db = db_models[db_model_name](self.name, self.cfg, self.ordered, self.user, output_dir=self.output_dir)
        if hasattr(self.db, 'run_id'):
            self.run_id = self.db.run_id
            self.cfg['run_id'] = self.run_id

        # Define the output directories
        if not os.path.exists(self.output_dir):
            os.makedirs(self.output_dir, 0775)

        # Use default output dir under /scratch/cgi/nespipe (linked to user-defined dir.)
        # if: a) this run is using the db (so we have a run ID); b) it is not a demux. run;
        # and c) the user-defined directory is not already under /scratch
        if self.run_id and self.name != 'demultiplexing':
            dirname = '%s_%d' % (self.name, self.db.run_id)
            self.output_dir = os.path.join(self.output_dir, dirname)
            if not os.path.exists(self.output_dir):
                os.makedirs(self.output_dir, 0775)
            # In case of /scratch, do not create an additional sub-directory
            if self.output_dir.startswith('/scratch'):
                self.work_dir = self.output_dir
            else:
                self.work_dir = os.path.join(WORK_DIR, self.user, dirname)
                if not os.path.exists(self.work_dir):
                    os.makedirs(self.work_dir, 0775)
                symlink = os.path.join(self.output_dir, 'work_area')
                if not os.path.exists(symlink):
                    os.symlink(self.work_dir, symlink)
        else:
            self.work_dir = self.output_dir

        ut.pretty_print('Output directories: output_dir=%s, work_dir=%s' % (self.output_dir, self.work_dir))
        self.db.update_pipeline(self.run_id, {'output_dir': self.output_dir,
                                              'work_dir':   self.work_dir })
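The directory layout this constructor builds (results written to a scratch work_dir, exposed through a 'work_area' symlink in the user-visible output_dir) can be reproduced standalone; the temporary paths are stand-ins:

    import os
    import tempfile

    output_dir = tempfile.mkdtemp(prefix='run_out_')  # user-visible directory
    work_dir = tempfile.mkdtemp(prefix='run_work_')   # fast scratch directory
    link = os.path.join(output_dir, 'work_area')
    if not os.path.exists(link):
        os.symlink(work_dir, link)  # Unix only

    print(os.path.realpath(link) == os.path.realpath(work_dir))  # True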