Beispiel #1
0
    def copy(cls,
             src_uri=None,
             parsed_src_uri=None,
             dest_uri=None,
             parsed_dest_uri=None,
             **kwargs):
        """
        Copy data to/from/within workflow contexts.

        Source and destination URIs are parsed to extract their schemes
        (contexts), and the class attribute named
        ``_copy_<src scheme>_<dest scheme>`` is called. For each end, either
        the raw URI or the already parsed URI may be given; if both are
        given, the parsed URI is used.

        Args:
            src_uri: Source URI.
            parsed_src_uri: Source URI, already parsed.
            dest_uri: Destination URI.
            parsed_dest_uri: Destination URI, already parsed.
            **kwargs: Context-specific options, keyed by scheme name. Only
                entries matching the source or destination scheme are
                forwarded; a scheme without an entry is simply omitted.

        Returns:
            The result of the context-specific copy method. False if either
            URI is invalid or no copy method is defined for the scheme pair.

        """
        # parse and validate src URI
        if not parsed_src_uri:
            parsed_src_uri = URIParser.parse(src_uri)
            if not parsed_src_uri:
                Log.an().error('invalid src uri: %s', src_uri)
                return False

        # parse and validate dest URI
        if not parsed_dest_uri:
            parsed_dest_uri = URIParser.parse(dest_uri)
            if not parsed_dest_uri:
                Log.an().error('invalid dest uri: %s', dest_uri)
                return False

        src_scheme = parsed_src_uri['scheme']
        dest_scheme = parsed_dest_uri['scheme']

        # check if copy method exists for contexts
        try:
            copy_func = getattr(
                cls, '_copy_{}_{}'.format(src_scheme, dest_scheme))
        except AttributeError:
            Log.an().error('_copy_%s_%s method not defined',
                           src_scheme, dest_scheme)
            return False

        # forward only the options of the contexts involved; skip schemes
        # the caller gave no options for instead of failing with KeyError
        context_options = {
            scheme: kwargs[scheme]
            for scheme in {src_scheme, dest_scheme}
            if scheme in kwargs
        }

        return copy_func(parsed_src_uri, parsed_dest_uri, **context_options)
Beispiel #2
0
    def mkdir(cls, uri=None, parsed_uri=None, recursive=False, **kwargs):
        """
        Create a directory at the given URI.

        The URI scheme selects the context-specific method to call:
        ``_mkdir_recursive_<scheme>`` when recursive is true, otherwise
        ``_mkdir_<scheme>``. Pass either uri or parsed_uri; when both are
        given, parsed_uri takes precedence.

        Args:
            uri: URI to create.
            parsed_uri: URI to create, already parsed.
            recursive: If true, recursively create parent directories.
            **kwargs: Other arguments specific to context.

        Returns:
            The result of the context-specific mkdir method. False if the
            URI is invalid or no mkdir method is defined for the scheme.

        """
        # fall back to parsing the raw URI when no parsed URI was supplied
        if not parsed_uri:
            parsed_uri = URIParser.parse(uri)
        if not parsed_uri:
            Log.an().error('invalid uri: %s', uri)
            return False

        # pick the method name and matching error message for the mode
        if recursive:
            method_template = '_mkdir_recursive_{}'
            missing_msg = '_mkdir_recursive_%s method not defined'
        else:
            method_template = '_mkdir_{}'
            missing_msg = '_mkdir_%s method not defined'

        try:
            mkdir_func = getattr(
                cls, method_template.format(parsed_uri['scheme']))
        except AttributeError:
            Log.an().error(missing_msg, parsed_uri['scheme'])
            return False

        # re-parse the chopped URI so the final slash is always removed
        chopped_parsed_uri = URIParser.parse(parsed_uri['chopped_uri'])
        return mkdir_func(chopped_parsed_uri, **kwargs)
Beispiel #3
0
    def upload_agave_test_data(self):
        """
        Upload Agave test data from workflow package.

        Creates the base test data directory on the Agave deployment
        system, then copies the package's local 'data' directory to a
        location under it named after the package directory.

        Args:
            self: class instance.

        Returns:
            On success: True.
            On failure: False.

        """
        # wrapper, parameters and the 'agave' section are all required
        if not (self._agave_wrapper and self._agave_params
                and self._agave_params.get('agave')):
            Log.a().warning(
                'must provide agave parameters to upload test data')
            return False

        # create main test data URI
        agave_section = self._agave_params['agave']
        base_test_uri = 'agave://{}/{}'.format(
            agave_section['deploymentSystem'], agave_section['testDataDir'])
        parsed_base_test_uri = URIParser.parse(base_test_uri)
        Log.some().info('creating base test data uri: %s',
                        parsed_base_test_uri['chopped_uri'])
        base_created = DataManager.mkdir(
            parsed_uri=parsed_base_test_uri,
            recursive=True,
            agave={'agave_wrapper': self._agave_wrapper})
        if not base_created:
            Log.a().warning('cannot create base test data uri: %s',
                            parsed_base_test_uri['chopped_uri'])
            return False

        # upload test data
        package_path = Path(self._path)
        parsed_local_test_uri = URIParser.parse(str(package_path / 'data'))
        parsed_agave_test_uri = URIParser.parse('{}/{}'.format(
            parsed_base_test_uri['chopped_uri'], package_path.name))
        Log.some().info('copying test data from %s to %s',
                        parsed_local_test_uri['chopped_uri'],
                        parsed_agave_test_uri['chopped_uri'])
        copied = DataManager.copy(
            parsed_src_uri=parsed_local_test_uri,
            parsed_dest_uri=parsed_agave_test_uri,
            local={},
            agave={'agave_wrapper': self._agave_wrapper})
        if not copied:
            Log.a().warning('cannot copy test data from %s to %s',
                            parsed_local_test_uri['chopped_uri'],
                            parsed_agave_test_uri['chopped_uri'])
            return False

        return True
Beispiel #4
0
    def list(cls, uri=None, parsed_uri=None, globstr='*', **kwargs):
        """
        List data in various contexts.

        The URI scheme selects the context-specific ``_list_<scheme>``
        method to call. Pass either uri or parsed_uri; when both are given,
        parsed_uri takes precedence.

        Args:
            uri: URI to list.
            parsed_uri: URI to list, already parsed.
            globstr: Pattern handed to the context-specific list method.
            **kwargs: Other arguments specific to context.

        Returns:
            The result of the context-specific list method. False if the
            URI is invalid or no list method is defined for the scheme.

        """
        # fall back to parsing the raw URI when no parsed URI was supplied
        if not parsed_uri:
            parsed_uri = URIParser.parse(uri)
        if not parsed_uri:
            Log.an().error('invalid uri: %s', uri)
            return False

        # look up the list method for this context
        method_name = '_list_{}'.format(parsed_uri['scheme'])
        try:
            list_func = getattr(cls, method_name)
        except AttributeError:
            Log.an().error('_list_%s method not defined', parsed_uri['scheme'])
            return False

        return list_func(parsed_uri, globstr, **kwargs)
Beispiel #5
0
    def _init_archive_uri(self):
        """
        Initialize and validate Agave job archive URI.

        The archive URI is the '_agave_jobs' location under the agave job
        work URI; it is parsed, stored in self._parsed_archive_uri and then
        created recursively.

        Args:
            None.

        Returns:
            On success: True.
            On failure: False.

        """
        if 'agave' not in self._parsed_job_work_uri:
            Log.an().error('job work uri must include an agave context')
            return False

        # construct archive URI
        agave_work_uri = self._parsed_job_work_uri['agave']
        archive_uri = '{}/_agave_jobs'.format(agave_work_uri['chopped_uri'])
        self._parsed_archive_uri = URIParser.parse(archive_uri)
        if not self._parsed_archive_uri:
            Log.an().error('invalid job work uri: %s', agave_work_uri)
            return False

        # create URI
        created = DataManager.mkdir(
            parsed_uri=self._parsed_archive_uri,
            recursive=True,
            agave=self.get_context_options()
        )
        if created:
            return True

        Log.an().error('cannot create agave archive uri: %s',
                       self._parsed_archive_uri['chopped_uri'])
        return False
def _mkdir_recursive_agave(uri, agave):
    """
    Recursively create agave directory specified by URI.

    Missing parent folders are created first, one level at a time, before
    the directory itself is created.

    Args:
        uri: parsed URI to create.
        agave: dict that contains:
            agave_wrapper: Agave wrapper object.

    Returns:
        On success: True.
        On failure: False.

    """
    # when the containing folder is the root there is no parent to create
    if uri['folder'] == '/':
        return _mkdir_agave(uri, agave)

    # make sure parent folder exists first
    parent_uri = URIParser.parse('{}://{}{}'.format(
        uri['scheme'], uri['authority'], uri['folder']))

    # only attempt creation when the parent does not exist yet
    parent_ready = (
        _exists_agave(parent_uri, agave)
        or _mkdir_recursive_agave(parent_uri, agave)
    )
    if not parent_ready:
        Log.an().error(
            'cannot create parent folder at uri: %s',
            parent_uri['chopped_uri']
        )
        return False

    return _mkdir_agave(uri, agave)
Beispiel #7
0
    def _init_data_context_set(self):
        """
        Initialize set of data contexts, determined by inputs and output.

        The scheme of the first value of every workflow input, plus the
        scheme of the job output URI, is added to self._data_contexts. Each
        collected context is then validated.

        Args:
            self: class instance

        Returns:
            On success: True.
            On failure: result of self._fatal().

        """
        # check input URIs for data contexts
        workflow_inputs = self._workflow['inputs']
        for input_key in workflow_inputs:
            input_uri = workflow_inputs[input_key]['value'][0]
            parsed_input_uri = URIParser.parse(input_uri)
            if not parsed_input_uri:
                msg = 'invalid input uri: {}'.format(input_uri)
                Log.an().error(msg)
                return self._fatal(msg)

            self._data_contexts.add(parsed_input_uri['scheme'])

        # add output URI data context
        output_uri = self._job['output_uri']
        parsed_output_uri = URIParser.parse(output_uri)
        if not parsed_output_uri:
            msg = 'invalid base of job output uri: {}'.format(output_uri)
            Log.an().error(msg)
            return self._fatal(msg)

        self._data_contexts.add(parsed_output_uri['scheme'])

        # check validity of data contexts
        for data_context in self._data_contexts:
            if Contexts.is_data_context(data_context):
                continue

            msg = 'invalid data context: {}'.format(data_context)
            Log.an().error(msg)
            return self._fatal(msg)

        Log.some().debug('data contexts: %s', self._data_contexts)

        return True
Beispiel #8
0
    def _init_app_paths(self):
        """
        Add app paths to environment PATH for local workflows.

        The package path contains the workflow definition YAML file and shell
        scripts for calling individual apps used in a workflow. For every
        entry listed under the package's 'apps' folder, the entry's 'assets'
        sub-directory is prepended to the PATH environment variable.

        Args:
            None.

        Returns:
            On success: True (also when there is no apps directory).
            On failure: False.

        """
        parsed_uri = URIParser.parse(self._workflow_path)
        if not parsed_uri:
            Log.an().error('invalid workflow path: %s', self._workflow_path)
            return False

        # avoid a double slash when the workflow folder is the root;
        # build the apps URI once and reuse it for parsing and logging
        apps_uri = ('{}{}' if parsed_uri['folder'] == '/' else '{}/{}')\
            .format(parsed_uri['folder'], 'apps')
        parsed_apps_uri = URIParser.parse(apps_uri)
        if not parsed_apps_uri:
            Log.an().error('cannot construct apps uri: %s', apps_uri)
            return False

        if not DataManager.exists(parsed_uri=parsed_apps_uri):
            # no apps directory
            return True

        # a failed listing is reported as a non-iterable value (False);
        # stop here instead of crashing with TypeError in the loop below
        app_dirs = DataManager.list(parsed_uri=parsed_apps_uri)
        if app_dirs is False or app_dirs is None:
            Log.an().error('cannot list apps uri: %s', apps_uri)
            return False

        for app_dir in app_dirs:
            try:
                # prepend so the workflow's app assets take precedence
                os.environ['PATH'] = '{}{}{}'.format(
                    os.path.join(parsed_apps_uri['chopped_path'], app_dir,
                                 'assets'), os.pathsep, os.environ['PATH'])

            except OSError as err:
                Log.an().error('workflow app pathmunge error [%s]', str(err))
                return False

        return True
Beispiel #9
0
    def initialize(self):
        """
        Initialize instance of StageableData.

        Every data URI is parsed with URIParser and stored, keyed by its
        context, in self._parsed_data_uris. The source context must be one
        of those contexts.

        Args:
            self: class instance.

        Returns:
            On success: True.
            On failure: result of self._fatal().

        """
        # parse data uris
        for context in self._data_uris:
            data_uri = self._data_uris[context]
            parsed_data_uri = URIParser.parse(data_uri)
            if not parsed_data_uri:
                msg = 'invalid data uri for context: {}->{}'.format(
                    context, data_uri
                )
                Log.an().error(msg)
                return self._fatal(msg)

            # path cannot be root (/)
            if parsed_data_uri['chopped_path'] == '/':
                msg = 'context data uri cannot be root (/): {}->{}'.format(
                    context, data_uri
                )
                Log.an().error(msg)
                return self._fatal(msg)

            self._parsed_data_uris[context] = parsed_data_uri

        # the source context must be one of the listed URI contexts
        if self._source_context in self._parsed_data_uris:
            return True

        msg = 'source context must one of the data uri contexts: {}'.\
            format(self._source_context)
        Log.an().error(msg)
        return self._fatal(msg)
Beispiel #10
0
    def files_list(self, system_id, file_path, depth=1):
        """
        Wrap AgavePy file listing command.

        Entries whose name is empty or starts with '.' are skipped.
        Sub-directories are listed recursively until the depth limit is
        reached; their entries follow the entries of file_path itself.

        Args:
            self: class instance.
            system_id: Identifier for Agave storage system.
            file_path: Path for file listing.
            depth: Number of directory levels to list. 1 lists only
                file_path; -1 means unlimited depth.

        Returns:
            List of dicts, one per entry, with keys 'path', 'name' and
            'type'.

        """
        listing = self._agave.files.list(systemId=system_id,
                                         filePath=file_path + '/',
                                         limit=1000000)
        files = []
        for entry in listing:
            # skip hidden and unnamed entries
            if entry.name[:1] == '.' or entry.name == '':
                continue

            entry_uri = URIParser.parse('{}://{}{}'.format(
                'agave', system_id, entry.path))
            files.append({
                'path': entry_uri['folder'],
                'name': entry_uri['name'],
                'type': entry.type
            })

        # list all subdirectories if not at max depth
        # depth of -1 means unlimited depth
        descend = depth > 1 or depth == -1
        next_depth = depth - 1 if depth > 1 else depth
        subdir_listings = {}
        if descend:
            for item in files:
                if item['type'] != 'dir':
                    continue
                subdir_listings[item['name']] = self.files_list(
                    system_id, item['path'] + '/' + item['name'], next_depth)

        # append all items found in subdirectories to files
        return files + list(
            itertools.chain.from_iterable(subdir_listings.values()))
Beispiel #11
0
    def exists(cls, uri=None, parsed_uri=None, **kwargs):
        """
        Check if URI exists.

        The URI scheme selects the context-specific ``_exists_<scheme>``
        method to call. Pass either uri or parsed_uri; when both are given,
        parsed_uri takes precedence.

        Args:
            uri: URI to check.
            parsed_uri: URI to check, already parsed.
            **kwargs: Other arguments specific to context.

        Returns:
            The result of the context-specific exists method. None if the
            URI is invalid or no exists method is defined for the scheme.

        """
        # fall back to parsing the raw URI when no parsed URI was supplied
        if not parsed_uri:
            parsed_uri = URIParser.parse(uri)
        if not parsed_uri:
            Log.an().error('invalid uri: %s', uri)
            return None

        # look up the exists method for this context
        method_name = '_exists_{}'.format(parsed_uri['scheme'])
        try:
            exists_func = getattr(cls, method_name)
        except AttributeError:
            Log.an().error('_exists_%s method not defined',
                           parsed_uri['scheme'])
            return None

        return exists_func(parsed_uri, **kwargs)
Beispiel #12
0
    def delete(cls, uri=None, parsed_uri=None, **kwargs):
        """
        Delete URI.

        The URI scheme selects the context-specific ``_delete_<scheme>``
        method to call. Pass either uri or parsed_uri; when both are given,
        parsed_uri takes precedence.

        Args:
            uri: URI to delete.
            parsed_uri: URI to delete, already parsed.
            **kwargs: Other arguments specific to context.

        Returns:
            The result of the context-specific delete method. False if the
            URI is invalid or no delete method is defined for the scheme.

        """
        # fall back to parsing the raw URI when no parsed URI was supplied
        if not parsed_uri:
            parsed_uri = URIParser.parse(uri)
        if not parsed_uri:
            Log.an().error('invalid uri: %s', uri)
            return False

        # look up the delete method for this context
        method_name = '_delete_{}'.format(parsed_uri['scheme'])
        try:
            delete_func = getattr(cls, method_name)
        except AttributeError:
            Log.an().error('_delete_%s method not defined',
                           parsed_uri['scheme'])
            return False

        return delete_func(parsed_uri, **kwargs)
Beispiel #13
0
    def _parse_map_uri(self):
        """
        Parse and validate the map URI, for map-reduce processing.

        The map URI can point to the output URIs of previous workflow steps.

        The map URI template value can take the following forms:
            {workflow->input-name}: 'input-name' must be part of workflow-level
                inputs (i.e., self._inputs)
            {step-name->output}: 'step-name' must be a valid step name, and
                must be listed in the 'depend' list.

        On success self._map_uri is set (to '' when no map URI is defined)
        and, when a template was given, self._parsed_map_uri as well.

        Args:
            self: class instance.

        Returns:
            On success: True.
            On failure: result of self._fatal().

        """
        template = self._step['map']['uri']

        # map URI is an optional definition field
        if not template:
            self._map_uri = ''
            return True

        match = re.match(r'{([^{}]+)->([^{}]+)}', template)
        if not match:
            # invalid format
            msg = 'invalid template value for step map uri: {}'.format(
                template
            )
            Log.an().error(msg)
            return self._fatal(msg)

        source_name = match.group(1)
        target_name = match.group(2)

        if source_name == 'workflow':
            # use workflow-level input uri; its name must be a known input
            if target_name not in self._inputs:
                msg = 'invalid template reference to input: {}'\
                    .format(template)
                Log.an().error(msg)
                return self._fatal(msg)

            # make sure the input URI to be used as the map URI is valid
            self._parsed_map_uri = URIParser.parse(self._inputs[target_name])
            if not self._parsed_map_uri:
                msg = 'invalid map uri for inputs.{}: {}'\
                    .format(target_name, self._inputs[target_name])
                Log.an().error(msg)
                return self._fatal(msg)

            self._map_uri = self._parsed_map_uri['chopped_uri']
            return True

        # use uri from previous step, which must be a dependency
        if source_name not in self._step['depend']:
            msg = 'template reference to step must be listed as dependent: {}'\
                .format(template)
            Log.an().error(msg)
            return self._fatal(msg)

        if target_name != 'output':
            msg = 'invalid template reference, must be "output": {}'\
                .format(template)
            Log.an().error(msg)
            return self._fatal(msg)

        self._map_uri = self._depend_uris[source_name]['chopped_uri']
        self._parsed_map_uri = self._depend_uris[source_name]

        return True
Beispiel #14
0
    def _init_job_uris(self):
        """
        Initialize all work and output URIs.

        For the data scheme of every execution context, the configured work
        URI is validated, extended with the hashed job directory and stored
        in self._parsed_job_work_uri. The output URI is handled the same
        way, using the plain job directory when 'no_output_hash' is set, and
        stored in self._parsed_job_output_uri.

        Args:
            self: class instance

        Returns:
            On success: True.
            On failure: result of self._fatal().

        """
        # name of the job directory
        job_dir = slugify(self._job['name'], regex_pattern=r'[^-a-z0-9_]+')
        job_dir_hash = '{}-{}'.format(job_dir, self._job['job_id'][:8])

        # validate work URI for each exec context
        #   use the 'data_scheme' for each execution context
        #   and place into a set to remove repeats
        data_schemes = {
            Contexts.get_data_scheme_of_exec_context(con)
            for con in self._exec_contexts
        }
        for context in data_schemes:
            # work_uri must be set for each exec_context
            if context not in self._job['work_uri']:
                msg = 'missing work_uri for context: {}'.format(context)
                Log.an().error(msg)
                return self._fatal(msg)

            base_work_uri = self._job['work_uri'][context]
            parsed_base_work_uri = URIParser.parse(base_work_uri)
            if not parsed_base_work_uri:
                msg = 'invalid base of job work uri for context: {}->{}'.format(
                    context, base_work_uri
                )
                Log.an().error(msg)
                return self._fatal(msg)

            # append hashed job dir to each context; no extra slash at root
            if parsed_base_work_uri['chopped_path'] == '/':
                work_template = '{}{}'
            else:
                work_template = '{}/{}'
            full_job_work_uri = work_template.format(
                parsed_base_work_uri['chopped_uri'], job_dir_hash)

            # validate again after appending
            parsed_job_work_uri = URIParser.parse(full_job_work_uri)
            if not parsed_job_work_uri:
                msg = 'invalid job work uri for context: {}->{}'.format(
                    context, full_job_work_uri
                )
                Log.an().error(msg)
                return self._fatal(msg)

            self._parsed_job_work_uri[context] = parsed_job_work_uri

        # validate output URI
        parsed_base_output_uri = URIParser.parse(self._job['output_uri'])
        if not parsed_base_output_uri:
            msg = 'invalid base of job output uri: {}'.format(
                self._job['output_uri']
            )
            Log.an().error(msg)
            return self._fatal(msg)

        # append job dir (hashed or not) to output uri
        if parsed_base_output_uri['chopped_path'] == '/':
            output_template = '{}{}'
        else:
            output_template = '{}/{}'
        full_job_output_uri = output_template.format(
            parsed_base_output_uri['chopped_uri'],
            job_dir if self._job['no_output_hash'] else job_dir_hash
        )

        # validate again after appending
        parsed_job_output_uri = URIParser.parse(full_job_output_uri)
        if not parsed_job_output_uri:
            msg = 'invalid job output uri: {}'.format(full_job_output_uri)
            Log.an().error(msg)
            return self._fatal(msg)

        self._parsed_job_output_uri = parsed_job_output_uri

        return True
Beispiel #15
0
    def _init_context_uris(self):
        """
        Generate all context URIs for this workflow run.

        Context URIs are generated based on contexts given in
        _parsed_job_work_uri, and the "final" context for steps given in the
        _parsed_job_output_uri.

        Two parallel structures are filled: _context_uris holds URI strings
        and _parsed_context_uris holds the corresponding parsed URIs. Both
        are laid out as:
            ['inputs'][context][node name]
            ['steps'][context][node name]
            ['steps']['final'][node name]

        For an input whose source context differs from the context being
        initialized, a new '_input-<name>' base directory is created under
        the job work URI of that context and the input URI is switched to
        that base.

        Args:
            None.

        Returns:
            None.

        Raises:
            WorkflowDAGException: if an input URI is invalid, a new base URI
                cannot be created, or an input URI cannot be switched to the
                new base.

        """
        # reset both structures; 'final' holds step URIs under the output URI
        self._context_uris['inputs'] = {}
        self._context_uris['steps'] = {'final': {}}
        self._parsed_context_uris['inputs'] = {}
        self._parsed_context_uris['steps'] = {'final': {}}

        # init contexts in parsed_job_work_uri for inputs and steps
        for context in self._parsed_job_work_uri:

            self._context_uris['inputs'][context] = {}
            self._context_uris['steps'][context] = {}
            self._parsed_context_uris['inputs'][context] = {}
            self._parsed_context_uris['steps'][context] = {}

            # visit graph nodes in topological order
            for node_name in self._topo_sort:

                node = self._graph.nodes[node_name]
                if node['type'] == 'input':
                    if node['source_context'] == context:
                        # use original input URI
                        parsed_uri = URIParser.parse(
                            self._workflow['inputs'][node['name']]['value'])
                        if not parsed_uri:
                            msg = 'invalid input uri: {}'.format(
                                self._workflow['inputs'][
                                    node['name']]['value'])
                            raise WorkflowDAGException(msg)

                        self._context_uris['inputs'][context][node['name']]\
                            = parsed_uri['chopped_uri']
                        self._parsed_context_uris['inputs'][context]\
                            [node['name']] = parsed_uri

                    else:
                        # switch context of input URI
                        new_base_uri = '{}/_input-{}'.format(
                            self._parsed_job_work_uri[context]['chopped_uri'],
                            slugify(node['name']))

                        # create new base URI, passing only the options of
                        # the context being initialized
                        if not DataManager.mkdir(
                                uri=new_base_uri,
                                recursive=True,
                                **{context: self._context_options[context]}):
                            msg = 'cannot create new base uri for input: {}'\
                                .format(new_base_uri)
                            Log.an().error(msg)
                            raise WorkflowDAGException(msg)

                        # switch input URI base
                        switched_uri = URIParser.switch_context(
                            self._workflow['inputs'][node['name']]['value'],
                            new_base_uri)
                        if not switched_uri:
                            msg = (
                                'cannot switch input uri context to '
                                'new base URI: {}->{}'
                            ).format(
                                self._workflow['inputs'][node['name']]\
                                    ['value'],
                                new_base_uri
                            )
                            Log.an().error(msg)
                            raise WorkflowDAGException(msg)

                        self._context_uris['inputs'][context][node['name']]\
                            = switched_uri['chopped_uri']
                        self._parsed_context_uris['inputs'][context]\
                            [node['name']] = switched_uri

                else:  # node['type'] == 'step'
                    # step URI lives directly under the context's work URI
                    self._context_uris['steps'][context][node['name']]\
                        = '{}/{}'.format(
                            self._parsed_job_work_uri[context]['chopped_uri'],
                            slugify(node['name'])
                        )
                    # NOTE(review): the parse result is stored without being
                    # validated
                    self._parsed_context_uris['steps'][context][node['name']]\
                        = URIParser.parse(
                            self._context_uris['steps'][context][node['name']]
                        )

        # init final contexts for steps
        for node_name in self._topo_sort:

            node = self._graph.nodes[node_name]

            if node['type'] == 'step':
                # final step URI lives directly under the job output URI
                self._context_uris['steps']['final'][node['name']]\
                    = '{}/{}'.format(
                        self._parsed_job_output_uri['chopped_uri'],
                        slugify(node['name'])
                    )
                # NOTE(review): the parse result is stored without being
                # validated
                self._parsed_context_uris['steps']['final'][node['name']]\
                    = URIParser.parse(
                        self._context_uris['steps']['final'][node['name']]
                    )
Beispiel #16
0
    def register_agave_app(self, agave, agave_config, agave_params, agave_publish):
        """
        Register app in Agave.

        Args:
            self: class instance

        Returns:
            On success: True.
            On failure: False.

        """
        Log.some().info('registering agave app %s', str(self._path))
        Log.some().info('app version: %s', self._config['version'])

        # compile agave app template
        if not TemplateCompiler.compile_template(
                self._path,
                'agave-app-def.json.j2',
                self._path / 'agave-app-def.json',
                version=self._config['version'],
                agave=agave_params['agave']
        ):
            Log.a().warning(
                'cannot compile agave app "%s" definition from template',
                self._app['name']
            )
            return False

        # create main apps URI
        parsed_agave_apps_uri = URIParser.parse(
            'agave://{}/{}'.format(
                agave_params['agave']['deploymentSystem'],
                agave_params['agave']['appsDir']
            )
        )
        Log.some().info(
            'creating main apps uri: %s',
            parsed_agave_apps_uri['chopped_uri']
        )
        if not DataManager.mkdir(
                parsed_uri=parsed_agave_apps_uri,
                recursive=True,
                agave={
                    'agave': agave,
                    'agave_config': agave_config
                }
        ):
            Log.a().warning('cannot create main agave apps uri')
            return False

        # delete app uri if it exists
        parsed_app_uri = URIParser.parse(
            'agave://{}/{}/{}'.format(
                agave_params['agave']['deploymentSystem'],
                agave_params['agave']['appsDir'],
                self._app['folder']
            )
        )
        Log.some().info(
            'deleting app uri if it exists: %s',
            parsed_app_uri['chopped_uri']
        )
        if not DataManager.delete(
                parsed_uri=parsed_app_uri,
                agave={
                    'agave': agave,
                    'agave_config': agave_config
                }
        ):
            # log warning, but ignore.. deleting non-existant uri returns False
            Log.a().warning(
                'cannot delete app uri: %s', parsed_app_uri['chopped_uri']
            )

        # upload app assets
        parsed_assets_uri = URIParser.parse(str(self._path / 'assets'))
        Log.some().info(
            'copying app assets from %s to %s',
            parsed_assets_uri['chopped_uri'],
            parsed_app_uri['chopped_uri']
        )

        if not DataManager.copy(
                parsed_src_uri=parsed_assets_uri,
                parsed_dest_uri=parsed_app_uri,
                local={},
                agave={
                    'agave': agave,
                    'agave_config': agave_config
                }
        ):
            Log.a().warning(
                'cannot copy app assets from %s to %s',
                parsed_assets_uri['chopped_uri'],
                parsed_app_uri['chopped_uri']
            )
            return False

        # upload test script
        parsed_test_uri = URIParser.parse(
            '{}/{}'.format(
                parsed_app_uri['chopped_uri'],
                'test'
            )
        )
        Log.some().info(
            'creating test uri: %s', parsed_test_uri['chopped_uri']
        )
        if not DataManager.mkdir(
                parsed_uri=parsed_test_uri,
                recursive=True,
                agave={
                    'agave': agave,
                    'agave_config': agave_config
                }
        ):
            Log.a().warning(
                'cannot create test uri: %s', parsed_test_uri['chopped_uri']
            )
            return False

        parsed_local_test_script = URIParser.parse(
            str(self._path / 'test' / 'test.sh')
        )
        parsed_agave_test_script = URIParser.parse(
            '{}/{}'.format(parsed_test_uri['chopped_uri'], 'test.sh')
        )
        Log.some().info(
            'copying test script from %s to %s',
            parsed_local_test_script['chopped_uri'],
            parsed_agave_test_script['chopped_uri']
        )
        if not DataManager.copy(
                parsed_src_uri=parsed_local_test_script,
                parsed_dest_uri=parsed_agave_test_script,
                local={},
                agave={
                    'agave': agave,
                    'agave_config': agave_config
                }
        ):
            Log.a().warning(
                'cannot copy test script from %s to %s',
                parsed_local_test_script['chopped_uri'],
                parsed_agave_test_script['chopped_uri']
            )
            return False

        # update existing app, or register new app
        Log.some().info('registering agave app')

        app_definition = self._yaml_to_dict(
            str(self._path / 'agave-app-def.json')
        )
        if not app_definition:
            Log.a().warning(
                'cannot load agave app definition: %s',
                str(self._path / 'agave-app-def.json')
            )
            return False

        agwrap = AgaveAppsAddUpdate(
            agave, agave_config
        )
        app_add_result = agwrap.call(app_definition)
        if not app_add_result:
            Log.a().warning(
                'cannot register agave app:\n%s', pprint.pformat(app_definition)
            )
            return False

        register_result = {}

        # publish app
        if agave_publish:
            Log.some().info('publishing agave app')

            agwrap = AgaveAppsPublish(
                agave, agave_config
            )
            app_publish_result = agwrap.call(app_add_result['id'])
            if not app_publish_result:
                Log.a().warning(
                    'cannot publish agave app: %s', app_add_result['id']
                )
                return False

            # return published id and revision
            register_result = {
                'id': app_publish_result['id'],
                'version': self._config['version'],
                'revision': 'u{}'.format(app_publish_result['revision'])
            }

        else:
            # return un-published id and blank revision
            register_result = {
                'id': app_add_result['id'],
                'version': self._config['version'],
                'revision': ''
            }

        return register_result
Beispiel #17
0
    def _get_map_uri_list(self):
        """
        Collect the files found under every map URI (agave URIs only).

        Each map URI is listed through DataManager using the step's map
        glob. When the step's map is flagged 'inclusive', the map URI itself
        is also added if its name matches the glob.

        Args:
            self: class instance.

        Returns:
            On success: list of dicts, each with a 'chopped_uri' (containing
                folder URI) and a 'filename' key.
            On failure: the result of self._fatal() (non-agave scheme, or
                the URI contents cannot be listed).

        """
        entries = []

        for map_uri in self._parsed_map_uris:
            scheme = map_uri['scheme']

            # only agave URIs can be mapped by this step
            if scheme != 'agave':
                msg = 'invalid map uri scheme for this step: {}'.format(scheme)
                Log.an().error(msg)
                return self._fatal(msg)

            map_conf = self._step['map']

            # list the URI contents, filtered by the map glob
            listing = DataManager.list(
                parsed_uri=map_uri,
                globstr=map_conf['glob'],
                agave=self._agave
            )
            if listing is False:
                msg = 'cannot get contents of map uri: {}'\
                    .format(map_uri['chopped_uri'])
                Log.an().error(msg)
                return self._fatal(msg)

            # inclusive maps also consider the map URI itself as a candidate
            if map_conf['inclusive'] and glob.globfilter(
                    [map_uri['name']],
                    map_conf['glob'],
                    flags=glob.EXTGLOB|glob.GLOBSTAR
            ):
                entries.append({
                    'chopped_uri': '{}://{}{}'.format(
                        scheme,
                        map_uri['authority'],
                        map_uri['folder']
                    ),
                    'filename': map_uri['name']
                })

            for rel_path in listing:
                if '/' not in rel_path:
                    # item sits directly under the map URI
                    entries.append({
                        'chopped_uri': map_uri['chopped_uri'],
                        'filename': rel_path
                    })
                    continue

                # nested item: re-parse so that folder and name are split
                # at the last path component
                nested_uri = URIParser.parse(
                    '{}/{}'.format(map_uri['chopped_uri'], rel_path)
                )
                entries.append({
                    'chopped_uri': '{}://{}{}'.format(
                        nested_uri['scheme'],
                        nested_uri['authority'],
                        nested_uri['folder']
                    ),
                    'filename': nested_uri['name']
                })

        return entries
Beispiel #18
0
def step_impl(context):
    """
    Parse every URI registered on the context and store the parsed result.

    Each key of context.uris is run through URIParser.parse; the step fails
    (AssertionError) as soon as one URI cannot be parsed. On success the
    value stored under that key is replaced by the parsed URI.

    Args:
        context: object exposing a 'uris' mapping keyed by URI string.

    """
    uri_table = context.uris

    # only values are replaced, so iterating over the keys stays safe
    for raw_uri in uri_table:
        parsed = URIParser.parse(raw_uri)
        assert parsed
        uri_table[raw_uri] = parsed
Beispiel #19
0
    def _run_map(self, map_item):
        """
        Launch the local shell job for one map item and record its process.

        The command line is assembled from the app's local script, the
        resolved inputs and parameters, the step's execution settings and
        stdout/stderr redirection into the '_log' folder of the source
        context's data URI.

        Args:
            self: class instance.
            map_item: map item object (item of self._map). Its 'template',
                'attempt' and 'run' entries are read; 'status' and the
                current attempt's run record are updated.

        Returns:
            On success: True.
            On failure: the result of self._fatal() (process spawn failed).

        """
        template = map_item['template']

        # resolve inputs: template values win, otherwise non-empty defaults
        inputs = {}
        for key, spec in self._app['inputs'].items():
            if key in template:
                inputs[key] = template[key]
            elif spec['default']:
                inputs[key] = spec['default']

        # resolve parameters: template values win, otherwise any default
        # that is neither None nor the empty string
        parameters = {}
        for key, spec in self._app['parameters'].items():
            if key in template:
                parameters[key] = template[key]
            elif spec['default'] not in [None, '']:
                parameters[key] = spec['default']

        # assemble the shell command piece by piece
        cmd_parts = [self._app['implementation']['local']['script']]

        for key, value in inputs.items():
            if value:
                cmd_parts.append(' --{}="{}"'.format(
                    key,
                    URIParser.parse(value)['chopped_path']
                ))

        for key, value in parameters.items():
            if key == 'output':
                # output is written below the source context's data path
                cmd_parts.append(' --output="{}/{}"'.format(
                    self._parsed_data_uris[self._source_context]\
                        ['chopped_path'],
                    parameters['output']
                ))
            else:
                cmd_parts.append(' --{}="{}"'.format(key, value))

        # add execution method
        execution = self._step['execution']
        cmd_parts.append(' --exec_method="{}"'.format(execution['method']))

        # pass execution init commands only when the 'init' param is given
        if 'init' in execution['parameters']:
            cmd_parts.append(
                ' --exec_init="{}"'.format(execution['parameters']['init'])
            )

        # redirect stdout and stderr to per-attempt log files
        log_path = '{}/_log/gf-{}-{}-{}'.format(
            self._parsed_data_uris[self._source_context]['chopped_path'],
            map_item['attempt'],
            slugify(self._step['name'], regex_pattern=r'[^-a-z0-9_]+'),
            slugify(template['output'], regex_pattern=r'[^-a-z0-9_]+')
        )
        cmd_parts.append(' > "{}.out" 2> "{}.err"'.format(log_path, log_path))

        cmd = ''.join(cmd_parts)
        Log.a().debug('command: %s', cmd)

        # launch process
        proc = ShellWrapper.spawn(cmd)
        if proc is False:
            msg = 'shell process error: {}'.format(cmd)
            Log.an().error(msg)
            return self._fatal(msg)

        # record job info and mark both the item and the attempt as running
        attempt_record = map_item['run'][map_item['attempt']]
        attempt_record['proc'] = proc
        attempt_record['pid'] = proc.pid
        map_item['status'] = 'RUNNING'
        attempt_record['status'] = 'RUNNING'

        return True
Beispiel #20
0
    def _init_context_uris(self):
        """
        Build the plain and parsed context URIs for all inputs and steps.

        For every data context, each workflow input gets either its
        original URI (when the input already lives in that context) or a
        URI switched to a freshly created '_input-<name>' folder under the
        job work URI of that context. Each step gets one URI per execution
        data context (under the job work URI) plus a 'final' URI (under
        the job output URI).

        Args:
            self: class instance.

        Returns:
            Nothing.

        Raises:
            WorkflowDAGException: an input URI is invalid, the new base
                URI for an input cannot be created, or the input URI
                cannot be switched to the new base URI.

        """
        slug_pattern = r'[^-a-z0-9_]+'

        self._context_uris['inputs'] = {}
        self._context_uris['steps'] = {'final': {}}
        self._parsed_context_uris['inputs'] = {}
        self._parsed_context_uris['steps'] = {'final': {}}

        # data schemes implied by the execution contexts
        exec_data_schemes = {
            Contexts.get_data_scheme_of_exec_context(exec_context)
            for exec_context in self._exec_contexts
        }

        # inputs: one set of URIs per data context
        for context in exec_data_schemes | self._data_contexts:

            plain_inputs = {}
            parsed_inputs = {}
            self._context_uris['inputs'][context] = plain_inputs
            self._parsed_context_uris['inputs'][context] = parsed_inputs

            for node_name in self._topo_sort:

                node = self._graph.nodes[node_name]
                if node['type'] != 'input':
                    continue

                if node['source_context'] == context:
                    # input already lives in this context: keep its URI
                    input_name = node['name']
                    original_uri = self._workflow['inputs'][input_name]['value']
                    parsed_uri = URIParser.parse(original_uri)
                    if not parsed_uri:
                        raise WorkflowDAGException(
                            'invalid input uri: {}'.format(original_uri)
                        )

                    plain_inputs[input_name] = parsed_uri['chopped_uri']
                    parsed_inputs[input_name] = parsed_uri
                    continue

                # no work URI for this context means nothing executes
                # there, so no staging location is needed
                if context not in self._parsed_job_work_uri:
                    continue

                input_name = node['name']

                # staging folder for this input inside the work URI
                new_base_uri = '{}/_input-{}'.format(
                    self._parsed_job_work_uri[context]['chopped_uri'],
                    slugify(input_name, regex_pattern=slug_pattern))

                # create new base URI
                created = DataManager.mkdir(
                    uri=new_base_uri,
                    recursive=True,
                    **{context: self._context_options[context]})
                if not created:
                    msg = 'cannot create new base uri for input: {}'\
                        .format(new_base_uri)
                    Log.an().error(msg)
                    raise WorkflowDAGException(msg)

                # re-root the input URI onto the staging folder
                original_uri = self._workflow['inputs'][input_name]['value']
                switched_uri = URIParser.switch_context(
                    original_uri, new_base_uri)
                if not switched_uri:
                    msg = (
                        'cannot switch input uri context to '
                        'new base URI: {}->{}'
                    ).format(original_uri, new_base_uri)
                    Log.an().error(msg)
                    raise WorkflowDAGException(msg)

                plain_inputs[input_name] = switched_uri['chopped_uri']
                parsed_inputs[input_name] = switched_uri

        # steps: one URI per execution data context, under the work URI
        for context in exec_data_schemes:

            plain_steps = {}
            parsed_steps = {}
            self._context_uris['steps'][context] = plain_steps
            self._parsed_context_uris['steps'][context] = parsed_steps

            for node_name in self._topo_sort:

                node = self._graph.nodes[node_name]
                if node['type'] != 'step':
                    continue

                step_uri = '{}/{}'.format(
                    self._parsed_job_work_uri[context]['chopped_uri'],
                    slugify(node['name'], regex_pattern=slug_pattern)
                )
                plain_steps[node['name']] = step_uri
                parsed_steps[node['name']] = URIParser.parse(step_uri)

        # steps: 'final' URIs, under the job output URI
        final_plain = self._context_uris['steps']['final']
        final_parsed = self._parsed_context_uris['steps']['final']

        for node_name in self._topo_sort:

            node = self._graph.nodes[node_name]
            if node['type'] != 'step':
                continue

            final_uri = '{}/{}'.format(
                self._parsed_job_output_uri['chopped_uri'],
                slugify(node['name'], regex_pattern=slug_pattern)
            )
            final_plain[node['name']] = final_uri
            final_parsed[node['name']] = URIParser.parse(final_uri)
Beispiel #21
0
def run(args, other_args, subparser):
    """
    Run GeneFlow workflow engine.

    Loads the workflow definition into the local data source, builds the
    job definition(s) from an optional job file plus command-line
    overrides, imports the jobs and runs them in a process pool.

    Args:
        args: initially parsed arguments. Reads args.workflow_path
            (workflow definition or package directory), args.log_level
            and args.gui.
        other_args: list of remaining command-line args; parsed here
            against options generated from the workflow definition
            (-j/--job, --in.*, --param.*, -o, -n, -w, --exec-*).
        subparser: not used by this function.

    Returns:
        On success: list with one result per job, as returned by
            geneflow.cli.common.run_workflow. If some jobs failed an
            error is logged and the list is still returned.
        On failure: False.

    """
    # get absolute path to workflow
    workflow_path = resolve_workflow_path(args.workflow_path)
    if workflow_path:
        Log.some().info('workflow definition found: %s', workflow_path)
    else:
        Log.an().error('cannot find workflow definition: %s',
                       args.workflow_path)
        return False

    # setup environment
    env = Environment(workflow_path=workflow_path)
    if not env.initialize():
        Log.an().error('cannot initialize geneflow environment')
        return False

    # create default config file and SQLite db
    cfg = Config()
    cfg.default(env.get_sqlite_db_path())
    cfg.write(env.get_config_path())
    config_dict = cfg.config('local')

    # load workflow into db
    try:
        data_source = DataSource(config_dict['database'])
    except DataSourceException as err:
        Log.an().error('data source initialization error [%s]', str(err))
        return False

    defs = data_source.import_definition(workflow_path)
    if not defs:
        Log.an().error('workflow definition load failed: %s', workflow_path)
        return False

    if not defs['workflows']:
        Log.an().error('workflow definition load failed: %s', workflow_path)
        return False

    data_source.commit()

    for workflow in defs['workflows']:
        Log.some().info('workflow loaded: %s -> %s', workflow,
                        defs['workflows'][workflow])

    # get workflow definition back from database to ensure
    # that it's a valid definition
    workflow_id = next(iter(defs['workflows'].values()))
    workflow_dict = data_source.get_workflow_def_by_id(workflow_id)
    if not workflow_dict:
        Log.an().error(
            'cannot get workflow definition from data source: workflow_id=%s',
            workflow_id)
        return False

    ### define arg parsing methods
    def parse_dynamic_args(workflow_dict):
        """
        Parse dynamic args based on workflow dictionary as well as
        some static args.

        The remaining command-line args are taken from the enclosing
        function's other_args. Unrecognized args are ignored
        (parse_known_args).

        Args:
            workflow_dict: Workflow dictionary

        Returns:
            Namespace of parsed arguments.

        """
        # parse dynamic args. these are determined from workflow definition
        dynamic_parser = argparse.ArgumentParser()

        dynamic_parser.add_argument('-j',
                                    '--job',
                                    type=str,
                                    default=None,
                                    dest='job_path',
                                    help='Job Definition(s)')
        for input_key in workflow_dict['inputs']:
            dynamic_parser.add_argument(
                '--in.{}'.format(input_key),
                dest='inputs.{}'.format(input_key),
                required=False,
                default=workflow_dict['inputs'][input_key]['default'],
                help=workflow_dict['inputs'][input_key]['label'])
        for param_key in workflow_dict['parameters']:
            dynamic_parser.add_argument(
                '--param.{}'.format(param_key),
                dest='parameters.{}'.format(param_key),
                required=False,
                default=workflow_dict['parameters'][param_key]['default'],
                help=workflow_dict['parameters'][param_key]['label'])
        dynamic_parser.add_argument('-o',
                                    '--output',
                                    type=str,
                                    default='~/geneflow-output',
                                    help='Output Folder')
        dynamic_parser.add_argument('-n',
                                    '--name',
                                    type=str,
                                    default='geneflow-job',
                                    help='Name of Job')
        dynamic_parser.add_argument('-w',
                                    '--work',
                                    nargs='+',
                                    type=str,
                                    default=[],
                                    help='Work Directory')
        dynamic_parser.add_argument('--exec-context',
                                    '--ec',
                                    nargs='+',
                                    type=str,
                                    dest='exec_context',
                                    default=[],
                                    help='Execution Contexts')
        dynamic_parser.add_argument('--exec-method',
                                    '--em',
                                    nargs='+',
                                    type=str,
                                    dest='exec_method',
                                    default=[],
                                    help='Execution Methods')
        dynamic_parser.add_argument('--exec-param',
                                    '--ep',
                                    nargs='+',
                                    type=str,
                                    dest='exec_param',
                                    default=[],
                                    help='Execution Parameters')

        dynamic_args = dynamic_parser.parse_known_args(other_args)

        return dynamic_args[0]

    if 'gooey' in sys.modules:

        @Gooey(program_name='GeneFlow: {}'.format(workflow_dict['name']),
               program_description=workflow_dict['description'],
               target='gf --log-level={} run {}'.format(
                   args.log_level, args.workflow_path),
               monospace_display=True)
        def parse_dynamic_args_gui(workflow_dict):
            """
            Parse dynamic args based on workflow dictionary as well as
            some static args. Display a GUI interface.

            The remaining command-line args are taken from the enclosing
            function's other_args. Note that this parser defines no
            -j/--job option, so the result has no 'job_path' attribute.

            Args:
                workflow_dict: Workflow dictionary

            Returns:
                Namespace of parsed arguments.

            """
            # parse dynamic args. these are determined from workflow definition
            dynamic_parser = GooeyParser()
            input_group = dynamic_parser.add_argument_group(
                "Workflow Inputs",
                "Files or folders to be passed to the workflow")
            for input_key in workflow_dict['inputs']:
                widget = 'FileChooser'
                if workflow_dict['inputs'][input_key]['type'] == 'Directory':
                    widget = 'DirChooser'
                input_group.add_argument(
                    '--in.{}'.format(input_key),
                    dest='inputs.{}'.format(input_key),
                    required=False,
                    default=workflow_dict['inputs'][input_key]['default'],
                    help=workflow_dict['inputs'][input_key]['label'],
                    widget=widget)
            param_group = dynamic_parser.add_argument_group(
                "Workflow Parameters",
                "Number or string parameters to be passed to the workflow")
            for param_key in workflow_dict['parameters']:
                param_group.add_argument(
                    '--param.{}'.format(param_key),
                    dest='parameters.{}'.format(param_key),
                    required=False,
                    default=workflow_dict['parameters'][param_key]['default'],
                    help=workflow_dict['parameters'][param_key]['label'])
            job_group = dynamic_parser.add_argument_group(
                "Job Options", "Output/intermediate folders and job name")
            job_group.add_argument('-o',
                                   '--output',
                                   type=str,
                                   default='~/geneflow-output',
                                   help='Output Folder',
                                   widget='DirChooser')
            job_group.add_argument('-n',
                                   '--name',
                                   type=str,
                                   default='geneflow-job',
                                   help='Name of Job')
            job_group.add_argument('-w',
                                   '--work',
                                   nargs='+',
                                   type=str,
                                   default=[],
                                   help='Work Directory')
            exec_group = dynamic_parser.add_argument_group(
                "Execution Options", "Customize workflow execution")
            exec_group.add_argument('--exec-context',
                                    '--ec',
                                    nargs='+',
                                    type=str,
                                    dest='exec_context',
                                    default=[],
                                    help='Execution Contexts')
            exec_group.add_argument('--exec-method',
                                    '--em',
                                    nargs='+',
                                    type=str,
                                    dest='exec_method',
                                    default=[],
                                    help='Execution Methods')
            exec_group.add_argument('--exec-param',
                                    '--ep',
                                    nargs='+',
                                    type=str,
                                    dest='exec_param',
                                    default=[],
                                    help='Execution Parameters')

            dynamic_args = dynamic_parser.parse_args(other_args)

            return dynamic_args

    # get dynamic args
    if args.gui and 'gooey' in sys.modules:
        dynamic_args = parse_dynamic_args_gui(workflow_dict)
    else:
        dynamic_args = parse_dynamic_args(workflow_dict)

    # get absolute path to job file if provided. the GUI parser has no
    # --job option, so the attribute may be missing altogether
    job_path = None
    job_path_arg = getattr(dynamic_args, 'job_path', None)
    if job_path_arg:
        job_path = Path(job_path_arg).absolute()

    # load job definition if provided
    jobs_dict = {}
    gf_def = Definition()
    if job_path:
        if not gf_def.load(job_path):
            Log.an().error('Job definition load failed')
            return False
        jobs_dict = gf_def.jobs()
    else:
        # create default definition
        jobs_dict = {
            'job': {
                'name': 'GeneFlow job',
                'output_uri': 'geneflow_output',
                'work_uri': {
                    'local': '~/.geneflow/work'
                }
            }
        }

    # override with known cli parameters
    apply_job_modifiers(jobs_dict, [
        'name={}'.format(dynamic_args.name), 'output_uri={}'.format(
            dynamic_args.output)
    ])

    # insert workflow name into job, if not provided
    workflow_name = next(iter(defs['workflows']))
    for job in jobs_dict.values():
        if 'workflow_name' not in job:
            job['workflow_name'] = workflow_name

    # add inputs and parameters to job definition
    apply_job_modifiers(
        jobs_dict,
        [
            '{}={}'.format(dynamic_arg, getattr(dynamic_args, dynamic_arg))
            for dynamic_arg in vars(dynamic_args)
            if dynamic_arg.startswith(('inputs.', 'parameters.'))
        ]
    )

    # add work URIs to job definition
    work_uris = {}
    for work_arg in dynamic_args.work:
        parsed_work_uri = URIParser.parse(work_arg)
        if not parsed_work_uri:
            # skip if invalid URI
            Log.a().warning('invalid work uri: %s', work_arg)
        else:
            work_uris[
                parsed_work_uri['scheme']] = parsed_work_uri['chopped_uri']

    apply_job_modifiers(jobs_dict, [
        'work_uri.{}={}'.format(context, work_uri)
        for context, work_uri in work_uris.items()
    ])

    # add execution options to job definition. each option must have the
    # form "<key>:<value>"; reject anything else instead of crashing while
    # formatting the modifier string
    exec_modifiers = []
    for modifier_prefix, exec_args in (
            ('execution.context', dynamic_args.exec_context),
            ('execution.method', dynamic_args.exec_method),
            ('execution.parameters', dynamic_args.exec_param),
    ):
        for exec_arg in exec_args:
            exec_key, separator, exec_value = exec_arg.partition(':')
            if not separator:
                Log.an().error(
                    'invalid execution option, expected "key:value": %s',
                    exec_arg)
                return False
            exec_modifiers.append(
                '{}.{}={}'.format(modifier_prefix, exec_key, exec_value))

    apply_job_modifiers(jobs_dict, exec_modifiers)

    # get default values from workflow definition
    for job in jobs_dict.values():
        if 'inputs' not in job:
            job['inputs'] = {}
        if 'parameters' not in job:
            job['parameters'] = {}
        for input_key in workflow_dict['inputs']:
            if input_key not in job['inputs']:
                job['inputs'][input_key]\
                    = workflow_dict['inputs'][input_key]['default']
        for param_key in workflow_dict['parameters']:
            if param_key not in job['parameters']:
                job['parameters'][param_key]\
                    = workflow_dict['parameters'][param_key]['default']

    # expand URIs
    for job in jobs_dict.values():
        # output URI
        parsed_uri = URIParser.parse(job['output_uri'])
        if not parsed_uri:
            Log.an().error('invalid output uri: %s', job['output_uri'])
            return False
        # expand relative path if local
        if parsed_uri['scheme'] == 'local':
            job['output_uri'] = str(
                Path(parsed_uri['chopped_path']).expanduser().resolve())
        # work URIs
        for context in job['work_uri']:
            parsed_uri = URIParser.parse(job['work_uri'][context])
            if not parsed_uri:
                # report the offending URI, not the whole work_uri mapping
                Log.an().error('invalid work uri: %s',
                               job['work_uri'][context])
                return False
            # expand relative path if local
            if parsed_uri['scheme'] == 'local':
                job['work_uri'][context] = str(
                    Path(parsed_uri['chopped_path']).expanduser().resolve())
        # input URIs
        for input_key in job['inputs']:
            parsed_uri = URIParser.parse(job['inputs'][input_key])
            if not parsed_uri:
                Log.an().error('invalid input uri: %s',
                               job['inputs'][input_key])
                return False
            # expand relative path if local
            if parsed_uri['scheme'] == 'local':
                job['inputs'][input_key] = str(
                    Path(parsed_uri['chopped_path']).expanduser().resolve())

    # import jobs into database
    job_ids = data_source.import_jobs_from_dict(jobs_dict)
    if job_ids is False:
        Log.an().error('cannot import jobs')
        return False
    if not job_ids:
        # a pool cannot be created with zero worker processes
        Log.an().error('no jobs to run')
        return False
    data_source.commit()

    # create process pool to run workflows in parallel (at most 5 workers)
    pool = Pool(min(5, len(job_ids)))
    jobs = [{'name': job, 'id': job_ids[job], 'log': None} for job in job_ids]

    result = pool.map(
        partial(geneflow.cli.common.run_workflow,
                config=config_dict,
                log_level=args.log_level), jobs)

    pool.close()
    pool.join()

    if not all(result):
        Log.an().error('some jobs failed')

    return result
Beispiel #22
0
    def _init_job_uris(self):
        """
        Derive and validate the job-specific work and output URIs.

        A job directory name is built from the slugified job name and the
        first 8 characters of the job id. It is appended to the work URI
        of every context and to the output URI; the parsed results are
        stored in self._parsed_job_work_uri and
        self._parsed_job_output_uri.

        Args:
            self: class instance

        Returns:
            On success: True.
            On failure: the result of self._fatal() (a base URI or an
                extended URI does not parse).

        """
        # name of the job directory
        job_dir = '{}-{}'.format(slugify(self._job['name']),
                                 self._job['job_id'][:8])

        # work URIs: validate base, append job dir, validate again
        for context in self._job['work_uri']:

            base_work_uri = self._job['work_uri'][context]
            parsed_base = URIParser.parse(base_work_uri)
            if not parsed_base:
                msg = 'invalid base of job work uri for context: {}->{}'.format(
                    context, base_work_uri)
                Log.an().error(msg)
                return self._fatal(msg)

            # a root path already ends with '/', so no separator is added
            if parsed_base['chopped_path'] == '/':
                join_template = '{}{}'
            else:
                join_template = '{}/{}'
            full_job_work_uri = join_template.format(
                parsed_base['chopped_uri'], job_dir)

            parsed_job_work_uri = URIParser.parse(full_job_work_uri)
            if not parsed_job_work_uri:
                msg = 'invalid job work uri for context: {}->{}'.format(
                    context, full_job_work_uri)
                Log.an().error(msg)
                return self._fatal(msg)

            self._parsed_job_work_uri[context] = parsed_job_work_uri

        # output URI: validate base, append job dir, validate again
        base_output_uri = self._job['output_uri']
        parsed_base = URIParser.parse(base_output_uri)
        if not parsed_base:
            msg = 'invalid base of job output uri: {}'.format(base_output_uri)
            Log.an().error(msg)
            return self._fatal(msg)

        # a root path already ends with '/', so no separator is added
        if parsed_base['chopped_path'] == '/':
            join_template = '{}{}'
        else:
            join_template = '{}/{}'
        full_job_output_uri = join_template.format(
            parsed_base['chopped_uri'], job_dir)

        parsed_job_output_uri = URIParser.parse(full_job_output_uri)
        if not parsed_job_output_uri:
            msg = 'invalid job output uri: {}'.format(full_job_output_uri)
            Log.an().error(msg)
            return self._fatal(msg)

        self._parsed_job_output_uri = parsed_job_output_uri

        return True
Beispiel #23
0
def run(args):
    """
    Run GeneFlow workflow engine.

    Imports the workflow definition into the data source, builds the job
    definitions (from the job file if given, otherwise a built-in default),
    fills in workflow defaults for missing inputs and parameters, expands
    local URIs to absolute paths, and runs all jobs in a process pool of at
    most 5 workers.

    Args:
        args.workflow: workflow definition or package directory.
        args.job_yaml: job definition (optional).
        args.data: job modifiers applied on top of the job definition
            (optional).
        args.log_level: log level passed on to each workflow run.

    Returns:
        On success: list with one result per job, as returned by
            geneflow.cli.common.run_workflow. An error is logged if any
            entry is falsy, but the list is still returned.
        On failure: False.

    """
    # get absolute path to workflow
    workflow_yaml = resolve_workflow_path(args.workflow)
    if not workflow_yaml:
        Log.an().error('cannot find workflow definition: %s', args.workflow)
        return False
    Log.some().info('workflow definition found: %s', workflow_yaml)

    # get absolute path to job file if provided
    job_yaml = Path(args.job_yaml).absolute() if args.job_yaml else None

    # setup environment
    env = Environment(workflow_path=workflow_yaml)
    if not env.initialize():
        Log.an().error('cannot initialize geneflow environment')
        return False

    # create default config file and SQLite db
    cfg = Config()
    cfg.default(env.get_sqlite_db_path())
    cfg.write(env.get_config_path())
    config_dict = cfg.config('local')

    # load workflow into db
    try:
        data_source = DataSource(config_dict['database'])
    except DataSourceException as err:
        Log.an().error('data source initialization error [%s]', str(err))
        return False

    defs = data_source.import_definition(workflow_yaml)
    if not defs or not defs['workflows']:
        Log.an().error('workflow definition load failed: %s', workflow_yaml)
        return False

    data_source.commit()

    for workflow in defs['workflows']:
        Log.some().info(
            'workflow loaded: %s -> %s', workflow, defs['workflows'][workflow]
        )

    # load job definition if provided
    jobs_dict = {}
    gf_def = Definition()
    if job_yaml:
        if not gf_def.load(job_yaml):
            Log.an().error('Job definition load failed')
            return False
        jobs_dict = gf_def.jobs()
    else:
        # create default definition
        jobs_dict = {
            'job': {
                'name': 'GeneFlow job',
                'output_uri': 'geneflow_output',
                'work_uri': {
                    'local': '~/.geneflow/work'
                }
            }
        }

    # override with cli parameters
    if args.data:
        apply_job_modifiers(jobs_dict, args.data)

    # insert workflow name, if not provided; the first imported workflow
    # is used for every job that does not name one
    workflow_name = next(iter(defs['workflows']))
    for job in jobs_dict.values():
        if 'workflow_name' not in job:
            job['workflow_name'] = workflow_name

    # extract workflow defaults for inputs and parameters if not provided
    # in job definition
    workflow_id = next(iter(defs['workflows'].values()))
    workflow_dict = data_source.get_workflow_def_by_id(workflow_id)
    if not workflow_dict:
        Log.an().error(
            'cannot get workflow definition from data source: workflow_id=%s',
            workflow_id
        )
        return False

    for job in jobs_dict.values():
        if 'inputs' not in job:
            job['inputs'] = {}
        if 'parameters' not in job:
            job['parameters'] = {}
        for input_key in workflow_dict['inputs']:
            if input_key not in job['inputs']:
                job['inputs'][input_key]\
                    = workflow_dict['inputs'][input_key]['default']
        for param_key in workflow_dict['parameters']:
            if param_key not in job['parameters']:
                job['parameters'][param_key]\
                    = workflow_dict['parameters'][param_key]['default']

    # expand URIs
    for job in jobs_dict.values():
        # output URI
        parsed_uri = URIParser.parse(job['output_uri'])
        if not parsed_uri:
            Log.an().error('invalid output uri: %s', job['output_uri'])
            return False
        # expand relative path if local
        if parsed_uri['scheme'] == 'local':
            job['output_uri'] = str(
                Path(parsed_uri['chopped_path']).expanduser().resolve()
            )
        # work URIs
        for context in job['work_uri']:
            parsed_uri = URIParser.parse(job['work_uri'][context])
            if not parsed_uri:
                # report the offending URI, not the whole work_uri dict
                Log.an().error(
                    'invalid work uri: %s', job['work_uri'][context]
                )
                return False
            # expand relative path if local
            if parsed_uri['scheme'] == 'local':
                job['work_uri'][context] = str(
                    Path(parsed_uri['chopped_path']).expanduser().resolve()
                )
        # input URIs
        for input_key in job['inputs']:
            parsed_uri = URIParser.parse(job['inputs'][input_key])
            if not parsed_uri:
                Log.an().error(
                    'invalid input uri: %s', job['inputs'][input_key]
                )
                return False
            # expand relative path if local
            if parsed_uri['scheme'] == 'local':
                job['inputs'][input_key] = str(
                    Path(parsed_uri['chopped_path']).expanduser().resolve()
                )

    # import jobs into database
    job_ids = data_source.import_jobs_from_dict(jobs_dict)
    if job_ids is False:
        Log.an().error('cannot import jobs')
        return False
    data_source.commit()

    # a pool cannot be created with zero processes, so stop here if the
    # job definition did not yield any job
    if not job_ids:
        Log.an().error('no jobs to run')
        return False

    # create process pool to run workflows in parallel
    pool = Pool(min(5, len(job_ids)))
    jobs = [
        {
            'name': job,
            'id': job_ids[job],
            'log': None
        } for job in job_ids
    ]

    try:
        result = pool.map(
            partial(
                geneflow.cli.common.run_workflow,
                config=config_dict,
                log_level=args.log_level
            ),
            jobs
        )
    except Exception:
        # do not leave worker processes behind if a job raised
        pool.terminate()
        pool.join()
        raise

    pool.close()
    pool.join()

    if not all(result):
        Log.an().error('some jobs failed')

    return result
Beispiel #24
0
    def _init_graph_structure(self):
        """
        Create empty nodes for each workflow input and step, and link them.

        Nodes contain attributes for type (e.g., input or step), contexts for
        data staging (e.g., local or agave), source context, and node.
        The node attribute is initialized as None, but will later be a
        reference to a WorkflowInput or WorkflowStep object.

        Once all nodes exist, an edge is added from every input or step that
        a step depends on to that step, and the data context of the
        depending step is registered in the 'contexts' dict of the
        dependency node.

        Args:
            None.

        Returns:
            None.

        Raises:
            WorkflowDAGException: if an input URI or a step execution
                context is invalid, if a dependency refers to an unknown
                step, input, or parameter, or if the graph library rejects
                a node or edge.

        """
        # add empty input nodes to graph
        for input_name, workflow_input in self._workflow['inputs'].items():

            # extract the input source context
            parsed_input_uri = URIParser.parse(workflow_input['value'])
            if not parsed_input_uri:
                msg = 'invalid input uri: {}'.format(workflow_input['value'])
                Log.an().error(msg)
                raise WorkflowDAGException(msg)

            source_context = parsed_input_uri['scheme']

            try:
                self._graph.add_node('input.{}'.format(input_name),
                                     name='{}'.format(input_name),
                                     type='input',
                                     contexts={source_context: ''},
                                     source_context=source_context,
                                     exec_context=None,
                                     node=None)
            except nx.NetworkXException as err:
                msg = 'cannot add input node "{}" to graph [{}]'.format(
                    input_name, str(err))
                Log.an().error(msg)
                raise WorkflowDAGException(msg) from err

        # add empty step nodes to graph
        for step_name, step in self._workflow['steps'].items():

            # extract the step source context
            source_data_context = Contexts.get_data_scheme_of_exec_context(
                step['execution']['context'])
            if not source_data_context:
                msg = 'invalid execution context ({}) for step {}'.format(
                    step['execution']['context'], step_name)
                Log.an().error(msg)
                raise WorkflowDAGException(msg)

            # steps listed as final output get an additional 'final' context
            contexts = {source_data_context: ''}
            if step_name in self._workflow['final_output']:
                contexts['final'] = ''

            try:
                self._graph.add_node('step.{}'.format(step_name),
                                     name='{}'.format(step_name),
                                     type='step',
                                     step=step,
                                     contexts=contexts,
                                     source_context=source_data_context,
                                     exec_context=step['execution']['context'],
                                     node=None)
            except nx.NetworkXException as err:
                msg = 'cannot add step node "{}" to graph [{}]'.format(
                    step_name, str(err))
                Log.an().error(msg)
                raise WorkflowDAGException(msg) from err

        # create graph edges and determine contexts for each node based on
        #   dependencies
        for step_name, step in self._workflow['steps'].items():

            # name of this step node
            step_node = 'step.{}'.format(step_name)

            # get all input or step dependencies for this step
            deps = self._get_step_dependencies(step)

            # data context of this step; it is the same for every dependency,
            # so look it up once (already validated when the node was added)
            step_data_context = Contexts.get_data_scheme_of_exec_context(
                step['execution']['context'])

            for dep in deps:
                dep_info = deps[dep]

                if dep_info['name'] == 'workflow':
                    # input or parameter dependency
                    input_node = 'input.{}'.format(dep_info['var'])

                    # only add edge if it's an input (not a parameter)
                    if input_node in self._graph.nodes:
                        # add graph edge from input to step
                        try:
                            self._graph.add_edge(input_node, step_node)
                        except nx.NetworkXException as err:
                            msg = ('cannot add edge from node "{}" to '
                                   'node "{}" [{}]').format(
                                       input_node, step_node, str(err))
                            Log.an().error(msg)
                            raise WorkflowDAGException(msg) from err

                        # add context key to dict for input node
                        self._graph.nodes[input_node]['contexts'][
                            step_data_context] = ''

                    elif dep_info['var'] not in self._parameters:
                        # if input not found, var must refer to a parameter
                        msg = ('invalid dependency for step "{}", '
                               'parameter or input "{}" does not exist'
                               ).format(step_name, dep_info['var'])
                        Log.an().error(msg)
                        raise WorkflowDAGException(msg)

                else:
                    # step dependency
                    depend_node = 'step.{}'.format(dep_info['name'])

                    if not self._graph.has_node(depend_node):
                        msg = ('invalid dependency for step "{}", '
                               'step "{}" does not exist').format(
                                   step_name, depend_node)
                        Log.an().error(msg)
                        raise WorkflowDAGException(msg)

                    # add graph edge from step to step
                    try:
                        self._graph.add_edge(depend_node, step_node)
                    except nx.NetworkXException as err:
                        msg = ('cannot add edge from node "{}" to '
                               'node "{}" [{}]').format(
                                   depend_node, step_node, str(err))
                        Log.an().error(msg)
                        raise WorkflowDAGException(msg) from err

                    # add context key to dict for depend node
                    self._graph.nodes[depend_node]['contexts'][
                        step_data_context] = ''
Beispiel #25
0
    def _run_map(self, map_item):
        """
        Run a job for each map item and store the job ID.

        Builds the wrapper script command line from the app's default
        inputs and parameters (overridden by the map item's template),
        submits it through the DRMAA session, and records the resulting job
        ID and a QUEUED status in the map item.

        Args:
            self: class instance.
            map_item: map item object (item of self._map).

        Returns:
            On success: True. True is also returned when submission fails
                with a DRMAA communication error; the map item is then
                marked FAILED so that it can be retried.
            On failure: result of self._fatal() (expected to be False).

        """
        # load default app inputs, overwrite with template inputs
        inputs = {}
        for input_key in self._app['inputs']:
            if input_key in map_item['template']:
                inputs[input_key] = map_item['template'][input_key]
            elif self._app['inputs'][input_key]['default']:
                inputs[input_key] = self._app['inputs'][input_key]['default']

        # load default app parameters, overwrite with template parameters
        parameters = {}
        for param_key in self._app['parameters']:
            if param_key in map_item['template']:
                parameters[param_key] = map_item['template'][param_key]
            elif self._app['parameters'][param_key]['default'] not in [
                    None, ''
            ]:
                parameters[param_key] \
                    = self._app['parameters'][param_key]['default']

        # get full path of wrapper script
        path = shutil.which(self._app['implementation']['local']['script'])
        if not path:
            msg = 'wrapper script not found in path: {}'.format(
                self._app['implementation']['local']['script'])
            Log.an().error(msg)
            return self._fatal(msg)

        # construct argument list for wrapper script
        args = [path]
        for input_key, input_uri in inputs.items():
            if not input_uri:
                continue
            parsed_input_uri = URIParser.parse(input_uri)
            if not parsed_input_uri:
                # fail cleanly instead of subscripting a failed parse result
                msg = 'invalid input uri for "{}": {}'.format(
                    input_key, input_uri)
                Log.an().error(msg)
                return self._fatal(msg)
            args.append('--{}={}'.format(
                input_key, parsed_input_uri['chopped_path']))

        # path of the step's data directory in the source context; used for
        # the output location and the log files
        data_path = self._parsed_data_uris[self._source_context][0][
            'chopped_path']

        for param_key, param_value in parameters.items():
            if param_key == 'output':
                args.append('--output={}/{}'.format(data_path, param_value))
            else:
                args.append('--{}={}'.format(param_key, param_value))

        # add execution method
        args.append('--exec_method={}'.format(
            self._step['execution']['method']))

        # specify execution init commands if 'init' param given
        if 'init' in self._step['execution']['parameters']:
            args.append('--exec_init={}'.format(
                self._step['execution']['parameters']['init']))

        Log.a().debug('[step.%s]: command: %s -> %s', self._step['name'],
                      map_item['template']['output'], ' '.join(args))

        # construct job name
        name = 'gf-{}-{}-{}'.format(
            map_item['attempt'],
            slugify(self._step['name'], regex_pattern=r'[^-a-z0-9_]+'),
            slugify(map_item['template']['output'],
                    regex_pattern=r'[^-a-z0-9_]+'))

        # construct paths for logging stdout and stderr
        log_path = '{}/_log/{}'.format(data_path, name)

        session = self._slurm['drmaa_session']

        # create and populate job template; the template is always released
        # again, even if submission fails
        jt = session.createJobTemplate()
        try:
            jt.remoteCommand = '/bin/bash'
            jt.args = args
            jt.jobName = name
            jt.errorPath = ':{}.err'.format(log_path)
            jt.outputPath = ':{}.out'.format(log_path)

            # pass execution parameters to job template
            native_spec = ' --nodes=1 --ntasks=1'
            if 'queue' in self._step['execution']['parameters']:
                native_spec += ' -p {}'.format(
                    self._step['execution']['parameters']['queue'])
            if 'slots' in self._step['execution']['parameters']:
                native_spec += ' --cpus-per-task={}'.format(
                    self._step['execution']['parameters']['slots'])
            if 'other' in self._step['execution']['parameters']:
                native_spec += ' {}'.format(
                    self._step['execution']['parameters']['other'])
            jt.nativeSpecification = native_spec

            # submit hpc job using drmaa library
            try:
                job_id = session.runJob(jt)

            except drmaa.DrmCommunicationException as err:
                msg = 'cannot submit slurm job for step "{}" [{}]'\
                        .format(self._step['name'], str(err))
                Log.a().warning(msg)

                # set to failed, but return True so that it's retried
                map_item['status'] = 'FAILED'
                map_item['run'][map_item['attempt']]['status'] = 'FAILED'

                return True

        finally:
            session.deleteJobTemplate(jt)

        Log.a().debug('[step.%s]: hpc job id: %s -> %s', self._step['name'],
                      map_item['template']['output'], job_id)

        # record job info
        map_item['run'][map_item['attempt']]['hpc_job_id'] = job_id

        # set status of process
        map_item['status'] = 'QUEUED'
        map_item['run'][map_item['attempt']]['status'] = 'QUEUED'

        return True