Example #1
0
def get_run_files(run: RunObject, state: WorkflowState, rundir: str) -> Tuple[List[RunFile], List[str]]:
    """Collect the result files of a successful run that exist on disk.

    The candidate files depend on the workflow specification. If the run
    declares output files, only the sources of those declarations are
    considered. Otherwise, all files that are listed in the run state are
    considered. Candidates that do not exist in the run directory are
    skipped without raising an error.

    Parameters
    ----------
    run: flowserv.model.base.RunObject
        Handle for a workflow run.
    state: flowserv.model.workflow.state.WorkflowState
        SUCCESS state for the workflow run.
    rundir: string
        Directory containing run result files.

    Returns
    -------
    list of RunFile, list of (file, key)-pairs
        Run file handles (key, name, guessed MIME type, and size) together
        with the value returned by ``walk`` for the existing files.
    """
    # The keys of the run.outputs() dictionary are not necessarily equal to
    # the file sources. For that reason the sources are taken from the
    # dictionary values.
    if run.outputs():
        candidates = [spec.source for spec in run.outputs().values()]
    else:
        # No output specification. Fall back to all files that the
        # workflow run reported in its state.
        candidates = state.files
    # Pair each candidate key with its path in the run directory and keep
    # only those pairs for which the file exists.
    located = ((os.path.join(rundir, key), key) for key in candidates)
    walklist = [(path, key) for path, key in located if os.path.exists(path)]
    # Use the walk function to get the files that need to be retained for
    # the run (i.e., that will be copied to the file store).
    storefiles = walk(files=walklist)
    runfiles = [
        RunFile(
            key=key,
            name=key,
            mime_type=mimetypes.guess_type(url=handle.filename)[0],
            size=handle.size()
        )
        for handle, key in storefiles
    ]
    return runfiles, storefiles
Example #2
0
def read_run_results(run: RunObject, schema: ResultSchema, rundir: str):
    """Load the run result values from the result file in the run directory.

    The result file is identified by the workflow result schema. For each
    column in the schema the value is extracted from the result document
    and cast using the column. The resulting dictionary is assigned to the
    ``result`` property of the run. If the result file does not exist the
    run is left unchanged and no error is raised.

    Parameters
    ----------
    run: flowserv.model.base.RunObject
        Handle for a workflow run.
    schema: flowserv.model.template.schema.ResultSchema
        Workflow result schema specification that contains the reference to the
        result file key.
    rundir: string
        Directory containing run result files.

    Raises
    ------
    flowserv.error.ConstraintViolationError
        If the result document has no value for a required column.
    """
    filename = os.path.join(rundir, schema.result_file)
    # A missing result file is (currently) not considered an error.
    if not os.path.exists(filename):
        return
    document = util.read_object(filename)
    # Collect the cast values for all columns that are present in the
    # result document.
    values = dict()
    for column in schema.columns:
        value = util.jquery(doc=document, path=column.jpath())
        column_id = column.column_id
        if value is not None:
            values[column_id] = column.cast(value)
        elif column.required:
            # Only required columns must have a value.
            msg = "missing value for '{}'".format(column_id)
            raise err.ConstraintViolationError(msg)
    run.result = values
Example #3
0
def read_run_results(run: RunObject, schema: ResultSchema,
                     runstore: StorageVolume):
    """Load the run result values from the result file in a storage volume.

    The result file is identified by the workflow result schema and is
    loaded from the given storage volume. For each column in the schema the
    value is extracted from the result document and cast using the column.
    The resulting dictionary is assigned to the ``result`` property of the
    run.

    Parameters
    ----------
    run: flowserv.model.base.RunObject
        Handle for a workflow run.
    schema: flowserv.model.template.schema.ResultSchema
        Workflow result schema specification that contains the reference to the
        result file key.
    runstore: flowserv.volume.base.StorageVolume
        Storage volume containing the run (result) files for a successful
        workflow run.

    Raises
    ------
    flowserv.error.ConstraintViolationError
        If the result document has no value for a required column.
    """
    result_file = runstore.load(schema.result_file)
    with result_file.open() as f:
        document = util.read_object(f)
    # Collect the cast values for all columns that are present in the
    # result document.
    values = dict()
    for column in schema.columns:
        value = util.jquery(doc=document, path=column.jpath())
        column_id = column.column_id
        if value is not None:
            values[column_id] = column.cast(value)
        elif column.required:
            # Only required columns must have a value.
            msg = "missing value for '{}'".format(column_id)
            raise err.ConstraintViolationError(msg)
    run.result = values
Example #4
0
def store_run_files(run: RunObject, files: List[str], source: StorageVolume,
                    target: StorageVolume) -> List[RunFile]:
    """Copy the result files of a successful run to the target volume and
    return a handle for each copied file.

    The list of copied files depends on the workflow specification. If the
    run declares output files, the sources of those declarations replace
    the given list of files. Otherwise, all files in the given list are
    copied.

    Parameters
    ----------
    run: flowserv.model.base.RunObject
        Handle for a workflow run.
    files: list of string
        List of result files for a successful workflow run.
    source: flowserv.volume.base.StorageVolume
        Storage volume containing the run (result) files for a successful
        workflow run.
    target: flowserv.volume.base.StorageVolume
        Storage volume for persisting run result files.

    Returns
    -------
    list of RunFile
        One handle (key, name, guessed MIME type, and size) per copied file.
    """
    # The keys of the run.outputs() dictionary are not necessarily equal to
    # the file sources. For that reason the sources are taken from the
    # dictionary values.
    if run.outputs():
        files = [spec.source for spec in run.outputs().values()]
    runfiles = list()
    for key in files:
        # Load the file from the source volume and persist it under the
        # same key on the target volume.
        handle = source.load(key)
        target.store(file=handle, dst=key)
        runfiles.append(
            RunFile(
                key=key,
                name=key,
                mime_type=mimetypes.guess_type(url=key)[0],
                size=handle.size()
            )
        )
    return runfiles
Example #5
0
def create_run(session, workflow_id, group_id):
    """Add a new pending run for a workflow group to the database session.

    A new unique identifier is generated for the run. The run object is
    added to the session but the session is not committed by this function.

    Parameters
    ----------
    session: sqlalchemy.orm.session.Session
        Database session.
    workflow_id: string
        Unique workflow identifier.
    group_id: string
        Unique group identifier.

    Returns
    -------
    string
        Unique identifier of the created run.
    """
    identifier = util.get_unique_identifier()
    session.add(
        RunObject(
            run_id=identifier,
            workflow_id=workflow_id,
            group_id=group_id,
            state_type=st.STATE_PENDING
        )
    )
    return identifier
Example #6
0
    def exec_workflow(
        self, run: RunObject, template: WorkflowTemplate, arguments: Dict,
        staticfs: StorageVolume, config: Optional[Dict] = None
    ) -> Tuple[WorkflowState, StorageVolume]:
        """Create a workflow for the given run on the remote engine and
        monitor its execution.

        In asynchronous mode a workflow monitor is started and the state of
        the created remote workflow is returned immediately. In synchronous
        mode the calling thread is blocked while the remote workflow is
        being monitored and the resulting state is returned.

        Any exception that occurs while creating or monitoring the workflow
        is logged and converted into an error state. In that case the second
        element of the returned tuple is None.

        The set of arguments is not further validated. It is assumed that the
        validation has been performed by the calling code (e.g., the run
        service manager).

        Parameters
        ----------
        run: flowserv.model.base.RunObject
            Handle for the run that is being executed.
        template: flowserv.model.template.base.WorkflowTemplate
            Workflow template containing the parameterized specification and
            the parameter declarations.
        arguments: dict
            Dictionary of argument values for parameters in the template.
        staticfs: flowserv.volume.base.StorageVolume
            Storage volume that contains the static files from the workflow
            template.
        config: dict, default=None
            Optional configuration settings are currently ignored. Included for
            API completeness.

        Returns
        -------
        flowserv.model.workflow.state.WorkflowState, flowserv.volume.base.StorageVolume

        Raises
        ------
        RuntimeError
            If the run is not in pending state.
        """
        # Only runs that are in pending state can be executed.
        if not run.is_pending():
            raise RuntimeError("invalid run state '{}'".format(run.state()))
        try:
            # Create the workflow on the remote engine. This will also
            # upload all necessary files. The workflow execution may not
            # have started yet (indicated by the state property of the
            # returned handle for the remote workflow).
            remote = self.client.create_workflow(
                run=run,
                template=template,
                arguments=arguments,
                staticfs=staticfs
            )
            remote_id = remote.workflow_id
            if not self.is_async:
                # Synchronous execution. Block the calling thread while
                # polling the remote engine until the workflow is done.
                final_state = monitor.monitor_workflow(
                    workflow=remote,
                    poll_interval=self.poll_interval
                )
                return final_state, remote.runstore
            # Asynchronous execution. Register the task and start a
            # monitor for the remote workflow.
            self.tasks[run.run_id] = remote_id
            watcher = monitor.WorkflowMonitor(
                workflow=remote,
                poll_interval=self.poll_interval,
                service=self.service,
                tasks=self.tasks
            )
            watcher.start()
            return remote.state, remote.runstore
        except Exception as ex:
            # Set the workflow run into an ERROR state.
            logging.error(ex, exc_info=True)
            strace = util.stacktrace(ex)
            logging.debug('\n'.join(strace))
            return run.state().error(messages=strace), None
Example #7
0
    def exec_workflow(
            self,
            run: RunObject,
            template: WorkflowTemplate,
            arguments: Dict,
            config: Optional[Dict] = None) -> Tuple[WorkflowState, str]:
        """Initiate the execution of a given workflow template for a set of
        argument values.

        The serial workflow engine executes workflows on the local machine and
        therefore uses the file system to store temporary run files. The path
        to the run folder is returned as the second value in the result tuple.
        The first value in the result tuple is the state of the workflow after
        the process is started. If the workflow is executed asynchronously
        the workflow steps are run in a separate process and the returned
        state is the started state. Otherwise, the calling thread is blocked
        until the workflow is done and the resulting state is returned.

        Exceptions that occur while preparing the run folder or while running
        the workflow are logged and converted into an error state.

        The set of arguments is not further validated. It is assumed that the
        validation has been performed by the calling code (e.g., the run
        service manager).

        The optional configuration object can be used to override the worker
        configuration that was provided at object instantiation. Expects a
        dictionary with an element `workers` that contains a mapping of container
        identifier to a container worker configuration object.

        Parameters
        ----------
        run: flowserv.model.base.RunObject
            Handle for the run that is being executed.
        template: flowserv.model.template.base.WorkflowTemplate
            Workflow template containing the parameterized specification and
            the parameter declarations.
        arguments: dict
            Dictionary of argument values for parameters in the template.
        config: dict, default=None
            Optional object to overwrite the worker configuration settings.

        Returns
        -------
        flowserv.model.workflow.state.WorkflowState, string

        Raises
        ------
        RuntimeError
            If the run is not in pending state.
        """
        # Ensure that the run is in pending state. Note that run.state is a
        # method that has to be called to include the actual state (and not
        # the bound method) in the error message.
        if not run.is_pending():
            raise RuntimeError("invalid run state '{}'".format(run.state()))
        state = run.state()
        rundir = os.path.join(self.runsdir, run.run_id)
        # Get the worker configuration. A given (non-empty) configuration
        # object overrides the configuration of the engine.
        worker_config = self.worker_config if not config else config.get(
            'workers')
        # Get the source directory for static workflow files.
        sourcedir = self.fs.workflow_staticdir(run.workflow.workflow_id)
        # Get the list of workflow steps and the generated output files.
        steps, run_args, outputs = parser.parse_template(template=template,
                                                         arguments=arguments)
        try:
            # Copy template files to the run folder.
            self.fs.copy_folder(key=sourcedir, dst=rundir)
            # Store any given file arguments in the run folder.
            for key, para in template.parameters.items():
                if para.is_file() and key in arguments:
                    filearg = arguments[key]
                    filearg.source().store(
                        os.path.join(rundir, filearg.target())
                    )
            # Create top-level folder for all expected result files.
            util.create_directories(basedir=rundir, files=outputs)
            # Start a new process to run the workflow. Make sure to catch all
            # exceptions to set the run state properly.
            state = state.start()
            if self.is_async:
                # Raise an error if the service manager is not given. The
                # error is handled by the except-clause below, i.e., the
                # run is set into an error state.
                if self.service is None:
                    raise ValueError('service manager not given')
                # Run steps asynchronously in a separate process.
                pool = Pool(processes=1)
                task_callback_function = partial(callback_function,
                                                 lock=self.lock,
                                                 tasks=self.tasks,
                                                 service=self.service)
                # Register the task before the process is started.
                with self.lock:
                    self.tasks[run.run_id] = (pool, state)
                pool.apply_async(run_workflow,
                                 args=(run.run_id, rundir, state, outputs,
                                       steps, run_args,
                                       WorkerFactory(config=worker_config)),
                                 callback=task_callback_function)
                return state, rundir
            else:
                # Run steps synchronously and block the controller until done.
                _, _, state_dict = run_workflow(
                    run_id=run.run_id,
                    rundir=rundir,
                    state=state,
                    output_files=outputs,
                    steps=steps,
                    arguments=run_args,
                    workers=WorkerFactory(config=worker_config))
                return serialize.deserialize_state(state_dict), rundir
        except Exception as ex:
            # Set the workflow run into an ERROR state. Include the stack
            # trace in the log to simplify debugging.
            logging.error(ex, exc_info=True)
            return state.error(messages=util.stacktrace(ex)), rundir
Example #8
0
    def create_run(self, workflow=None, group=None, arguments=None, runs=None):
        """Create a new entry for a run that is in pending state. Returns a
        handle for the created run.

        A run is either created for a group (i.e., a group submission run) or
        for a workflow (i.e., a post-processing run). Exactly one of the two
        parameters is expected to be given.

        For post-processing runs the workflow handle is updated with the
        ranking of input runs and the identifier of the created run. The
        changes are committed to the database session before the run handle
        is returned.

        Parameters
        ----------
        workflow: flowserv.model.base.WorkflowObject, default=None
            Workflow handle if this is a post-processing run.
        group: flowserv.model.base.GroupObject
            Group handle if this is a group submission run.
        arguments: list
            List of argument values for parameters in the template.
        runs: list(string), default=None
            List of run identifier that define the input for a post-processing
            run. The position of an identifier in the list defines its rank.
            A value of None is treated like an empty list.

        Returns
        -------
        flowserv.model.base.RunObject

        Raises
        ------
        ValueError
            If neither or both of workflow and group are given, or if input
            runs are given for a group run.
        """
        # Ensure that only group or workflow is given.
        if workflow is None and group is None:
            raise ValueError('missing arguments for workflow or group')
        elif workflow is not None and group is not None:
            raise ValueError('arguments for workflow or group')
        elif group is not None and runs is not None:
            raise ValueError('unexpected argument runs')
        # Create a unique run identifier.
        run_id = util.get_unique_identifier()
        # Get workflow and group identifier.
        if workflow is None:
            workflow_id = group.workflow_id
            group_id = group.group_id
        else:
            workflow_id = workflow.workflow_id
            group_id = None
        # Create the handle for the new run.
        run = RunObject(
            run_id=run_id,
            workflow_id=workflow_id,
            group_id=group_id,
            arguments=arguments if arguments is not None else list(),
            state_type=st.STATE_PENDING
        )
        self.session.add(run)
        # Update the workflow handle if this is a post-processing run.
        if workflow is not None:
            # The list of input runs is optional. Treat a missing list as
            # empty instead of failing after the run was added to the
            # session.
            input_runs = runs if runs is not None else list()
            workflow.postproc_ranking = [
                WorkflowRankingRun(run_id=input_run_id, rank=rank)
                for rank, input_run_id in enumerate(input_runs)
            ]
            workflow.postproc_run_id = run_id
        # Commit changes in case run monitors need to access the run state.
        self.session.commit()
        return run
Example #9
0
    def exec_workflow(
            self,
            run: RunObject,
            template: WorkflowTemplate,
            arguments: Dict,
            staticfs: StorageVolume,
            config: Optional[Dict] = None
    ) -> Tuple[WorkflowState, StorageVolume]:
        """Initiate the execution of a given workflow template for a set of
        argument values.

        The run files are maintained in a storage volume that is created for
        the run. This storage volume is returned as the second value in the
        result tuple. The first value in the result tuple is the state of the
        workflow after the process is started. If the workflow is executed
        asynchronously the workflow steps are run in a separate process and
        the returned state is the started state. Otherwise, the calling
        thread is blocked until the workflow is done and the resulting state
        is returned.

        Exceptions that occur while preparing the run files or while running
        the workflow are logged and converted into an error state.

        The set of arguments is not further validated. It is assumed that the
        validation has been performed by the calling code (e.g., the run
        service manager).

        The optional configuration object is merged with the configuration
        that was provided at object instantiation. Settings in the given
        object override the engine settings for this run only; the engine
        configuration itself is not modified. The elements `volumes`,
        `workers`, and `workflow` are read from the merged configuration.

        Parameters
        ----------
        run: flowserv.model.base.RunObject
            Handle for the run that is being executed.
        template: flowserv.model.template.base.WorkflowTemplate
            Workflow template containing the parameterized specification and
            the parameter declarations.
        arguments: dict
            Dictionary of argument values for parameters in the template.
        staticfs: flowserv.volume.base.StorageVolume
            Storage volume that contains the static files from the workflow
            template.
        config: dict, default=None
            Optional object to overwrite the worker configuration settings.

        Returns
        -------
        flowserv.model.workflow.state.WorkflowState, flowserv.volume.base.StorageVolume

        Raises
        ------
        RuntimeError
            If the run is not in pending state.
        """
        # Raise an error if the run is not in pending state. Note that
        # run.state is a method that has to be called to include the actual
        # state (and not the bound method) in the error message.
        if not run.is_pending():
            raise RuntimeError("invalid run state '{}'".format(run.state()))
        state = run.state()
        # Create configuration dictionary that merges the engine global
        # configuration with the workflow-specific one. Work on a copy to
        # avoid that run-specific settings leak into the engine
        # configuration (and thereby into subsequent runs).
        run_config = dict(self.config) if self.config is not None else dict()
        if config:
            run_config.update(config)
        # Get the list of workflow steps, run arguments, and the list of output
        # files that the workflow is expected to generate.
        steps, run_args, outputs = parser.parse_template(template=template,
                                                         arguments=arguments)
        # Create and prepare storage volume for run files.
        runstore = self.fs.get_store_for_folder(
            key=util.join(self.runsdir, run.run_id),
            identifier=DEFAULT_STORE
        )
        try:
            # Copy template files to the run folder.
            files = staticfs.copy(src=None, store=runstore)
            # Store any given file arguments and additional input files
            # that are required by actor parameters into the run folder.
            for key, para in template.parameters.items():
                if key not in arguments:
                    continue
                if para.is_file():
                    files.extend(arguments[key].copy(target=runstore))
                elif para.is_actor():
                    for input_file in arguments[key].files or []:
                        files.extend(input_file.copy(target=runstore))
            # Create factory objects for storage volumes.
            volumes = volume_manager(specs=run_config.get('volumes', []),
                                     runstore=runstore,
                                     runfiles=files)
            # Create factory for workers. Include mapping of workflow steps to
            # the worker that are responsible for their execution.
            workers = WorkerPool(workers=run_config.get('workers', []),
                                 managers={
                                     doc['step']: doc['worker']
                                     for doc in run_config.get('workflow', [])
                                 })
            # Start a new process to run the workflow. Make sure to catch all
            # exceptions to set the run state properly.
            state = state.start()
            if self.is_async:
                # Run steps asynchronously in a separate process.
                pool = Pool(processes=1)
                task_callback_function = partial(callback_function,
                                                 lock=self.lock,
                                                 tasks=self.tasks,
                                                 service=self.service)
                # Register the task before the process is started.
                with self.lock:
                    self.tasks[run.run_id] = (pool, state)
                pool.apply_async(run_workflow,
                                 args=(run.run_id, state, outputs, steps,
                                       run_args, volumes, workers),
                                 callback=task_callback_function)
                return state, runstore
            else:
                # Run steps synchronously and block the controller until done.
                _, _, state_dict = run_workflow(run_id=run.run_id,
                                                state=state,
                                                output_files=outputs,
                                                steps=steps,
                                                arguments=run_args,
                                                volumes=volumes,
                                                workers=workers)
                return serialize.deserialize_state(state_dict), runstore
        except Exception as ex:
            # Set the workflow run into an ERROR state.
            logging.error(ex, exc_info=True)
            return state.error(messages=util.stacktrace(ex)), runstore
Example #10
0
    def run_handle(self,
                   run: RunObject,
                   group: Optional[GroupObject] = None) -> Dict:
        """Create the serialization for a run handle.

        The serialization extends the run descriptor with the workflow and
        group identifier, the run arguments, the parameter declarations of
        the workflow group (if given), and information that depends on the
        run state: the start time for runs that are not pending, the stop
        time and messages for canceled or erroneous runs, and the finish
        time and result files for successful runs.

        Parameters
        ----------
        run: flowserv.model.base.RunObject
            Workflow run handle
        group: flowserv.model.base.GroupObject, default=None
            Workflow group handle. Missing for post-processing workflows

        Returns
        -------
        dict
        """
        doc = self.run_descriptor(run)
        # Identifier of the run workflow and (optionally) the run group.
        doc[RUN_WORKFLOW] = run.workflow_id
        if run.group_id is not None:
            doc[RUN_GROUP] = run.group_id
        doc[RUN_ARGUMENTS] = run.arguments
        # The parameter declarations are taken from the group since they may
        # differ from the parameter list of the workflow.
        if group is not None:
            doc[RUN_PARAMETERS] = [
                para.to_dict() for para in group.parameters.values()
            ]
        # State-dependent information.
        if not run.is_pending():
            doc[RUN_STARTED] = run.state().started_at
        if run.is_canceled() or run.is_error():
            doc[RUN_FINISHED] = run.state().stopped_at
            doc[RUN_ERRORS] = run.state().messages
        elif run.is_success():
            doc[RUN_FINISHED] = run.state().finished_at
            output_spec = run.outputs()
            # By default a file is serialized with its identifier and name.
            # If an output specification exists for the file name, the name
            # is replaced by the specification key and all specification
            # properties that are set are added to the serialization.
            serialized = list()
            for runfile in run.files:
                entry = {FILE_ID: runfile.file_id, FILE_NAME: runfile.name}
                if runfile.name in output_spec:
                    fspec = output_spec[runfile.name]
                    entry[FILE_NAME] = fspec.key
                    optional_properties = (
                        (FILE_TITLE, fspec.title),
                        (FILE_CAPTION, fspec.caption),
                        (FILE_WIDGET, fspec.widget),
                        (FILE_FORMAT, fspec.format),
                    )
                    for label, value in optional_properties:
                        if value is not None:
                            entry[label] = value
                serialized.append(entry)
            doc[RUN_FILES] = serialized
        return doc