Beispiel #1
0
    def with_workflow_storage(self):
        """Yield a workflow and store its steps once the caller is done.

        A new ``Workflow`` is yielded while ``self.lock`` is held. When the
        generator is resumed, every step found on the workflow is dumped as
        YAML into its own ``.cwl`` file below ``self.workflow_path``. The
        file name is built from a random UUID hex string and the base
        command of the step passed through ``secure_filename``.

        :yields: An empty ``Workflow`` for the caller to populate.
        """
        with self.lock:
            from renku.models.cwl._ascwl import ascwl
            from renku.models.cwl.workflow import Workflow

            workflow = Workflow()
            yield workflow

            # The target directory does not change between steps.
            workflow_path = self.workflow_path

            for step in workflow.steps:
                step_name = '{0}_{1}.cwl'.format(
                    uuid.uuid4().hex,
                    secure_filename('_'.join(step.run.baseCommand)),
                )

                # Still created only when there is a step to write, but
                # without the racy exists()/mkdir() pair and without failing
                # when a parent directory is missing.
                workflow_path.mkdir(parents=True, exist_ok=True)

                step_path = workflow_path / step_name
                with step_path.open('w') as step_file:
                    yaml.dump(
                        ascwl(
                            step.run,
                            # Predicate is true only for values that are
                            # not ``None``.
                            filter=lambda _, x: x is not None,
                            basedir=workflow_path,
                        ),
                        stream=step_file,
                        default_flow_style=False)
Beispiel #2
0
def test_workflow(runner, project):
    """Exercise ``workflow create`` and compare ``log`` outputs."""
    touch_result = runner.invoke(cli.cli, ['run', 'touch', 'data.csv'])
    assert touch_result.exit_code == 0

    # Redirect the standard output of the ``wc`` run into ``counted.txt``.
    with open('counted.txt', 'w') as stdout, \
            contextlib.redirect_stdout(stdout):
        try:
            cli.cli.main(
                args=('run', 'wc', 'data.csv'),
                prog_name=runner.get_default_prog_name(cli.cli),
            )
        except SystemExit as exit_:
            assert exit_.code in {None, 0}

    create_args = ['workflow', 'create', 'counted.txt', '-o', 'workflow.cwl']
    create_result = runner.invoke(
        cli.cli,
        create_args,
        catch_exceptions=False,
    )
    assert create_result.exit_code == 0

    with open('workflow.cwl', 'r') as cwl_file:
        workflow = Workflow.from_cwl(yaml.safe_load(cwl_file))
        first_step = workflow.steps[0]
        assert first_step.run.startswith('.renku/workflow/')

    # The log without arguments must match the log of the single output.
    log_default = runner.invoke(cli.cli, ['log'])
    log_for_file = runner.invoke(cli.cli, ['log', 'counted.txt'])

    assert log_default.exit_code == 0
    assert log_for_file.exit_code == 0
    assert log_default.output == log_for_file.output
Beispiel #3
0
def test_workflow(runner):
    """Test workflow command.

    Runs ``touch`` and ``wc`` through the CLI, creates a workflow for the
    captured ``wc`` output and checks that the first step of the generated
    CWL file points into ``.renku/workflow/``.
    """
    result = runner.invoke(cli.cli, ['run', 'touch', 'data.csv'])
    assert result.exit_code == 0

    # Redirect the standard output of the ``wc`` run into ``counted.txt``.
    with open('counted.txt', 'w') as stdout:
        with contextlib.redirect_stdout(stdout):
            try:
                cli.cli.main(
                    args=('run', 'wc', 'data.csv'),
                    prog_name=runner.get_default_prog_name(cli.cli),
                )
            except SystemExit as e:
                assert e.code in {None, 0}

    result = runner.invoke(
        cli.cli, ['workflow', 'create', 'counted.txt', '-o', 'workflow.cwl'])
    assert result.exit_code == 0

    with open('workflow.cwl', 'r') as f:
        # ``yaml.load`` without an explicit loader is deprecated since
        # PyYAML 5.1 and raises ``TypeError`` on PyYAML 6.
        workflow = Workflow.from_cwl(yaml.safe_load(f))
        assert workflow.steps[0].run.startswith('.renku/workflow/')
Beispiel #4
0
    def add_tool(self,
                 commit,
                 path,
                 file_key=None,
                 expand_workflow=True,
                 is_step=False):
        """Add a tool and its dependencies to the graph.

        The CWL document stored at ``path`` in the tree of ``commit`` is
        parsed as a ``CommandLineTool``. If that raises ``TypeError`` the
        document is handled as a workflow instead.

        :param commit: Commit object whose tree holds the CWL file.
        :param path: Path of the CWL file inside the commit tree.
        :param file_key: Optional two-item key of an output file node. When
            given, an edge from the tool to this node is added if the tool
            reports an output id for the second item of the key.
        :param expand_workflow: When true, a document that is not a command
            line tool is handed over to ``add_workflow``.
        :param is_step: When true, only the tool node is added; no input or
            output edges are created.
        :returns: Key of the added tool node, or the result of
            ``add_workflow`` when the workflow is expanded.
        """
        data = (commit.tree / path).data_stream.read()
        # ``yaml.load`` without an explicit loader is deprecated since
        # PyYAML 5.1 and raises ``TypeError`` on PyYAML 6; the safe loader
        # also refuses to construct arbitrary Python objects.
        cwl = yaml.safe_load(data)

        try:
            tool = CommandLineTool.from_cwl(cwl)
        except TypeError:
            if expand_workflow:
                return self.add_workflow(commit,
                                         path,
                                         file_key=file_key,
                                         cwl=cwl)
            tool = Workflow.from_cwl(cwl)

        tool_key = self.add_node(commit, path, tool=tool)

        if is_step:
            return tool_key

        for input_path, input_id in self.iter_file_inputs(
                tool, os.path.dirname(path)):
            # Inputs are looked up in the parent revision of ``commit``.
            input_key = self.add_file(input_path,
                                      revision='{0}^'.format(commit))
            #: Edge from an input to the tool.
            self.G.add_edge(input_key, tool_key, id=input_id)

        if file_key:
            # Use a dedicated name instead of rebinding the ``path``
            # parameter.
            _, output_path = file_key
            output_id = tool.get_output_id(output_path)
            if output_id:
                self.G.add_edge(tool_key, file_key, id=output_id)

        return tool_key
Beispiel #5
0
    def add_tool(self,
                 commit,
                 path,
                 file_key=None,
                 expand_workflow=True,
                 is_step=False):
        """Add a tool and its dependencies to the graph.

        The CWL document stored at ``path`` in the tree of ``commit`` is
        parsed as a ``CommandLineTool``. If that raises ``TypeError`` the
        document is handled as a workflow instead.

        :param commit: Commit object whose tree holds the CWL file.
        :param path: Path of the CWL file inside the commit tree.
        :param file_key: Only forwarded to ``add_workflow`` when the
            workflow is expanded.
        :param expand_workflow: When true, a document that is not a command
            line tool is handed over to ``add_workflow``.
        :param is_step: When true, only the tool node is added; no input or
            output edges are created.
        :returns: Key of the added tool node, or the result of
            ``add_workflow`` when the workflow is expanded.
        """
        data = (commit.tree / path).data_stream.read()
        # ``yaml.load`` without an explicit loader is deprecated since
        # PyYAML 5.1 and raises ``TypeError`` on PyYAML 6; the safe loader
        # also refuses to construct arbitrary Python objects.
        cwl = yaml.safe_load(data)

        try:
            tool = CommandLineTool.from_cwl(cwl)
        except TypeError:
            if expand_workflow:
                return self.add_workflow(commit,
                                         path,
                                         file_key=file_key,
                                         cwl=cwl)
            tool = Workflow.from_cwl(cwl)

        tool_key = self.add_node(commit, path, tool=tool)

        if is_step:
            return tool_key

        for input_id, input_path in self.iter_input_files(
                tool, os.path.dirname(path)):
            # Inputs are looked up in the parent revision of ``commit``.
            input_key = self.add_file(input_path,
                                      revision='{0}^'.format(commit))
            #: Edge from an input to the tool.
            self.G.add_edge(input_key, tool_key, id=input_id)

        # Find ALL siblings that MUST be generated in the same commit.
        # A dedicated loop name avoids rebinding the ``path`` parameter.
        for output_id, output_path in self.iter_output_files(tool):
            self.G.add_edge(tool_key, (str(commit), output_path),
                            id=output_id)

        return tool_key
Beispiel #6
0
    def ascwl(
        self,
        input_paths=None,
        output_paths=None,
        outputs=None,
        use_latest=True,
    ):
        """Serialize graph to CWL workflow.

        :param input_paths: Optional container of paths; dependencies whose
            path is in it are not followed while collecting process runs.
        :param output_paths: Optional set of paths to expose as workflow
            outputs. When ``None`` it is built from the paths of ``outputs``
            accepted by ``_safe_path``. The set is extended in place.
        :param outputs: Set of nodes whose producing process runs seed the
            workflow. It is iterated and extended in place.
            NOTE(review): the ``None`` default is iterated without a guard,
            so callers apparently have to pass a value -- confirm.
        :param use_latest: When true, a plan is replaced by the node stored
            for the commit returned by ``self.latest`` if there is one.
        :returns: The assembled ``Workflow``.
        """
        if output_paths is None:
            output_paths = {
                node.path
                for node in outputs if _safe_path(node.path)
            }

        workflow = Workflow()

        # Process runs that become workflow steps, plus the work list used
        # to discover them.
        processes = set()
        stack = []

        # Both lookups are keyed by ``(commit, path)``.
        output_keys = {(node.commit, node.path) for node in outputs}
        nodes = {(node.commit, node.path): node for node in self.nodes}

        def connect_file_to_directory(node):
            """Return step connecting file to a directory.

            A ``ProcessRun`` based on ``LINK_CWL`` is created that takes the
            parent of ``node`` as ``input_directory`` and declares the path
            of ``node`` as ``output_file``. Everything it generates is
            registered in the enclosing ``nodes`` mapping.
            """
            process = attr.evolve(
                LINK_CWL,
                inputs={
                    'input_directory': 'Directory',
                    'filename': {
                        'type':
                        'string',
                        'default':
                        str(Path(node.path).relative_to(node.parent.path)),
                    },
                })
            process_run = ProcessRun(
                commit=node.commit,
                client=node.client,
                path=None,
                process=process,
                inputs={
                    node.parent.path:
                    Usage(
                        entity=node.parent,
                        role='input_directory',
                    ),
                },
                outputs={
                    node.path: 'output_file',
                },
            )

            # Make the generated entities resolvable by ``_source_name``.
            for generated in process_run.generated:
                nodes[(generated.commit, generated.path)] = generated

            return process_run

        # Seed the work list with the process runs behind requested outputs.
        for node in self.nodes:
            if (node.commit, node.path) not in output_keys:
                continue

            process_run = None
            if isinstance(node, Entity) and not hasattr(node, 'activity'):
                # An entity without an activity is linked to its directory
                # by a synthetic step.
                process_run = connect_file_to_directory(node)

                stack.append(process_run)
                processes.add(process_run)

            else:
                assert hasattr(node, 'activity'), node
                assert isinstance(node.activity, ProcessRun)

                plan = node.activity.association.plan
                latest = self.latest(plan)
                if use_latest and latest:
                    plan = nodes[(latest, plan.path)]

                process_run = plan.activity

                if process_run not in processes:
                    stack.append(process_run)
                    processes.add(process_run)

        # Follow the inputs of every collected process run.
        while stack:
            action = stack.pop()

            if not hasattr(action, 'inputs'):
                continue

            for path, dependency in action.inputs.items():
                # Do not follow defined input paths.
                if input_paths and path in input_paths:
                    continue

                # Fall back to the dependency itself when it is not a known
                # graph node.
                node = nodes.get((dependency.commit, dependency.path),
                                 dependency)

                if isinstance(node, Generation):
                    process_run = node.activity
                elif isinstance(node, Collection) and node.parent:
                    raise NotImplementedError('Can not connect subdirectory')
                elif isinstance(node, Entity) and node.parent:
                    process_run = connect_file_to_directory(node)
                else:
                    process_run = None

                # Skip existing commits
                if process_run and isinstance(process_run, ProcessRun):
                    plan = process_run.association.plan
                    latest = self.latest(plan)
                    # Runs without a path keep their own plan.
                    if process_run.path and use_latest and latest:
                        plan = nodes[(latest, plan.path)]

                    process_run = plan.activity

                    if process_run not in processes:
                        stack.append(process_run)
                        processes.add(process_run)

        # ``processes`` is a set, so the numbering follows its iteration
        # order.
        steps = {
            tool: 'step_{0}'.format(tool_index)
            for tool_index, tool in enumerate(processes, 1)
        }

        def _source_name(commit, path):
            """Find source name for a node.

            Returns ``<step id>/<output id>`` or ``None`` when a lookup
            fails with ``KeyError`` or ``AttributeError``.
            """
            try:
                process_run = nodes[(commit, path)].activity
                output_id = process_run.outputs[path]
                return '{0}/{1}'.format(steps[process_run], output_id)
            except (KeyError, AttributeError):
                pass

        def _relative_default(client, default):
            """Evolve ``File`` or ``Directory`` path."""
            if isinstance(default, PATH_TYPES):
                path = (client.workflow_path / default.path).resolve()
                return attr.evolve(default, path=path)
            return default

        # Counter shared by all steps so that workflow input ids are unique.
        input_index = 1

        for action, step_id in steps.items():
            tool = action.process

            # Inputs produced by another step are wired to that step.
            ins = {}
            for path, dependency in action.inputs.items():
                alias = _source_name(dependency.commit, path)
                if alias:
                    ins[dependency.role] = alias

            outs = list(set(action.outputs.values()))

            # Everything a step generates becomes a candidate output; this
            # extends the caller supplied ``output_paths`` and ``outputs``.
            for generated in action.generated:
                if generated.entity.path not in output_paths:
                    output_paths.add(generated.entity.path)
                    outputs.add(generated.entity)

            # Tool inputs without a producing step become workflow inputs.
            for input_ in tool.inputs:
                input_mapping = ins.get(input_.id)
                if input_mapping is None:
                    input_id = 'input_{0}'.format(input_index)
                    workflow.inputs.append(
                        InputParameter(
                            id=input_id,
                            type=input_.type,
                            default=_relative_default(self.client,
                                                      input_.default),
                        ))
                    input_index += 1
                    ins[input_.id] = input_id

            # Runs without a path embed the process itself.
            workflow.add_step(
                run=self.client.path / action.path if action.path else tool,
                id=step_id,
                in_=ins,
                out=outs,
            )

        # The index is taken before filtering, so skipped nodes leave gaps
        # in the ``output_N`` numbering.
        for index, node in enumerate(
            (node for node in outputs if node.path in output_paths)):
            commit, path = node.commit, node.path
            id_ = 'output_{0}'.format(index)
            process_run = nodes[(commit, path)].activity

            if process_run.process is None or process_run.path is None:
                continue

            output_id = process_run.outputs[path]
            type_ = next(output for output in process_run.process.outputs
                         if output.id == output_id).type
            # Every type other than ``Directory`` is exposed as ``File``.
            type_ = type_ if type_ == 'Directory' else 'File'
            output_source = _source_name(commit, path)

            if output_source is None:
                continue

            workflow.outputs.append(
                WorkflowOutputParameter(
                    id=id_,
                    type=type_,
                    outputSource=output_source,
                ))

        return workflow
Beispiel #7
0
    def ascwl(self, global_step_outputs=False):
        """Build a CWL ``Workflow`` from the tool nodes of the graph.

        :param global_step_outputs: Expose the targets of all step output
            edges as workflow outputs instead of ``self._output_keys``.
        :returns: The assembled ``Workflow``.
        """
        workflow = Workflow()
        steps = {}
        next_input = 1

        def _source_name(key):
            """Return ``<step id>/<edge id>`` of the producer of ``key``."""
            degree = self.G.in_degree(key)
            if degree == 0:
                return None

            assert degree == 1

            producer_key, edge_data = next(iter(self.G.pred[key].items()))
            return '{0}/{1}'.format(steps[producer_key], edge_data['id'])

        def _relative_default(client, default):
            """Resolve the path of a ``File`` default; pass others through."""
            if not isinstance(default, File):
                return default

            resolved = (client.workflow_path / default.path).resolve()
            return attr.evolve(default, path=resolved)

        for position, (key, attributes) in enumerate(self._tool_nodes, 1):
            _, tool_path = key
            tool = attributes['tool']
            step_id = 'step_{0}'.format(position)
            steps[key] = step_id

            # Map every incoming edge id to the step output feeding it.
            ins = {}
            for source_key, _, edge_id in self.G.in_edges(key, data='id'):
                ins[edge_id] = _source_name(source_key)

            outs = []
            for _, _, edge_id in self.G.out_edges(key, data='id'):
                outs.append(edge_id)

            # Tool inputs without a producing step become workflow inputs.
            for parameter in tool.inputs:
                if ins.get(parameter.id) is not None:
                    continue

                input_id = 'input_{0}'.format(next_input)
                next_input += 1

                workflow_input = InputParameter(
                    id=input_id,
                    type=parameter.type,
                    default=_relative_default(self.client, parameter.default),
                )
                workflow.inputs.append(workflow_input)
                ins[parameter.id] = input_id

            workflow.add_step(
                run=self.client.path / tool_path,
                id=step_id,
                in_=ins,
                out=outs,
            )

        if global_step_outputs:
            output_keys = (
                target for _, target in self.G.out_edges(steps.keys()))
        else:
            output_keys = self._output_keys

        for position, key in enumerate(output_keys):
            workflow_output = WorkflowOutputParameter(
                id='output_{0}'.format(position),
                type='File',
                outputSource=_source_name(key),
            )
            workflow.outputs.append(workflow_output)

        return workflow
Beispiel #8
0
    def add_workflow(self, commit, path, cwl=None, file_key=None):
        """Add a workflow and its dependencies to the graph.

        Every step of the workflow is added as a tool node. Step inputs are
        connected either to files changed in ``commit`` or to the workflow
        level inputs, and step outputs are connected to file nodes of the
        same commit.

        :param commit: Commit object whose tree holds the CWL file.
        :param path: Path of the workflow CWL file inside the commit tree.
        :param cwl: Already parsed CWL document; read from ``commit`` when
            ``None``.
        :param file_key: Forwarded unchanged to ``add_tool`` for each step.
        :returns: The parsed ``Workflow``.
        """
        if cwl is None:
            data = (commit.tree / path).data_stream.read()
            # ``yaml.load`` without an explicit loader is deprecated since
            # PyYAML 5.1 and raises ``TypeError`` on PyYAML 6; the safe
            # loader also refuses to construct arbitrary Python objects.
            cwl = yaml.safe_load(data)

        workflow = Workflow.from_cwl(cwl)
        basedir = os.path.dirname(path)

        # Keep track of node identifiers for steps, inputs and outputs:
        step_map = {}
        input_map = {}
        output_map = {}

        #: First find workflow inputs, but don't connect them yet.
        for input_id, input_path in self.iter_input_files(workflow, basedir):
            input_key = self.add_file(input_path,
                                      revision='{0}^'.format(commit))
            input_map[input_id] = input_key

        for step in workflow.steps:
            tool_key = self.add_tool(
                commit,
                os.path.join(basedir, step.run),
                file_key=file_key,
                is_step=True,
            )

            step_tool = self.G.nodes[tool_key]['tool']

            for input_id, input_path in self.iter_input_files(
                    step_tool, basedir):
                if input_path in commit.stats.files:
                    #: Check intermediate committed files
                    input_key = self.add_node(commit, input_path)
                    #: Edge from an input to the tool.
                    self.G.add_edge(input_key, tool_key, id=input_id)
                else:
                    #: Global workflow input
                    source = step.in_[input_id]
                    self.G.add_edge(input_map[source], tool_key, id=input_id)

            # Find ALL siblings that MUST be generated in the same commit.
            for output_id, output_path in self.iter_output_files(step_tool):
                self.G.add_edge(tool_key, (str(commit), output_path),
                                id=output_id)

            # Maps ``<step id>/<input id>`` to the node feeding that input.
            output_map.update({
                step.id + '/' + name: target
                for target, _, name in self.G.in_edges(tool_key, data='id')
            })
            step_map[step.id] = tool_key

            self.G.nodes[tool_key]['workflow'] = workflow
            self.G.nodes[tool_key][
                'workflow_path'] = path + '#steps/' + step.id

        # Connect steps whose inputs refer to the output of another step
        # (sources of the form ``<step id>/<output id>``).
        for step in workflow.steps:
            for alias, source in step.in_.items():
                name = step.id + '/' + alias

                if name in output_map and '/' in source:
                    other_step, id_ = source.split('/')
                    other_key = step_map[other_step]
                    self.G.add_edge(other_key, output_map[name], id=id_)

        return workflow
Beispiel #9
0
    def ascwl(
        self,
        input_paths=None,
        output_paths=None,
        outputs=None,
        use_latest=True,
    ):
        """Serialize graph to CWL workflow.

        :param input_paths: Optional container of paths; dependencies whose
            path is in it are not followed while collecting process runs.
        :param output_paths: Optional set of paths to expose as workflow
            outputs. When ``None`` it is built from the paths of ``outputs``
            accepted by ``_safe_path``. The set is extended in place.
        :param outputs: Set of nodes whose producing process runs seed the
            workflow. It is iterated and extended in place.
            NOTE(review): the ``None`` default is iterated without a guard,
            so callers apparently have to pass a value -- confirm.
        :param use_latest: When true, a process run is replaced by the node
            stored for the commit returned by ``self.latest`` if there is
            one.
        :returns: The assembled ``Workflow``.
        """
        if output_paths is None:
            output_paths = {
                node.path
                for node in outputs if _safe_path(node.path)
            }

        workflow = Workflow()

        # Process runs that become workflow steps, plus the work list used
        # to discover them.
        processes = set()
        stack = []

        # Both lookups are keyed by ``(commit, path)``.
        output_keys = {(node.commit, node.path) for node in outputs}
        nodes = {(node.commit, node.path): node for node in self.nodes}

        # Seed the work list with the process runs behind requested outputs.
        for node in self.nodes:
            if (node.commit, node.path) not in output_keys:
                continue

            process_run = None
            if isinstance(node, ProcessRun):
                process_run = node
            elif isinstance(node.activity, ProcessRun):
                process_run = node.activity

            if process_run:
                latest = self.latest(process_run)
                if use_latest and latest:
                    process_run = nodes[(latest, process_run.path)]

                if process_run not in processes:
                    stack.append(process_run)
                    processes.add(process_run)

        # Follow the inputs of every collected process run.
        while stack:
            action = stack.pop()

            if not hasattr(action, 'inputs'):
                continue

            for path, dependency in action.inputs.items():
                # Do not follow defined input paths.
                if input_paths and path in input_paths:
                    continue

                # Nodes without an ``activity`` attribute are not followed.
                # NOTE(review): a key missing from ``nodes`` raises
                # ``KeyError`` here, which is not caught -- confirm intended.
                try:
                    process_run = nodes[(dependency.commit,
                                         dependency.path)].activity
                except AttributeError:
                    continue

                # Skip existing commits
                if process_run and isinstance(process_run, ProcessRun):
                    latest = self.latest(process_run)
                    if use_latest and latest:
                        process_run = nodes[(latest, process_run.path)]

                    if process_run not in processes:
                        stack.append(process_run)
                        processes.add(process_run)

        # ``processes`` is a set, so the numbering follows its iteration
        # order.
        steps = {
            tool: 'step_{0}'.format(tool_index)
            for tool_index, tool in enumerate(processes, 1)
        }

        def _source_name(commit, path):
            """Find source name for a node.

            Returns ``<step id>/<output id>`` or ``None`` when a lookup
            fails with ``KeyError`` or ``AttributeError``.
            """
            try:
                process_run = nodes[(commit, path)].activity
                output_id = process_run.outputs[path]
                return '{0}/{1}'.format(steps[process_run], output_id)
            except (KeyError, AttributeError):
                pass

        def _relative_default(client, default):
            """Evolve ``File`` or ``Directory`` path."""
            if isinstance(default, PATH_TYPES):
                path = (client.workflow_path / default.path).resolve()
                return attr.evolve(default, path=path)
            return default

        # Counter shared by all steps so that workflow input ids are unique.
        input_index = 1

        for action, step_id in steps.items():
            tool = action.process

            # Inputs produced by another step are wired to that step.
            ins = {}
            for path, dependency in action.inputs.items():
                alias = _source_name(dependency.commit, path)
                if alias:
                    ins[dependency.role] = alias

            outs = list(set(action.outputs.values()))

            # Everything a step generates becomes a candidate output; this
            # extends the caller supplied ``output_paths`` and ``outputs``.
            for generated in action.generated:
                if generated.entity.path not in output_paths:
                    output_paths.add(generated.entity.path)
                    outputs.add(generated.entity)

            # Tool inputs without a producing step become workflow inputs.
            for input_ in tool.inputs:
                input_mapping = ins.get(input_.id)
                if input_mapping is None:
                    input_id = 'input_{0}'.format(input_index)
                    workflow.inputs.append(
                        InputParameter(
                            id=input_id,
                            type=input_.type,
                            default=_relative_default(self.client,
                                                      input_.default),
                        ))
                    input_index += 1
                    ins[input_.id] = input_id

            workflow.add_step(
                run=self.client.path / action.path,
                id=step_id,
                in_=ins,
                out=outs,
            )

        # The index is taken before filtering, so skipped nodes leave gaps
        # in the ``output_N`` numbering.
        for index, node in enumerate(
            (node for node in outputs if node.path in output_paths)):
            commit, path = node.commit, node.path
            id_ = 'output_{0}'.format(index)
            process_run = nodes[(commit, path)].activity

            if process_run.process is None:
                continue

            output_id = process_run.outputs[path]
            type_ = next(output for output in process_run.process.outputs
                         if output.id == output_id).type
            # Every type other than ``Directory`` is exposed as ``File``.
            type_ = type_ if type_ == 'Directory' else 'File'
            output_source = _source_name(commit, path)

            if output_source is None:
                continue

            workflow.outputs.append(
                WorkflowOutputParameter(
                    id=id_,
                    type=type_,
                    outputSource=output_source,
                ))

        return workflow