Beispiel #1
0
class Generation(EntityProxyMixin):
    """Qualified generation tying a produced entity to its activity."""

    # The generated entity; serialized as the reverse side of
    # ``prov:qualifiedGeneration``.
    entity = jsonld.ib(
        context={'@reverse': 'prov:qualifiedGeneration'},
        type=[
            'renku.core.models.entities.Entity',
            'renku.core.models.entities.Collection',
            Dataset,
            DatasetFile,
        ],
    )

    role = jsonld.ib(context='prov:hadRole', default=None)

    # Kept as a weak reference to the activity (or ``None`` when unset).
    _activity = attr.ib(
        default=None,
        kw_only=True,
        converter=lambda value: None if value is None else weakref.ref(value),
    )
    _id = jsonld.ib(context='@id', kw_only=True)

    @property
    def activity(self):
        """Dereference the stored weak reference; ``None`` when unset."""
        if self._activity is None:
            return None
        return self._activity()

    @_id.default
    def default_id(self):
        """Compute the identifier from the activity id and role or path.

        With a role the id ends in ``/outputs/<role>``; otherwise it ends
        in ``/tree/<entity path>``.
        """
        if self.role:
            template = '{self.activity._id}/outputs/{self.role}'
        else:
            template = '{self.activity._id}/tree/{self.entity.path}'
        return template.format(self=self)
class MappedIOStream(object):
    """Mapping onto one of the standard IO streams (stdin/stdout/stderr)."""

    _id = jsonld.ib(context='@id', kw_only=True)
    _label = jsonld.ib(default=None, context='rdfs:label', kw_only=True)

    STREAMS = ['stdin', 'stdout', 'stderr']

    stream_type = jsonld.ib(
        context={
            '@id': 'renku:streamType',
            '@type': 'http://www.w3.org/2001/XMLSchema#string',
        },
        type=str,
        kw_only=True,
    )

    @_id.default
    def default_id(self):
        """Generate a random blank-node identifier without dashes."""
        # TODO: make bnode ids nicer once this issue is in a release:
        # https://github.com/RDFLib/rdflib/issues/888
        # right now it's limited to a-zA-Z0-9 (-_ will work once it's fixed)
        raw_id = '_:MappedIOStream-{}'.format(str(uuid.uuid4()))
        # Every dash is stripped, including the one after the class name.
        return raw_id.replace('-', '')

    def default_label(self):
        """Build the human-readable label from the stream type."""
        return 'Stream mapping for stream "{}"'.format(self.stream_type)

    def __attrs_post_init__(self):
        """Fill in the label after initialization when none was given."""
        if self._label:
            return
        self._label = self.default_label()
Beispiel #3
0
class Association:
    """Tie an activity's plan to the agent responsible for it."""

    plan = jsonld.ib(
        context='prov:hadPlan',
        type='renku.core.models.provenance.processes.Process',
    )
    agent = jsonld.ib(
        context='prov:agent',
        default=None,
        type='renku.core.models.provenance.agents.SoftwareAgent',
    )

    _id = jsonld.ib(context='@id', kw_only=True)

    @classmethod
    def from_activity(cls, activity, commit=None):
        """Build an association for ``activity``.

        The plan is created through ``activity.__association_cls__`` using
        ``commit`` when given and ``activity.commit`` otherwise; the agent
        is derived from ``activity.commit``.
        """
        from .agents import SoftwareAgent

        responsible = SoftwareAgent.from_commit(activity.commit)
        plan = activity.__association_cls__(
            commit=commit or activity.commit,
            client=activity.client,
            path=activity.path,
            activity=activity,
        )
        # The association id is namespaced under the activity id.
        association_id = activity._id + '/association'
        return cls(plan=plan, agent=responsible, id=association_id)
class CommandParameter(object):
    """Base parameter of an execution template."""

    _id = jsonld.ib(default=None, context='@id', kw_only=True)
    _label = jsonld.ib(default=None, context='rdfs:label', kw_only=True)

    position = jsonld.ib(
        default=None,
        context={
            '@id': 'renku:position',
            '@type': 'http://www.w3.org/2001/XMLSchema#integer',
        },
        type=int,
        kw_only=True,
    )

    prefix = jsonld.ib(
        default=None,
        context={
            '@id': 'renku:prefix',
            '@type': 'http://www.w3.org/2001/XMLSchema#string',
        },
        type=str,
        kw_only=True,
    )

    @property
    def sanitized_id(self):
        """Return ``_id`` without its prefix and with dashes replaced.

        Everything up to and including the first ``:`` is dropped and each
        ``-`` in the remainder becomes ``_``.
        """
        without_prefix = self._id.split(':', 1)[1]
        return without_prefix.replace('-', '_')
class Annotation:
    """Custom annotation attached to a research object."""

    _id = jsonld.ib(
        context='@id',
        kw_only=True,
    )

    # Annotation payload, serialized as ``oa:hasBody``.
    body = jsonld.ib(
        default=None,
        context='oa:hasBody',
        kw_only=True,
    )

    # Origin of the annotation, serialized as ``dcterms:creator``.
    source = jsonld.ib(
        default=None,
        context='dcterms:creator',
        kw_only=True,
    )
Beispiel #6
0
class CommitMixin:
    """Mixin for objects identified by a commit and a path."""

    commit = attr.ib(default=None, kw_only=True)
    client = attr.ib(default=None, kw_only=True)
    path = jsonld.ib(
        context='prov:atLocation',
        default=None,
        kw_only=True,
        converter=_str_or_none,
    )

    _id = jsonld.ib(context='@id', kw_only=True)
    _label = jsonld.ib(context='rdfs:label', kw_only=True)
    _project = jsonld.ib(
        context='schema:isPartOf',
        type=Project,
        kw_only=True,
        default=None,
    )

    @property
    def submodules(self):
        """Expose the submodules of the attached client."""
        return self.client.submodules

    @_id.default
    def default_id(self):
        """Compute the id from the commit hash and the path.

        ``UNCOMMITTED`` stands in for the hash when no commit is set.
        """
        hexsha = self.commit.hexsha if self.commit else 'UNCOMMITTED'
        return 'file://blob/{hexsha}/{self.path}'.format(
            hexsha=hexsha,
            self=self,
        )

    @_label.default
    def default_label(self):
        """Compute the label: ``<path>@<hexsha>`` or just ``<hexsha>``."""
        hexsha = self.commit.hexsha if self.commit else 'UNCOMMITTED'
        if not self.path:
            return '{hexsha}'.format(hexsha=hexsha, self=self)
        return '{self.path}@{hexsha}'.format(hexsha=hexsha, self=self)

    def __attrs_post_init__(self):
        """Normalize the path and bind the project after initialization."""
        if self.path:
            candidate = Path(self.path)
            if candidate.is_absolute():
                # Absolute paths are stored relative to the client path.
                self.path = str(candidate.relative_to(self.client.path))

        # The project always mirrors the client's current project.
        if self.client:
            self._project = self.client.project
class CommandOutput(CommandParameter):
    """Output parameter of a command."""

    create_folder = jsonld.ib(
        default=False,
        context='renku:createFolder',
        kw_only=True,
        type=bool,
    )

    produces = jsonld.ib(
        context='renku:produces',
        kw_only=True,
        type=[
            'renku.core.models.entities.Entity',
            'renku.core.models.entities.Collection',
        ],
    )

    mapped_to = jsonld.ib(
        default=None,
        context='prov:mappedTo',
        kw_only=True,
        type=MappedIOStream,
    )

    def default_id(self):
        """Generate a random blank-node identifier without dashes."""
        raw_id = '_:CommandOutput-{}'.format(str(uuid.uuid4()))
        return raw_id.replace('-', '')

    def default_label(self):
        """Build the label from the path of the produced entity."""
        return 'Command Output "{}"'.format(self.produces.path)

    def to_argv(self):
        """Return the command-line tokens for this output.

        Without a prefix the result is just the produced path. A prefix
        ending in a space yields two tokens (prefix without that space,
        then the path); any other prefix is glued to the path.
        """
        if not self.prefix:
            return [self.produces.path]

        if self.prefix.endswith(' '):
            return [self.prefix[:-1], self.produces.path]

        return ['{}{}'.format(self.prefix, self.produces.path)]

    def to_stream_repr(self):
        """Return the output redirection for this output.

        Empty when no stream is mapped; `` > path`` for ``stdout`` and
        `` 2> path`` for every other stream type.
        """
        if not self.mapped_to:
            return ''

        if self.mapped_to.stream_type == 'stdout':
            redirect = ' > {}'
        else:
            redirect = ' 2> {}'
        return redirect.format(self.produces.path)

    def __attrs_post_init__(self):
        """Fill in a missing id and label after initialization."""
        if not self._id:
            self._id = self.default_id()

        if not self._label:
            self._label = self.default_label()
Beispiel #8
0
class Collection(Entity):
    """Directory entity whose members are its files and subdirectories."""

    members = jsonld.ib(context='prov:hadMember', kw_only=True)

    @members.default
    def default_members(self):
        """Create member entities for every entry of the directory.

        Subdirectories become ``Collection`` instances and everything else
        an ``Entity``; each member shares this collection's commit and
        client and has this collection as ``parent``.
        """
        dir_path = self.client.path / self.path
        assert dir_path.is_dir()

        return [
            (Collection if child.is_dir() else Entity)(
                commit=self.commit,
                client=self.client,
                path=str(child.relative_to(self.client.path)),
                parent=self,
            )
            for child in dir_path.iterdir()
            # ``.gitkeep`` only marks empty directories in Git; skip it.
            if child.name != '.gitkeep'
        ]

    @property
    def entities(self):
        """Yield the entities of every member, then this collection."""
        for member in self.members:
            for nested in member.entities:
                yield nested
        yield self
Beispiel #9
0
class Workflow(Process):
    """Process composed of subprocesses."""

    subprocesses = jsonld.ib(
        context='wfdesc:hasSubProcess',
        kw_only=True,
    )

    @subprocesses.default
    def default_subprocesses(self):
        """Collect the plan of each of the activity's subprocesses."""
        plans = []
        for run in self.activity.subprocesses.values():
            plans.append(run.association.plan)
        return plans
Beispiel #10
0
class Usage(EntityProxyMixin):
    """Qualified usage of an entity (a dependency on a path)."""

    entity = jsonld.ib(
        context='prov:entity',
        kw_only=True,
        type=[
            'renku.core.models.entities.Entity',
            'renku.core.models.entities.Collection',
            Dataset,
            DatasetFile,
        ],
    )
    role = jsonld.ib(context='prov:hadRole', default=None, kw_only=True)

    _id = jsonld.ib(context='@id', default=None, kw_only=True)

    @classmethod
    def from_revision(cls, client, path, revision='HEAD', **kwargs):
        """Create a usage of the entity at ``path`` for ``revision``.

        Extra keyword arguments are forwarded to the constructor.
        """
        from renku.core.models.entities import Entity

        used_entity = Entity.from_revision(client, path, revision)
        return cls(entity=used_entity, **kwargs)
class Process(CommitMixin):
    """Process that belongs to an activity."""

    # Required; the given activity is stored as a weak reference.
    _activity = jsonld.ib(
        context='prov:activity',
        kw_only=True,
        converter=weakref.ref,
        type='renku.core.models.provenance.activities.Activity',
    )

    @property
    def activity(self):
        """Dereference the weak reference to the activity."""
        activity_ref = self._activity
        return activity_ref()
Beispiel #12
0
class SoftwareAgent:
    """Software that executed something."""

    label = jsonld.ib(context='rdfs:label', kw_only=True)
    was_started_by = jsonld.ib(
        context='prov:wasStartedBy',
        default=None,
        kw_only=True,
    )

    _id = jsonld.ib(context='@id', kw_only=True)

    @classmethod
    def from_commit(cls, commit):
        """Create an agent from a Git commit.

        When author and committer differ, the committer becomes a software
        agent started by the author; otherwise the author ``Person`` itself
        is returned.
        """
        author = Person.from_commit(commit)
        if commit.author != commit.committer:
            agent_fields = dict(
                label=commit.committer.name,
                id=commit.committer.email,
                was_started_by=author,
            )
            return cls(**agent_fields)
        return author
Beispiel #13
0
class Process(CommitMixin):
    """Represent a process."""

    # Optional; a given activity is stored as a weak reference, while
    # ``None`` is kept as ``None`` by the converter.
    _activity = jsonld.ib(
        default=None,
        context='prov:activity',
        kw_only=True,
        converter=lambda value: weakref.ref(value)
        if value is not None else None,
        type='renku.core.models.provenance.activities.Activity')

    @property
    def activity(self):
        """Return the activity object, or ``None`` when none is set.

        ``_activity`` defaults to ``None``, so it must be checked before
        being called as a weak reference; calling ``None`` would raise
        ``TypeError``.
        """
        if self._activity is None:
            return None
        return self._activity()
class CommandArgument(CommandParameter):
    """Plain command argument that is neither an input nor an output."""

    value = jsonld.ib(
        default=None,
        context={
            '@id': 'renku:value',
            '@type': 'http://www.w3.org/2001/XMLSchema#string',
        },
        type=str,
        kw_only=True,
    )

    def default_id(self):
        """Generate a random blank-node identifier without dashes."""
        raw_id = '_:CommandArgument-{}'.format(str(uuid.uuid4()))
        return raw_id.replace('-', '')

    def default_label(self):
        """Build the label from the argument value."""
        return 'Command Argument "{}"'.format(self.value)

    def to_argv(self):
        """Return the command-line tokens for this argument.

        Without a prefix the result is just the value. A prefix ending in
        a space yields two tokens (prefix without that space, then the
        value); any other prefix is glued to the value.
        """
        if not self.prefix:
            return [self.value]

        if self.prefix.endswith(' '):
            return [self.prefix[:-1], self.value]

        return ['{}{}'.format(self.prefix, self.value)]

    def __attrs_post_init__(self):
        """Fill in a missing id and label after initialization."""
        if not self._id:
            self._id = self.default_id()

        if not self._label:
            self._label = self.default_label()
class WorkflowRun(ProcessRun):
    """A workflow run typically contains several subprocesses.

    The post-init hook fills in ``_id``, ``inputs``, ``subprocesses`` and
    ``generated`` from their ``default_*`` methods whenever they are empty.
    """

    # Class used by ``Association.from_activity`` to build the plan.
    __association_cls__ = Workflow

    # @reverse wfprov:wasPartOfWorkflowRun

    # Mapping of step id to loaded step definition (see default_children).
    children = attr.ib(kw_only=True)

    # Subprocess runs in the order default_subprocesses created them.
    _processes = jsonld.ib(context={
        '@reverse': 'wfprov:wasPartOfWorkflowRun',
    },
                           default=attr.Factory(list),
                           kw_only=True,
                           type=Process)
    # Mapping of step id to subprocess run; filled in by the post-init hook.
    subprocesses = attr.ib(default=None, kw_only=True)

    outputs = attr.ib(default=None, kw_only=True)

    generated = jsonld.container.list(Generation,
                                      context={
                                          '@reverse': 'prov:activity',
                                      },
                                      kw_only=True)

    @children.default
    def default_children(self):
        """Load children from process.

        Returns a dict mapping each step id of ``self.process.steps`` to
        its loaded definition.
        """
        basedir = os.path.dirname(self.path) if self.path is not None else None

        def _load(step):
            """Load step definition."""
            # Steps whose ``run`` is already a definition are used as is.
            if isinstance(step.run, WORKFLOW_STEP_RUN_TYPES):
                return step.run

            # With a commit, the definition is read from the commit tree.
            # NOTE(review): ``basedir`` is ``None`` when ``self.path`` is
            # ``None``; confirm that case cannot reach this branch.
            if self.commit:
                import yaml
                data = (self.commit.tree / basedir /
                        step.run).data_stream.read()
                return CWLClass.from_cwl(yaml.safe_load(data))

            return CWLClass.from_yaml(step.run)

        return {step.id: _load(step) for step in self.process.steps}

    def default_subprocesses(self):
        """Load subprocesses.

        Creates one run per workflow step, wires step inputs to workflow
        inputs or to outputs of steps handled earlier in the loop, appends
        each run to ``self._processes`` and returns a dict mapping step id
        to run.
        """
        basedir = os.path.dirname(self.path)
        # presumably the parent revision of this run's commit - TODO confirm
        revision = '{0}^'.format(self.commit)

        # Workflow-level usages indexed by their role.
        ins = {
            dependency.role: dependency
            for path, dependency in self.inputs.items()
            if isinstance(dependency, Usage)
        }

        # entities: path -> entity generated by an already handled step.
        # outs: '<step id>/<source>' -> output path of an already handled
        # step.
        entities = {}
        outs = {}
        subprocesses = {}

        for step in reversed(self.process.topological_steps):
            if isinstance(step.run, WORKFLOW_STEP_RUN_TYPES):
                path = None
                process = step.run
            else:
                path = os.path.join(basedir, step.run)
                process = self.children[step.id]

            subprocess_id = self._id + '/steps/' + step.id

            inputs = {}
            for alias, source in step.in_.items():
                usage_id = subprocess_id + '/inputs/' + alias
                if source in ins:
                    # Source is a workflow input: reuse its usage under the
                    # step-local role and id.
                    dependency = ins[source]
                    inputs[dependency.path] = attr.evolve(
                        dependency,
                        role=alias,
                        id=usage_id,
                    )
                elif source in outs:
                    # Source is an output of an already handled step.
                    input_path = outs[source]
                    inputs[input_path] = Usage(
                        entity=entities[input_path],
                        role=alias,
                        id=usage_id,
                    )
                else:
                    # TODO check that it is not Path or Directory
                    pass

            subprocess_entity_commit = self.client.find_previous_commit(
                path, revision=revision)
            subprocess = process.create_run(
                commit=self.commit,
                client=self.client,
                part_of=self,
                process=process,
                path=path,
                inputs=inputs,
                id=subprocess_id,
            )

            subprocess.association = Association.from_activity(
                subprocess,
                commit=subprocess_entity_commit,
            )

            # Register outputs for later steps; the first registration of
            # a key wins because of ``setdefault``.
            for output_path, source in subprocess.outputs.items():
                outs.setdefault(step.id + '/' + source, output_path)

            for generation in subprocess.generated:
                entity = generation.entity
                entities[entity.path] = entity

                # Direct members of a collection become addressable too.
                if isinstance(entity, Collection):
                    entities.update(
                        **{member.path: member
                           for member in entity.members})

            subprocesses[step.id] = subprocess
            self._processes.append(subprocess)

        return subprocesses

    def iter_output_files(self, commit=None):
        """Yield tuples with output id and path.

        Reloads ``self.children`` as a side effect. Only outputs whose type
        is in ``PATH_OBJECTS`` are considered.
        """
        # NOTE(review): ``commit`` is assigned but not used below.
        commit = commit or self.commit

        tools = self.default_children()
        setattr(self, 'children', tools)

        for output in self.process.outputs:
            if output.type not in PATH_OBJECTS:
                continue

            if output.outputSource:
                step_id, _, source = output.outputSource.partition('/')
                subprocess = self.subprocesses[step_id]
                # NOTE(review): matches on ``output.id`` rather than on
                # ``source`` - confirm this is intended.
                for glob, output_id in subprocess.outputs.items():
                    if output.id == output_id:
                        yield output.id, glob
                        break
            elif output.outputBinding:
                glob = output.outputBinding.glob
                # TODO better support for Expression
                if glob.startswith('$(inputs.'):
                    # Strip the ``$(inputs.`` prefix and the last character.
                    input_id = glob[len('$(inputs.'):-1]
                    # NOTE(review): ``self.inputs`` is used as a mapping in
                    # default_subprocesses (``.items()``); iterating it here
                    # and reading ``.id``/``.default`` should be verified.
                    for input_ in self.inputs:
                        if input_.id == input_id:
                            yield output.id, input_.default
                else:
                    yield output.id, glob

    def default_generated(self):
        """Calculate default values.

        For every workflow output, finds the generation of the referenced
        step whose role equals the output source and returns a copy of it
        with the workflow output id as role and this run as activity.

        Raises ``KeyError`` when a step has no matching generation.
        """
        results = []
        for output in self.process.outputs:
            step_id, _, source = output.outputSource.partition('/')
            assert step_id in self.children

            for generated in self.subprocesses[step_id].generated:
                if generated.role == source:
                    results.append(
                        attr.evolve(
                            generated,
                            role=output.id,
                            activity=self,
                        ))
                    break
            else:
                # No generation of the step carries the expected role.
                raise KeyError(output)

        return results

    @property
    def nodes(self):
        """Yield all graph nodes.

        Subprocesses are visited in reverse order of ``self._processes``.
        """
        for subprocess in reversed(self._processes):
            if subprocess.path is None:
                # skip nodes connecting directory to file
                continue
            yield from subprocess.nodes

    def __attrs_post_init__(self):
        """Attrs post initializations.

        Each default is computed only when the attribute is empty; the
        order matters because later defaults read earlier ones.
        """
        if not self._id:
            self._id = self.default_id()
        if not self.inputs:
            self.inputs = self.default_inputs()
        if not self.subprocesses:
            self.subprocesses = self.default_subprocesses()
        if not self.generated:
            self.generated = self.default_generated()

        super().__attrs_post_init__()
Beispiel #16
0
class Project(ReferenceMixin):
    """Represent a project."""

    name = jsonld.ib(default=None, context='schema:name')

    created = jsonld.ib(converter=parse_date, context='schema:dateCreated')

    updated = jsonld.ib(converter=parse_date, context='schema:dateUpdated')

    version = jsonld.ib(converter=str,
                        default=str(SUPPORTED_PROJECT_VERSION),
                        context='schema:schemaVersion')

    client = attr.ib(default=None, kw_only=True)

    creator = jsonld.ib(default=None,
                        kw_only=True,
                        context={
                            '@id': 'schema:creator',
                        },
                        type=Person)

    _id = jsonld.ib(context='@id', kw_only=True, default=None)

    @created.default
    @updated.default
    def _now(self):
        """Return the current timezone-aware UTC time for datetime fields."""
        return datetime.datetime.now(datetime.timezone.utc)

    def __attrs_post_init__(self):
        """Initialize computed attributes.

        Determines the creator from the client when none is set, then
        computes ``_id`` from ``project_id``.

        Raises:
            ValueError: when ``project_id`` cannot be computed and there
                is neither an existing ``_id`` nor a project set on the
                client to fall back to.
        """
        if not self.creator and self.client:
            if self.client.renku_metadata_path.exists():
                self.creator = Person.from_commit(
                    self.client.find_previous_commit(
                        self.client.renku_metadata_path, return_first=True), )
            else:
                # this assumes the project is being newly created
                self.creator = Person.from_git(self.client.repo)

        try:
            self._id = self.project_id
        except ValueError:
            # Fallback to old behaviour: keep an id that is already set,
            # otherwise reuse the id of the client's project.
            if not self._id:
                if self.client and self.client.is_project_set():
                    self._id = self.client.project._id
                else:
                    raise

    @property
    def project_id(self):
        """Return the id for the project based on the repo origin remote.

        The id is an ``https`` URL made of host, ``PROJECT_URL_PATH``,
        owner and the percent-quoted project name.

        Raises:
            ValueError: when the creator or the project name is not set.
        """
        # ``posixpath`` is imported directly instead of being reached via
        # ``pathlib.posixpath``, which is an undocumented implementation
        # detail of ``pathlib``. ``urllib.parse`` is imported explicitly
        # because ``import urllib`` alone does not load the submodule.
        import posixpath
        import urllib.parse

        # Determine the hostname for the resource URIs.
        # If RENKU_DOMAIN is set, it overrides the host from remote.
        # Default is localhost.
        host = 'localhost'

        if not self.creator:
            raise ValueError('Project Creator not set')

        owner = self.creator.email.split('@')[0]
        name = self.name

        if self.client:
            # Values from the remote take precedence when present.
            remote = self.client.remote
            host = remote.get('host') or host
            owner = remote.get('owner') or owner
            name = remote.get('name') or name
        host = os.environ.get('RENKU_DOMAIN') or host

        if not name:
            raise ValueError('Project name not set')
        name = urllib.parse.quote(name, safe='')

        return urllib.parse.urljoin(
            'https://{host}'.format(host=host),
            posixpath.join(PROJECT_URL_PATH, owner, name))

    @classmethod
    def from_yaml(cls, path, client=None):
        """Return an instance from a YAML file.

        The instance remembers ``path`` as its ``__reference__``.
        """
        data = jsonld.read_yaml(path)

        self = cls.from_jsonld(data=data, client=client)
        self.__reference__ = path

        return self

    @classmethod
    def from_jsonld(cls, data, client=None):
        """Create an instance from JSON-LD data.

        An existing instance is returned unchanged.

        Raises:
            ValueError: when ``data`` is neither an instance nor a dict.
        """
        if isinstance(data, cls):
            return data
        if not isinstance(data, dict):
            raise ValueError(data)

        return ProjectSchema(client=client).load(data)

    def to_yaml(self):
        """Write an instance to the referenced YAML file."""
        data = ProjectSchema().dump(self)
        jsonld.write_yaml(path=self.__reference__, data=data)

    def as_jsonld(self):
        """Create JSON-LD."""
        return ProjectSchema().dump(self)
Beispiel #17
0
class Run(CommitMixin):
    """Represents a `renku run` execution template."""

    command = jsonld.ib(
        default=None,
        context={
            '@id': 'renku:command',
            '@type': 'http://www.w3.org/2001/XMLSchema#string',
        },
        type=str,
        kw_only=True,
    )

    process_order = jsonld.ib(
        default=None,
        context={
            '@id': 'renku:processOrder',
            '@type': 'http://www.w3.org/2001/XMLSchema#integer',
        },
        type=int,
        kw_only=True,
    )

    successcodes = jsonld.container.list(context='renku:successCodes',
                                         kw_only=True,
                                         type=int)

    subprocesses = jsonld.container.list('renku.core.models.workflow.run.Run',
                                         context='renku:hasSubprocess',
                                         kw_only=True)

    arguments = jsonld.container.list(context='renku:hasArguments',
                                      kw_only=True,
                                      type=CommandArgument)

    inputs = jsonld.container.list(context='renku:hasInputs',
                                   kw_only=True,
                                   type=CommandInput)

    outputs = jsonld.container.list(context='renku:hasOutputs',
                                    kw_only=True,
                                    type=CommandOutput)

    @classmethod
    def from_factory(cls, factory, client, commit, path):
        """Creates a ``Run`` from a ``CommandLineToolFactory``.

        Note that inputs which turn out to be outputs are removed from
        ``factory.inputs`` in place.
        """
        inputs = []
        arguments = []
        outputs = [
            _convert_cmd_output(o, factory, client, commit)
            for o in factory.outputs
        ]  # TODO: handle stream!

        if outputs:
            # Each converted output is a pair (output, input to remove).
            outputs, inputs_to_remove = zip(*outputs)
            outputs = list(outputs)

            for i in inputs_to_remove:
                # remove inputs that are actually outputs
                # note: a single input can represent multiple outputs
                # in case of repetition in the cli
                if i and i in factory.inputs:
                    factory.inputs.remove(i)

        for i in factory.inputs:
            res = _convert_cmd_input(i, client, commit)

            # Anything that is not a ``CommandInput`` is a plain argument.
            if isinstance(res, CommandInput):
                inputs.append(res)
            else:
                arguments.append(res)

        return cls(
            client=client,
            commit=commit,
            path=path,
            command=' '.join(factory.baseCommand),
            successcodes=factory.successCodes,
            arguments=[_convert_cmd_binding(a)
                       for a in factory.arguments] + arguments,
            inputs=inputs,
            outputs=outputs)

    @property
    def activity(self):
        """Return the activity object."""
        # NOTE(review): ``_activity`` is not declared on this class in the
        # visible code; confirm where it is set.
        return self._activity()

    def to_argv(self):
        """Convert run into argv list.

        The command is split on single spaces; parameters with a truthy
        ``position`` follow, sorted by that position.
        """
        argv = []

        if self.command:
            argv.extend(self.command.split(' '))

        parameters = self.inputs + self.outputs + self.arguments

        # Parameters without a position (``None`` or ``0``) are left out.
        positioned = sorted(
            (p for p in parameters if p.position),
            key=lambda p: p.position,
        )
        argv.extend(e for a in positioned for e in a.to_argv())

        return argv

    def to_stream_repr(self):
        """Input/output stream representation.

        Returns a list with the representation of every input and output
        that is mapped to a stream, inputs first.
        """
        stream_repr = [
            input_.to_stream_repr()
            for input_ in self.inputs if input_.mapped_to
        ]
        stream_repr.extend(
            output.to_stream_repr()
            for output in self.outputs if output.mapped_to
        )
        return stream_repr

    def update_id_and_label_from_commit_path(self, client, commit, path):
        """Updates the _id and _label using supplied commit and path.

        The client is always replaced. Commit, path, id and label are only
        updated when no commit is set yet. The call is then repeated for
        every subprocess.
        """
        self.client = client

        if not self.commit:
            self.commit = commit

            # Store the path relative to the client path.
            path = Path(os.path.abspath(path)).relative_to(self.client.path)
            self.path = path
            self._id = self.default_id()
            self._label = self.default_label()

        for s in self.subprocesses:
            s.update_id_and_label_from_commit_path(client, commit, path)

    def add_subprocess(self, subprocess, process_order=None):
        """Adds a subprocess to this run.

        Without a ``process_order`` the position is derived from the
        dependency ordering defined by ``__lt__``. Inputs and outputs of
        the subprocess are merged into this run's inputs and outputs.

        Raises:
            ValueError: when ``process_order`` is already taken.
        """
        if not process_order:
            process_order = 0
            if self.subprocesses:
                # sort subprocesses by dependencies
                process_order = bisect(self.subprocesses, subprocess)
                if process_order < len(self.subprocesses):
                    # inserted before end, recalculate orders or rest
                    for s in self.subprocesses:
                        if s.process_order >= process_order:
                            s.process_order += 1

        if any(s.process_order == process_order for s in self.subprocesses):
            raise ValueError(
                'process_order {} already exists'.format(process_order))

        subprocess.process_order = process_order

        input_paths = [i.consumes.path for i in self.inputs]
        output_paths = [o.produces.path for o in self.outputs]

        for input_ in subprocess.inputs:
            if (input_.consumes.path not in input_paths
                    and input_.consumes.path not in output_paths):
                new_input = copy(input_)
                new_input.mapped_to = None

                matching_output = next(
                    (o for o in self.outputs
                     if o.produces.path == new_input.consumes.path), None)

                if not matching_output:
                    self.inputs.append(new_input)
                    input_paths.append(new_input.consumes.path)

        for output in subprocess.outputs:
            if output.produces.path not in output_paths:
                new_output = copy(output)
                new_output.mapped_to = None
                self.outputs.append(new_output)
                output_paths.append(new_output.produces.path)

                # A path produced inside the run is no longer an input.
                matching_input = next(
                    (i for i in self.inputs
                     if i.consumes.path == new_output.produces.path), None)
                if matching_input:
                    self.inputs.remove(matching_input)
                    input_paths.remove(matching_input.consumes.path)

        self.subprocesses.append(subprocess)

        self.subprocesses = sorted(self.subprocesses,
                                   key=lambda s: s.process_order)

    def __lt__(self, other):
        """Compares two subprocesses order based on their dependencies.

        Returns ``True`` when any path produced by this run is consumed by
        ``other``; nested entities are taken into account.
        """
        consumed_by_other = set()
        produced_by_self = set()

        for i in other.inputs:
            for subentity in i.consumes.entities:
                consumed_by_other.add(subentity.path)

        for o in self.outputs:
            for subentity in o.produces.entities:
                produced_by_self.add(subentity.path)

        # Rich comparisons should return a boolean, not the intersection.
        return bool(consumed_by_other & produced_by_self)
class Activity(CommitMixin):
    """Represent an activity in the repository.

    An activity is bound to ``self.commit``. The entities it generated and
    invalidated are derived from that commit's diff against its parents
    whenever they are not passed in explicitly (see
    ``__attrs_post_init__``).
    """

    _id = jsonld.ib(default=None, context='@id', kw_only=True)
    _message = jsonld.ib(context='rdfs:comment', kw_only=True)
    _was_informed_by = jsonld.ib(
        context='prov:wasInformedBy',
        kw_only=True,
    )

    part_of = attr.ib(default=None, kw_only=True)

    # Cache of ``Collection`` objects keyed by directory path; it is filled
    # by ``_get_activity_entity`` and exposed through ``influenced``.
    _collections = attr.ib(
        default=attr.Factory(OrderedDict), init=False, kw_only=True
    )
    generated = jsonld.container.list(
        Generation, context={
            '@reverse': 'prov:activity',
        }, kw_only=True
    )

    invalidated = jsonld.container.list(
        Entity, context={
            '@reverse': 'prov:wasInvalidatedBy',
        }, kw_only=True
    )

    influenced = jsonld.ib(
        context='prov:influenced',
        kw_only=True,
    )

    started_at_time = jsonld.ib(
        context={
            '@id': 'prov:startedAtTime',
            '@type': 'http://www.w3.org/2001/XMLSchema#dateTime',
        },
        kw_only=True,
    )

    ended_at_time = jsonld.ib(
        context={
            '@id': 'prov:endedAtTime',
            '@type': 'http://www.w3.org/2001/XMLSchema#dateTime',
        },
        kw_only=True,
    )

    agent = jsonld.ib(
        context='prov:wasAssociatedWith',
        kw_only=True,
        default=renku_agent,
        type='renku.core.models.provenance.agents.SoftwareAgent'
    )
    person_agent = jsonld.ib(
        context='prov:wasAssociatedWith',
        kw_only=True,
        type='renku.core.models.provenance.agents.Person'
    )

    def default_generated(self):
        """Create default ``generated``.

        :returns: one ``Generation`` (with ``role=None``) per path returned
            by ``get_output_paths``.
        """
        generated = []

        for path in self.get_output_paths():
            entity = self._get_activity_entity(path)

            generated.append(
                Generation(activity=self, entity=entity, role=None)
            )
        return generated

    def get_output_paths(self):
        """Gets all output paths generated by this run.

        Falls back to the HEAD commit of the client's repository when the
        activity has no commit; returns an empty set when it has neither a
        commit nor a client.

        :returns: a set of path strings.
        """
        index = set()

        commit = self.commit

        if not self.commit:
            if not self.client:
                return index
            commit = self.client.repo.head.commit

        for file_ in commit.diff(commit.parents or NULL_TREE):
            # ignore deleted files (note they appear as ADDED)
            # in this backwards diff
            if file_.change_type == 'A':
                continue
            path_ = Path(file_.a_path)

            is_dataset = self.client.DATASETS in str(path_)
            not_refs = LinkReference.REFS not in str(path_)
            does_not_exists = not path_.exists()

            # A dataset path that is missing on disk is replaced by the
            # metadata path built from the UUID-named parent directory.
            if all([is_dataset, not_refs, does_not_exists]):
                uid = uuid.UUID(path_.parent.name)
                path_ = (
                    Path(self.client.renku_home) / self.client.DATASETS /
                    str(uid) / self.client.METADATA
                )

            index.add(str(path_))

        return index

    def _get_activity_entity(self, path, deleted=False):
        """Gets the entity associated with this Activity and path.

        Builds (or reuses from ``self._collections``) a chain of
        ``Collection`` objects for the parent directories of ``path`` and
        attaches the resulting entity to the innermost one.

        :param path: path of the entity, resolved through
            ``client.resolve_in_submodules``.
        :param deleted: when true, a path under the datasets directory is
            not loaded as a dataset but represented as a plain entity.
        :returns: the entity (or dataset) for ``path``.
        """
        client, commit, path = self.client.resolve_in_submodules(
            self.commit,
            path,
        )
        output_path = client.path / path
        parents = list(output_path.relative_to(client.path).parents)

        collection = None
        members = []
        # ``parents[:-1]`` drops the last element of the parents list;
        # reversing walks from the outermost directory inwards.
        for parent in reversed(parents[:-1]):
            if str(parent) in self._collections:
                collection = self._collections[str(parent)]
            else:
                collection = Collection(
                    client=client,
                    commit=commit,
                    path=str(parent),
                    members=[],
                    parent=collection,
                )
                members.append(collection)
                self._collections[str(parent)] = collection

            members = collection.members

        entity_cls = Entity
        if (self.client.path / path).is_dir():
            entity_cls = Collection

        # TODO: use a factory method to generate the entity
        if str(path).startswith(
            os.path.join(client.renku_home, client.DATASETS)
        ) and not deleted:
            entity = client.load_dataset_from_path(path, commit=commit)
        else:
            entity = entity_cls(
                commit=commit,
                client=client,
                path=str(path),
                parent=collection,
            )

        if collection:
            collection.members.append(entity)

        return entity

    def default_invalidated(self):
        """Entities invalidated by this Action.

        :returns: one entity per path in ``removed_paths``.
        """
        results = []
        for path in self.removed_paths:
            entity = self._get_activity_entity(path, deleted=True)

            results.append(entity)
        return results

    @influenced.default
    def default_influenced(self):
        """Calculate default values."""
        return list(self._collections.values())

    @property
    def parents(self):
        """Return parent commits.

        Always returns a list; it is empty when the activity has no commit,
        so callers can concatenate the result safely.
        """
        if self.commit:
            return list(self.commit.parents)
        return []

    @property
    def removed_paths(self):
        """Return all paths removed in the commit."""
        index = set()
        if not self.commit:
            return index

        for file_ in self.commit.diff(self.commit.parents or NULL_TREE):
            # only process deleted files (note they appear as ADDED)
            # in this backwards diff
            if file_.change_type != 'A':
                continue
            path_ = Path(file_.a_path)

            index.add(str(path_))

        return index

    @property
    def paths(self):
        """Return all paths in the commit.

        Same walk as ``get_output_paths`` but always uses ``self.commit``
        and treats a dangling symlink as an existing path.
        """
        index = set()

        for file_ in self.commit.diff(self.commit.parents or NULL_TREE):
            # ignore deleted files (note they appear as ADDED)
            # in this backwards diff
            if file_.change_type == 'A':
                continue
            path_ = Path(file_.a_path)

            is_dataset = self.client.DATASETS in str(path_)
            not_refs = LinkReference.REFS not in str(path_)
            does_not_exists = not (
                path_.exists() or
                (path_.is_symlink() and os.path.lexists(path_))
            )

            if all([is_dataset, not_refs, does_not_exists]):
                uid = uuid.UUID(path_.parent.name)
                path_ = (
                    Path(self.client.renku_home) / self.client.DATASETS /
                    str(uid) / self.client.METADATA
                )

            index.add(str(path_))

        return index

    @classmethod
    def generate_id(cls, commitsha):
        """Calculate action ID.

        The host is ``RENKU_DOMAIN`` from the environment when set,
        otherwise the host of ``cls.client.remote`` when the class exposes
        a ``client``, otherwise ``localhost``.

        :param commitsha: commit identifier formatted into the URL path.
        :returns: ``https://<host>/activities/commit/<commitsha>``.
        """
        host = 'localhost'
        if hasattr(cls, 'client'):
            host = cls.client.remote.get('host') or host
        # Fall back to the host computed above rather than resetting it to
        # 'localhost', which discarded the remote host.
        host = os.environ.get('RENKU_DOMAIN') or host

        # always set the id by the identifier
        return urllib.parse.urljoin(
            'https://{host}'.format(host=host),
            posixpath.join(
                '/activities', 'commit/{commit}'.format(commit=commitsha)
            )
        )

    def default_id(self):
        """Configure calculated ID."""
        if self.commit:
            return self.generate_id(self.commit.hexsha)
        # NOTE(review): spelled 'UNCOMMITED' here while other code in this
        # file uses 'UNCOMMITTED'; kept as-is because the value becomes
        # part of the generated identifier.
        return self.generate_id('UNCOMMITED')

    @_message.default
    def default_message(self):
        """Generate a default message."""
        if self.commit:
            return self.commit.message

    @_was_informed_by.default
    def default_was_informed_by(self):
        """List parent actions."""
        if self.commit:
            return [{
                '@id': self.generate_id(parent),
            } for parent in self.commit.parents]

    @started_at_time.default
    def default_started_at_time(self):
        """Configure calculated properties."""
        if self.commit:
            return self.commit.authored_datetime.isoformat()

    @ended_at_time.default
    def default_ended_at_time(self):
        """Configure calculated properties."""
        if self.commit:
            return self.commit.committed_datetime.isoformat()

    @person_agent.default
    def default_person_agent(self):
        """Set person agent to be the author of the commit."""
        if self.commit:
            return Person.from_commit(self.commit)
        return None

    @property
    def nodes(self):
        """Return topologically sorted nodes.

        Yields the nodes of every generated and invalidated item first and
        the collected parent collections last, in reverse insertion order.
        """
        collections = OrderedDict()

        def _parents(node):
            # Yield the ancestors of ``node``, outermost first.
            if node.parent:
                yield from _parents(node.parent)
                yield node.parent

        for output in self.generated:
            for parent in _parents(output.entity):
                collections[parent.path] = parent

            yield from _nodes(output)

        for removed in self.invalidated:
            for parent in _parents(removed):
                collections[parent.path] = parent

            yield from _nodes(removed)

        yield from reversed(collections.values())

    def __attrs_post_init__(self):
        """Sets ``generated`` and ``invalidated`` default values if needed."""
        super().__attrs_post_init__()
        if not self._id:
            self._id = self.default_id()
        if not self.generated:
            self.generated = self.default_generated()

        for g in self.generated:
            _set_entity_client_commit(g.entity, self.client, self.commit)

        if not self.invalidated:
            self.invalidated = self.default_invalidated()

        # Point every generation back at this activity through a weak
        # reference.
        if self.generated:
            for g in self.generated:
                g._activity = weakref.ref(self)
class ProcessRun(Activity):
    """A process run is a particular execution of a Process description."""

    __association_cls__ = Run

    generated = jsonld.container.list(
        Generation,
        context={
            '@reverse': 'prov:activity',
        },
        kw_only=True,
        default=None
    )

    association = jsonld.ib(
        context='prov:qualifiedAssociation',
        default=None,
        kw_only=True,
        type=Association
    )

    annotations = jsonld.container.list(
        context={
            '@reverse': 'oa:hasTarget',
        }, kw_only=True, type=Annotation
    )

    qualified_usage = jsonld.container.list(
        Usage, context='prov:qualifiedUsage', kw_only=True, default=None
    )

    def __attrs_post_init__(self):
        """Calculate properties.

        Fills in the commit, annotations, the plan's client/commit and the
        client/commit of every qualified usage when they are missing.
        """
        super().__attrs_post_init__()

        if not self.commit and self.client:
            self.commit = self.client.find_previous_commit(self.path)

        if not self.annotations:
            self.annotations = self.plugin_annotations()

        if self.association:
            self.association.plan._activity = weakref.ref(self)
            plan = self.association.plan
            if not plan.commit:
                if self.client:
                    plan.client = self.client
                if self.commit:
                    plan.commit = self.commit

                if plan.inputs:
                    for i in plan.inputs:
                        _set_entity_client_commit(
                            i.consumes, self.client, self.commit
                        )
                if plan.outputs:
                    for o in plan.outputs:
                        _set_entity_client_commit(
                            o.produces, self.client, self.commit
                        )

        if self.qualified_usage and self.client and self.commit:
            usages = []
            revision = '{0}'.format(self.commit)
            for usage in self.qualified_usage:
                if not usage.commit and '@UNCOMMITTED' in usage._label:
                    # Usages recorded before the commit existed are rebuilt
                    # at this run's commit.
                    usages.append(
                        Usage.from_revision(
                            client=self.client,
                            path=usage.path,
                            role=usage.role,
                            revision=revision,
                            id=usage._id,
                        )
                    )
                else:
                    if not usage.client:
                        usage.entity.set_client(self.client)
                    if not usage.commit:
                        # Use a separate name: reassigning ``revision``
                        # here would leak the label's revision into the
                        # uncommitted branch for later usages.
                        label_revision = usage._label.rsplit('@')[1]
                        usage.entity.commit = self.client.repo.commit(
                            label_revision
                        )

                    usages.append(usage)
            self.qualified_usage = usages

    def default_generated(self):
        """Create default ``generated``.

        :returns: one ``Generation`` per output of the associated plan, or
            an empty list when there is no association or plan.
        """
        generated = []

        if not self.association or not self.association.plan:
            return generated

        for output in self.association.plan.outputs:
            entity = Entity.from_revision(
                self.client,
                output.produces.path,
                revision=self.commit,
                parent=output.produces.parent
            )

            generation = Generation(
                activity=self, role=output.sanitized_id, entity=entity
            )
            generated.append(generation)
        return generated

    def add_annotations(self, annotations):
        """Adds annotations from an external tool."""
        self.annotations.extend(annotations)

    def plugin_annotations(self):
        """Adds ``Annotation``s from plugins to a ``ProcessRun``.

        :returns: a flat list built from the results of the
            ``process_run_annotations`` plugin hook.
        """
        from renku.core.plugins.pluginmanager import get_plugin_manager
        pm = get_plugin_manager()

        results = pm.hook.process_run_annotations(run=self)
        return [a for r in results for a in r]

    @classmethod
    def from_run(
        cls,
        run,
        client,
        path,
        commit=None,
        is_subprocess=False,
        update_commits=False
    ):
        """Convert a ``Run`` to a ``ProcessRun``.

        :param run: the plan whose inputs and outputs are converted.
        :param client: client used to resolve commits and entities.
        :param path: path stored on the created process run.
        :param commit: commit of the run; defaults to the repository HEAD.
        :param is_subprocess: when true, the id gets a
            ``/steps/step_<process_order>`` suffix.
        :param update_commits: when true, each input entity is re-resolved
            at the previous commit that touched its path.
        :returns: the new ``ProcessRun``.
        """
        from .agents import SoftwareAgent

        if not commit:
            commit = client.repo.head.commit

        usages = []

        id_ = ProcessRun.generate_id(commit)

        if is_subprocess:
            id_ = '{}/steps/step_{}'.format(id_, run.process_order)

        for input_ in run.inputs:
            usage_id = id_ + '/inputs/' + input_.sanitized_id
            revision = commit
            input_path = input_.consumes.path
            entity = input_.consumes
            if update_commits:
                revision = client.find_previous_commit(
                    input_path, revision=commit.hexsha
                )
                entity = Entity.from_revision(client, input_path, revision)

            dependency = Usage(
                entity=entity, role=input_.sanitized_id, id=usage_id
            )

            usages.append(dependency)

        agent = SoftwareAgent.from_commit(commit)
        association = Association(
            agent=agent, id=id_ + '/association', plan=run
        )

        process_run = cls(
            id=id_,
            qualified_usage=usages,
            association=association,
            client=client,
            commit=commit,
            path=path
        )

        generated = []

        for output in run.outputs:
            entity = Entity.from_revision(
                client,
                output.produces.path,
                revision=commit,
                parent=output.produces.parent
            )

            generation = Generation(
                activity=process_run, role=output.sanitized_id, entity=entity
            )
            generated.append(generation)

        process_run.generated = generated

        # NOTE(review): the returned annotations are not stored here;
        # confirm whether the call is only wanted for its hook side effects.
        process_run.plugin_annotations()
        return process_run

    @property
    def parents(self):
        """Return parent commits.

        Commits of every entity used by this run, followed by the parent
        commits reported by the base class (if any).
        """
        return [
            member.commit for usage in self.qualified_usage
            for member in usage.entity.entities
        ] + (super().parents or [])

    @property
    def nodes(self):
        """Return topologically sorted nodes."""
        # Outputs go first
        yield from super().nodes

        # Activity itself
        yield self.association.plan
Beispiel #20
0
class Person:
    """Represent a person."""

    name = jsonld.ib(context='schema:name',
                     kw_only=True,
                     validator=instance_of(str))
    email = jsonld.ib(context='schema:email', default=None, kw_only=True)
    label = jsonld.ib(context='rdfs:label', kw_only=True)
    affiliation = jsonld.ib(default=None,
                            kw_only=True,
                            context='schema:affiliation')
    alternate_name = jsonld.ib(default=None,
                               kw_only=True,
                               context='schema:alternateName')
    _id = jsonld.ib(context='@id', kw_only=True)

    @_id.default
    def default_id(self):
        """Set the default id.

        :returns: ``mailto:<email>`` when an email is set, otherwise a
            ``_:``-prefixed id derived from the name.
        """
        import string
        if self.email:
            return 'mailto:{email}'.format(email=self.email)

        # prep name to be a valid ntuple string
        name = self.name.translate(str.maketrans('', '', string.punctuation))
        name = ''.join(filter(lambda x: x in string.printable, name))
        return '_:{}'.format(''.join(name.lower().split()))

    @email.validator
    def check_email(self, attribute, value):
        """Check that the email is valid.

        :raises ValueError: when an email is set and ``value`` is not a
            string of the form ``<local>@<domain>.<tld>``.
        """
        if self.email and not (isinstance(value, str)
                               and re.match(r'[^@]+@[^@]+\.[^@]+', value)):
            raise ValueError('Email address is invalid.')

    @label.default
    def default_label(self):
        """Set the default label."""
        return self.name

    @classmethod
    def from_commit(cls, commit):
        """Create an instance from a Git commit."""
        return cls(
            name=commit.author.name,
            email=commit.author.email,
        )

    @property
    def short_name(self):
        """Gives full name in short form.

        All names but the last are reduced to their initial, e.g.
        ``John Ronald Doe`` becomes ``J.R.Doe``. A name with at most one
        word (including an empty name) is returned unchanged.
        """
        names = self.name.split()
        # ``<= 1`` also covers an empty or whitespace-only name, for which
        # ``names[-1]`` below would raise ``IndexError``.
        if len(names) <= 1:
            return self.name

        last_name = names[-1]
        initials = [name[0] for name in names]
        initials.pop()

        return '{0}.{1}'.format('.'.join(initials), last_name)

    @property
    def full_identity(self):
        """Return name, email, and affiliation."""
        email = f' <{self.email}>' if self.email else ''
        affiliation = f' [{self.affiliation}]' if self.affiliation else ''
        return f'{self.name}{email}{affiliation}'

    @classmethod
    def from_git(cls, git):
        """Create an instance from a Git repo.

        :param git: repository object providing ``config_reader()``.
        :raises errors.ConfigurationError: when the ``user`` section or its
            options are missing from the configuration.
        :raises errors.MissingUsername: when the user name is empty.
        :raises errors.MissingEmail: when the user email is empty.
        """
        git_config = git.config_reader()
        try:
            name = git_config.get_value('user', 'name', None)
            email = git_config.get_value('user', 'email', None)
        except (configparser.NoOptionError,
                configparser.NoSectionError):  # pragma: no cover
            raise errors.ConfigurationError(
                'The user name and email are not configured. '
                'Please use the "git config" command to configure them.\n\n'
                '\tgit config --global --add user.name "John Doe"\n'
                '\tgit config --global --add user.email '
                '"*****@*****.**"\n')

        # Check the git configuration.
        if not name:  # pragma: no cover
            raise errors.MissingUsername()
        if not email:  # pragma: no cover
            raise errors.MissingEmail()

        return cls(name=name, email=email)

    @classmethod
    def from_string(cls, string):
        """Create an instance from a 'Name <email>' string.

        An optional ``[affiliation]`` suffix is parsed as well.
        """
        regex_pattern = r'([^<>\[\]]*)' \
            r'(?:<{1}\s*(\S+@\S+\.\S+){0,1}\s*>{1}){0,1}\s*' \
            r'(?:\[{1}(.*)\]{1}){0,1}'
        name, email, affiliation = re.search(regex_pattern, string).groups()
        if name:
            name = name.strip()
        if affiliation:
            affiliation = affiliation.strip()
        # Normalize an empty affiliation to ``None``.
        affiliation = affiliation or None

        return cls(name=name, email=email, affiliation=affiliation)

    @classmethod
    def from_dict(cls, obj):
        """Create and instance from a dictionary."""
        return cls(**obj)

    @classmethod
    def from_jsonld(cls, data):
        """Create an instance from JSON-LD data.

        :param data: an existing instance (returned as-is) or a dict.
        :raises ValueError: when ``data`` is neither.
        """
        if isinstance(data, cls):
            return data
        if not isinstance(data, dict):
            raise ValueError(data)

        return PersonSchema().load(data)

    def __attrs_post_init__(self):
        """Finish object initialization."""
        # handle the case where ids were improperly set
        if self._id == 'mailto:None' or self._id is None:
            self._id = self.default_id()

        if self.label is None:
            self.label = self.default_label()
Beispiel #21
0
class Project(object):
    """Represent a project."""

    name = jsonld.ib(default=None, context='schema:name')

    created = jsonld.ib(
        converter=parse_date,
        context='schema:dateCreated',
    )

    updated = jsonld.ib(
        converter=parse_date,
        context='schema:dateUpdated',
    )

    version = jsonld.ib(
        converter=str,
        default='2',
        context='schema:schemaVersion',
    )

    client = attr.ib(default=None, kw_only=True)

    creator = jsonld.ib(default=None,
                        kw_only=True,
                        context={
                            '@id': 'schema:creator',
                        },
                        type=Person)

    _id = jsonld.ib(context='@id', kw_only=True, default=None)

    @created.default
    @updated.default
    def _now(self):
        """Define default value for datetime fields."""
        return datetime.datetime.now(datetime.timezone.utc)

    def __attrs_post_init__(self):
        """Initialize computed attributes.

        Derives the creator from the client when none was given and always
        recomputes ``_id`` from ``project_id``.
        """
        if not self.creator and self.client:
            if self.client.renku_metadata_path.exists():
                self.creator = Person.from_commit(
                    self.client.find_previous_commit(
                        self.client.renku_metadata_path, return_first=True), )
            else:
                # this assumes the project is being newly created
                self.creator = Person.from_git(self.client.repo)

        self._id = self.project_id

    @property
    def project_id(self):
        """Return the id for the project based on the repo origin remote.

        :returns: ``https://<host>/<PROJECT_URL_PATH>/<owner>/<name>``,
            where missing owner or name parts are replaced by ``NULL``.
        """
        # Import the modules that are actually used: ``import urllib`` alone
        # does not guarantee that ``urllib.parse`` is loaded, and
        # ``pathlib.posixpath`` is an undocumented implementation detail.
        import posixpath
        import urllib.parse

        # Determine the hostname for the resource URIs.
        # If RENKU_DOMAIN is set, it overrides the host from remote.
        # Default is localhost.
        host = 'localhost'
        # The creator's email is optional; fall back to 'NULL' without it.
        creator_email = self.creator.email if self.creator else None
        owner = creator_email.split('@')[0] if creator_email else 'NULL'
        name = self.name

        if self.client:
            remote = self.client.remote
            host = self.client.remote.get('host') or host
            owner = remote.get('owner') or owner
            name = remote.get('name') or name
        host = os.environ.get('RENKU_DOMAIN') or host
        if name:
            name = urllib.parse.quote(name, safe='')
        project_url = urllib.parse.urljoin(
            'https://{host}'.format(host=host),
            posixpath.join(PROJECT_URL_PATH, owner, name or 'NULL'))
        return project_url
Beispiel #22
0
class CommitMixin:
    """Represent a commit mixin.

    Carries the ``commit``, ``client`` and ``path`` an object belongs to and
    derives its ``_id`` and ``_label`` from them.
    """

    commit = attr.ib(default=None, kw_only=True)
    client = attr.ib(default=None, kw_only=True)
    path = jsonld.ib(
        context='prov:atLocation',
        default=None,
        kw_only=True,
        converter=_str_or_none
    )

    _id = jsonld.ib(default=None, context='@id', kw_only=True)
    _label = jsonld.ib(context='rdfs:label', kw_only=True)
    _project = jsonld.ib(
        context='schema:isPartOf', type=Project, kw_only=True, default=None
    )

    @property
    def submodules(self):
        """Proxy to client submodules."""
        return self.client.submodules

    def default_id(self):
        """Configure calculated ID.

        :returns: ``https://<host>/blob/<hexsha>/<path>``, where ``hexsha``
            is ``UNCOMMITTED`` when there is no commit.
        """
        if self.commit:
            hexsha = self.commit.hexsha
        else:
            hexsha = 'UNCOMMITTED'

        # Determine the hostname for the resource URIs.
        # If RENKU_DOMAIN is set, it overrides the host from remote.
        # Default is localhost.
        host = 'localhost'
        if self.client:
            host = self.client.remote.get('host') or host
        host = os.environ.get('RENKU_DOMAIN') or host

        # always set the id by the identifier
        # The path is passed to ``urljoin`` directly: joining a single
        # component is a no-op, and ``pathlib.posixpath`` is an undocumented
        # attribute of ``pathlib``.
        return urllib.parse.urljoin(
            'https://{host}'.format(host=host),
            '/blob/{hexsha}/{path}'.format(hexsha=hexsha, path=self.path)
        )

    @_label.default
    def default_label(self):
        """Generate a default label.

        :returns: ``<path>@<hexsha>`` when a path is set, otherwise just
            ``<hexsha>``.
        """
        if self.commit:
            hexsha = self.commit.hexsha
        else:
            hexsha = 'UNCOMMITTED'
        if self.path:
            return '{self.path}@{hexsha}'.format(hexsha=hexsha, self=self)
        return '{hexsha}'.format(hexsha=hexsha, self=self)

    def __attrs_post_init__(self):
        """Post-init hook.

        Makes an absolute ``path`` relative to ``client.path``, binds the
        project of the client and computes ``_id`` when it is missing.
        """
        if self.path:
            path = pathlib.Path(self.path)
            if path.is_absolute():
                # NOTE(review): needs ``self.client``; an absolute path
                # without a client fails here.
                self.path = str(path.relative_to(self.client.path))

        # always force "project" to be the current project
        if self.client:
            self._project = self.client.project

        if not self._id:
            self._id = self.default_id()
class ProcessRun(Activity):
    """A single execution of a ``Process`` description.

    Inputs, qualified usages, the association and annotations are filled in
    after initialization when they were not supplied.
    """

    __association_cls__ = Process

    inputs = attr.ib(default=None, kw_only=True)
    outputs = attr.ib(default=None, kw_only=True)

    generated = jsonld.container.list(
        Generation,
        context={'@reverse': 'prov:activity'},
        kw_only=True,
        default=None,
    )

    association = jsonld.ib(
        context='prov:qualifiedAssociation',
        default=None,
        kw_only=True,
        type=Association,
    )

    annotations = jsonld.container.list(
        context={'@reverse': 'oa:hasTarget'},
        kw_only=True,
        type=Annotation,
    )

    qualified_usage = jsonld.ib(
        default=None,
        context='prov:qualifiedUsage',
        kw_only=True,
        type=Usage,
    )

    def __attrs_post_init__(self):
        """Fill in every computed attribute that is still unset."""
        super().__attrs_post_init__()

        if not self.inputs:
            self.inputs = self.default_inputs()

        if not self.qualified_usage:
            self.qualified_usage = self.default_qualified_usage()

        if self.association is None:
            self.association = Association.from_activity(self)

        if not self.annotations:
            # Start from the annotations of the process, when it has any,
            # then add whatever the plugins contribute.
            inherited = getattr(self.process, 'annotations', None)
            if inherited:
                self.annotations = inherited

            self.annotations.extend(self.plugin_annotations())

        if self.path is not None:
            return

        # FIXME only works for linking directory to file
        known_ids = set(self.outputs.values())
        for output_id, output_path in self.iter_output_files():
            if output_id in known_ids:
                continue

            # Only the first unknown output is registered, below the first
            # input whose role is ``input_directory``.
            directory = next(
                path for path, usage in self.inputs.items()
                if usage.role == 'input_directory'
            )
            self.outputs[os.path.join(directory, output_path)] = output_id
            break

    def plugin_annotations(self):
        """Collect the ``Annotation``s that plugins provide for this run."""
        from renku.core.plugins.pluginmanager import get_plugin_manager

        hook_results = get_plugin_manager().hook.process_run_annotations(
            run=self
        )

        collected = []
        for batch in hook_results:
            collected.extend(batch)
        return collected

    def default_inputs(self):
        """Derive the inputs from the input files of the process.

        :returns: dict mapping each input path to its ``Usage``, resolved at
            the parent revision (``<commit>^``); inputs for which a
            ``KeyError`` is raised are left out.
        """
        base_directory = os.path.dirname(self.path)
        parent_revision = '{0}^'.format(self.commit)
        repo_client = self.client

        usages = {}
        for input_id, input_path in self.process.iter_input_files(
            base_directory
        ):
            try:
                usages[input_path] = Usage.from_revision(
                    client=repo_client,
                    path=input_path,
                    role=input_id,
                    revision=parent_revision,
                    id=self._id + '/inputs/' + input_id,
                )
            except KeyError:
                continue

        return usages

    def default_qualified_usage(self):
        """Return the usages of all inputs as a list."""
        return [usage for usage in self.inputs.values()]

    def iter_output_files(self, commit=None):
        """Yield ``(output id, path)`` pairs for the outputs of the process.

        Stream outputs yield the stream target of the process when it is
        set. Path outputs yield their glob, or the default of the input the
        glob refers to through ``$(inputs.<id>)``.
        """
        proc = self.process
        reference_prefix = '$(inputs.'

        for spec in proc.outputs:
            if spec.type in {'stdout', 'stderr'}:
                target = getattr(proc, spec.type)
                if target:
                    yield spec.id, target
                continue

            if spec.type not in PATH_OBJECTS:
                continue

            pattern = spec.outputBinding.glob
            # TODO better support for Expression
            if not pattern.startswith(reference_prefix):
                yield spec.id, pattern
                continue

            wanted_id = pattern[len(reference_prefix):-1]
            for candidate in proc.inputs:
                if candidate.id != wanted_id:
                    continue
                yield spec.id, candidate.default
                break  # only the first matching input is used

    def default_outputs(self):
        """Map each output path to its id; empty when there is no path."""
        if self.path is None:
            return {}

        mapping = {}
        for output_id, output_path in self.iter_output_files():
            mapping[output_path] = output_id
        return mapping

    @property
    def parents(self):
        """Return the commits of all used entities plus the base parents."""
        used_commits = []
        for usage in self.qualified_usage:
            used_commits.extend(
                member.commit for member in usage.entity.entities
            )
        return used_commits + super().parents

    @property
    def nodes(self):
        """Return topologically sorted nodes."""
        # The nodes of the base class come first ...
        yield from super().nodes

        # ... followed by the plan of the association.
        yield self.association.plan