class Generation(EntityProxyMixin):
    """Link a generated entity to the activity that produced it."""

    entity = jsonld.ib(
        context={
            '@reverse': 'prov:qualifiedGeneration',
        },
        type=[
            'renku.core.models.entities.Entity',
            'renku.core.models.entities.Collection',
            Dataset,
            DatasetFile,
        ],
    )

    role = jsonld.ib(context='prov:hadRole', default=None)

    # The activity is held through a weak reference (or ``None``).
    _activity = attr.ib(
        default=None,
        kw_only=True,
        converter=lambda target: (
            weakref.ref(target) if target is not None else None
        ),
    )
    _id = jsonld.ib(context='@id', kw_only=True)

    @property
    def activity(self):
        """Dereference the stored activity, or ``None`` when unset."""
        if self._activity is None:
            return None
        return self._activity()

    @_id.default
    def default_id(self):
        """Build the identifier from the activity id and role or path."""
        if self.role:
            template = '{self.activity._id}/outputs/{self.role}'
        else:
            template = '{self.activity._id}/tree/{self.entity.path}'
        return template.format(self=self)
class MappedIOStream(object):
    """Describe an IO stream (stdin, stdout or stderr) mapping."""

    _id = jsonld.ib(kw_only=True, context='@id')
    _label = jsonld.ib(kw_only=True, context='rdfs:label', default=None)

    STREAMS = ['stdin', 'stdout', 'stderr']

    stream_type = jsonld.ib(
        kw_only=True,
        type=str,
        context={
            '@id': 'renku:streamType',
            '@type': 'http://www.w3.org/2001/XMLSchema#string',
        },
    )

    @_id.default
    def default_id(self):
        """Generate a blank-node id made of alphanumeric characters only."""
        # TODO: make bnode ids nicer once this issue is in a release:
        # https://github.com/RDFLib/rdflib/issues/888
        # right now it's limited to a-zA-Z0-9 (-_ will work once it's fixed)
        unique = str(uuid.uuid4())
        identifier = '_:MappedIOStream-{}'.format(unique)
        return identifier.replace('-', '')

    def default_label(self):
        """Build the human readable label from the stream type."""
        template = 'Stream mapping for stream "{}"'
        return template.format(self.stream_type)

    def __attrs_post_init__(self):
        """Fill in the label when none was supplied."""
        if self._label:
            return
        self._label = self.default_label()
class Association:
    """Tie an activity to the plan it followed and the responsible agent."""

    plan = jsonld.ib(
        context='prov:hadPlan',
        type='renku.core.models.provenance.processes.Process',
    )
    agent = jsonld.ib(
        context='prov:agent',
        default=None,
        type='renku.core.models.provenance.agents.SoftwareAgent',
    )

    _id = jsonld.ib(context='@id', kw_only=True)

    @classmethod
    def from_activity(cls, activity, commit=None):
        """Build the association for ``activity``.

        The plan is instantiated from ``activity.__association_cls__`` using
        ``commit`` when given and the activity's own commit otherwise.
        """
        from .agents import SoftwareAgent

        software_agent = SoftwareAgent.from_commit(activity.commit)

        plan_cls = activity.__association_cls__
        plan = plan_cls(
            commit=commit or activity.commit,
            client=activity.client,
            path=activity.path,
            activity=activity,
        )

        # add plan and agent
        return cls(
            plan=plan,
            agent=software_agent,
            id=activity._id + '/association',
        )
class CommandParameter(object):
    """Base description of one parameter of an execution template."""

    _id = jsonld.ib(kw_only=True, context='@id', default=None)
    _label = jsonld.ib(kw_only=True, context='rdfs:label', default=None)

    position = jsonld.ib(
        kw_only=True,
        type=int,
        default=None,
        context={
            '@id': 'renku:position',
            '@type': 'http://www.w3.org/2001/XMLSchema#integer',
        },
    )

    prefix = jsonld.ib(
        kw_only=True,
        type=str,
        default=None,
        context={
            '@id': 'renku:prefix',
            '@type': 'http://www.w3.org/2001/XMLSchema#string',
        },
    )

    @property
    def sanitized_id(self):
        """Return the part of ``_id`` after the first ``:`` with ``-`` as ``_``.

        Intended for use in non-jsonld contexts.
        """
        pieces = self._id.split(':', 1)
        local_part = pieces[1]
        return local_part.replace('-', '_')
class Annotation:
    """Custom annotation attached to a research object."""

    # JSON-LD identifier of the annotation (``@id``).
    _id = jsonld.ib(kw_only=True, context='@id')

    # Annotation payload, serialized as ``oa:hasBody``.
    body = jsonld.ib(kw_only=True, context='oa:hasBody', default=None)

    # Origin of the annotation, serialized as ``dcterms:creator``.
    source = jsonld.ib(kw_only=True, context='dcterms:creator', default=None)
class CommitMixin:
    """Shared fields and helpers for objects bound to a commit and path."""

    commit = attr.ib(default=None, kw_only=True)
    client = attr.ib(default=None, kw_only=True)
    path = jsonld.ib(
        context='prov:atLocation',
        default=None,
        kw_only=True,
        converter=_str_or_none,
    )

    _id = jsonld.ib(context='@id', kw_only=True)
    _label = jsonld.ib(context='rdfs:label', kw_only=True)
    _project = jsonld.ib(
        context='schema:isPartOf',
        type=Project,
        kw_only=True,
        default=None,
    )

    @property
    def submodules(self):
        """Expose the submodules of the attached client."""
        client = self.client
        return client.submodules

    @_id.default
    def default_id(self):
        """Compute the blob identifier from the commit hash and path."""
        hexsha = self.commit.hexsha if self.commit else 'UNCOMMITTED'
        template = 'file://blob/{hexsha}/{self.path}'
        return template.format(hexsha=hexsha, self=self)

    @_label.default
    def default_label(self):
        """Compute the label: ``path@hexsha`` or just the hash without path."""
        hexsha = self.commit.hexsha if self.commit else 'UNCOMMITTED'
        template = '{self.path}@{hexsha}' if self.path else '{hexsha}'
        return template.format(hexsha=hexsha, self=self)

    def __attrs_post_init__(self):
        """Make an absolute path client-relative and attach the project."""
        if self.path:
            candidate = Path(self.path)
            if candidate.is_absolute():
                relative = candidate.relative_to(self.client.path)
                self.path = str(relative)

        # always force "project" to be the current project
        if self.client:
            self._project = self.client.project
class CommandOutput(CommandParameter):
    """Output parameter of a command."""

    create_folder = jsonld.ib(
        default=False,
        context='renku:createFolder',
        kw_only=True,
        type=bool,
    )

    produces = jsonld.ib(
        context='renku:produces',
        kw_only=True,
        type=[
            'renku.core.models.entities.Entity',
            'renku.core.models.entities.Collection',
        ],
    )

    mapped_to = jsonld.ib(
        default=None,
        context='prov:mappedTo',
        kw_only=True,
        type=MappedIOStream,
    )

    def default_id(self):
        """Generate a blank-node id made of alphanumeric characters only."""
        unique = str(uuid.uuid4())
        identifier = '_:CommandOutput-{}'.format(unique)
        return identifier.replace('-', '')

    def default_label(self):
        """Build the label from the produced path."""
        template = 'Command Output "{}"'
        return template.format(self.produces.path)

    def to_argv(self):
        """Return the command-line tokens for this output.

        A prefix ending in a space yields two tokens (prefix without the
        space, then the path); any other prefix is glued to the path.
        """
        if not self.prefix:
            return [self.produces.path]

        if self.prefix.endswith(' '):
            return [self.prefix[:-1], self.produces.path]

        return ['{}{}'.format(self.prefix, self.produces.path)]

    def to_stream_repr(self):
        """Return the shell redirection for a mapped stream, else ``''``."""
        if not self.mapped_to:
            return ''

        is_stdout = self.mapped_to.stream_type == 'stdout'
        redirect = ' > {}' if is_stdout else ' 2> {}'
        return redirect.format(self.produces.path)

    def __attrs_post_init__(self):
        """Fill in the id and label when they were not supplied."""
        if not self._id:
            self._id = self.default_id()

        if not self._label:
            self._label = self.default_label()
class Collection(Entity):
    """Directory entity that holds member entities."""

    members = jsonld.ib(context='prov:hadMember', kw_only=True)

    @members.default
    def default_members(self):
        """Create one member per directory entry, skipping ``.gitkeep``."""
        dir_path = self.client.path / self.path

        assert dir_path.is_dir()

        def _as_member(child):
            """Wrap ``child`` as a Collection (directory) or an Entity."""
            member_cls = Collection if child.is_dir() else Entity
            return member_cls(
                commit=self.commit,
                client=self.client,
                path=str(child.relative_to(self.client.path)),
                parent=self,
            )

        # ignore empty directories in Git repository
        return [
            _as_member(child)
            for child in dir_path.iterdir()
            if child.name != '.gitkeep'
        ]

    @property
    def entities(self):
        """Yield the entities of every member, then this collection."""
        for member in self.members:
            for nested in member.entities:
                yield nested

        yield self
class Workflow(Process):
    """Process description composed of subprocesses."""

    subprocesses = jsonld.ib(context='wfdesc:hasSubProcess', kw_only=True)

    @subprocesses.default
    def default_subprocesses(self):
        """Collect the plan of every subprocess of the owning activity."""
        plans = []
        for run in self.activity.subprocesses.values():
            plans.append(run.association.plan)
        return plans
class Usage(EntityProxyMixin):
    """Qualified use of an entity (a dependent path) by an activity."""

    entity = jsonld.ib(
        context='prov:entity',
        kw_only=True,
        type=[
            'renku.core.models.entities.Entity',
            'renku.core.models.entities.Collection',
            Dataset,
            DatasetFile,
        ],
    )

    role = jsonld.ib(context='prov:hadRole', default=None, kw_only=True)

    _id = jsonld.ib(context='@id', default=None, kw_only=True)

    @classmethod
    def from_revision(cls, client, path, revision='HEAD', **kwargs):
        """Create a usage of ``path`` as found at ``revision``.

        Extra keyword arguments are forwarded to the constructor.
        """
        from renku.core.models.entities import Entity

        used_entity = Entity.from_revision(client, path, revision)
        return cls(entity=used_entity, **kwargs)
class Process(CommitMixin):
    """Process description bound to the activity that ran it."""

    # The activity is stored as a weak reference.
    _activity = jsonld.ib(
        kw_only=True,
        converter=weakref.ref,
        context='prov:activity',
        type='renku.core.models.provenance.activities.Activity',
    )

    @property
    def activity(self):
        """Dereference and return the stored activity."""
        reference = self._activity
        return reference()
class SoftwareAgent:
    """Software that executed something, optionally started by a person."""

    label = jsonld.ib(context='rdfs:label', kw_only=True)

    was_started_by = jsonld.ib(
        kw_only=True,
        default=None,
        context='prov:wasStartedBy',
    )

    _id = jsonld.ib(context='@id', kw_only=True)

    @classmethod
    def from_commit(cls, commit):
        """Derive the agent from a Git commit.

        When author and committer differ, the committer becomes the software
        agent started by the author; otherwise the author ``Person`` itself
        is returned.
        """
        author = Person.from_commit(commit)
        committer = commit.committer

        if commit.author != committer:
            return cls(
                label=committer.name,
                id=committer.email,
                was_started_by=author,
            )

        return author
class Process(CommitMixin):
    """Represent a process.

    The owning activity is optional. When given, it is stored as a weak
    reference; when omitted, ``_activity`` stays ``None``.
    """

    _activity = jsonld.ib(
        default=None,
        context='prov:activity',
        kw_only=True,
        converter=lambda value: weakref.ref(value)
        if value is not None else None,
        type='renku.core.models.provenance.activities.Activity')

    @property
    def activity(self):
        """Return the activity object, or ``None`` if no activity is set.

        ``_activity`` defaults to ``None``, so it must be checked before it
        is dereferenced; calling ``None`` would raise ``TypeError``.
        """
        if self._activity is None:
            return None
        return self._activity()
class CommandArgument(CommandParameter):
    """Command parameter that is neither an input nor an output."""

    value = jsonld.ib(
        kw_only=True,
        type=str,
        default=None,
        context={
            '@id': 'renku:value',
            '@type': 'http://www.w3.org/2001/XMLSchema#string',
        },
    )

    def default_id(self):
        """Generate a blank-node id made of alphanumeric characters only."""
        unique = str(uuid.uuid4())
        identifier = '_:CommandArgument-{}'.format(unique)
        return identifier.replace('-', '')

    def default_label(self):
        """Build the label from the argument value."""
        template = 'Command Argument "{}"'
        return template.format(self.value)

    def to_argv(self):
        """Return the command-line tokens for this argument.

        A prefix ending in a space yields two tokens (prefix without the
        space, then the value); any other prefix is glued to the value.
        """
        if not self.prefix:
            return [self.value]

        if self.prefix.endswith(' '):
            return [self.prefix[:-1], self.value]

        return ['{}{}'.format(self.prefix, self.value)]

    def __attrs_post_init__(self):
        """Fill in the id and label when they were not supplied."""
        if not self._id:
            self._id = self.default_id()

        if not self._label:
            self._label = self.default_label()
class WorkflowRun(ProcessRun):
    """A workflow run typically contains several subprocesses.

    On initialization the run expands ``self.process`` into one subprocess
    run per workflow step (``subprocesses``) and derives its own
    ``generated`` entries from the generations of those subprocesses.
    """

    # Class instantiated as the plan by ``Association.from_activity``.
    __association_cls__ = Workflow

    # @reverse wfprov:wasPartOfWorkflowRun
    # Mapping of step id to the loaded step definition.
    children = attr.ib(kw_only=True)

    # Subprocess runs, appended by ``default_subprocesses``.
    _processes = jsonld.ib(context={
        '@reverse': 'wfprov:wasPartOfWorkflowRun',
    }, default=attr.Factory(list), kw_only=True, type=Process)
    # Mapping of step id to subprocess run; filled in ``__attrs_post_init__``.
    subprocesses = attr.ib(default=None, kw_only=True)
    outputs = attr.ib(default=None, kw_only=True)
    generated = jsonld.container.list(Generation, context={
        '@reverse': 'prov:activity',
    }, kw_only=True)

    @children.default
    def default_children(self):
        """Load children from process.

        :returns: dict mapping each step id of ``self.process`` to its
            loaded definition.
        """
        basedir = os.path.dirname(self.path) if self.path is not None else None

        def _load(step):
            """Load step definition."""
            # The step already carries its definition; nothing to load.
            if isinstance(step.run, WORKFLOW_STEP_RUN_TYPES):
                return step.run

            # With a commit, read the definition from that commit's tree.
            if self.commit:
                import yaml
                data = (self.commit.tree / basedir /
                        step.run).data_stream.read()
                return CWLClass.from_cwl(yaml.safe_load(data))

            # Otherwise load the definition via ``CWLClass.from_yaml``.
            return CWLClass.from_yaml(step.run)

        return {step.id: _load(step) for step in self.process.steps}

    def default_subprocesses(self):
        """Load subprocesses.

        Creates one run per workflow step, wiring each step input either to
        one of this run's own usages or to an entity generated by a step
        handled earlier in the loop. Every created run is also appended to
        ``self._processes``.

        :returns: dict mapping step id to the created subprocess run.
        """
        basedir = os.path.dirname(self.path)
        # NOTE(review): presumably git's "first parent" revision syntax.
        revision = '{0}^'.format(self.commit)

        # This run's usages, keyed by role.
        ins = {
            dependency.role: dependency
            for path, dependency in self.inputs.items()
            if isinstance(dependency, Usage)
        }

        # entities: generated entities by path.
        # outs: '<step id>/<source>' -> output path.
        entities = {}
        outs = {}
        subprocesses = {}

        for step in reversed(self.process.topological_steps):
            if isinstance(step.run, WORKFLOW_STEP_RUN_TYPES):
                # Definition embedded in the step: no file path for it.
                path = None
                process = step.run
            else:
                path = os.path.join(basedir, step.run)
                process = self.children[step.id]

            subprocess_id = self._id + '/steps/' + step.id

            inputs = {}
            for alias, source in step.in_.items():
                usage_id = subprocess_id + '/inputs/' + alias
                if source in ins:
                    # Input comes from one of the workflow's own usages.
                    dependency = ins[source]
                    inputs[dependency.path] = attr.evolve(
                        dependency,
                        role=alias,
                        id=usage_id,
                    )
                elif source in outs:
                    # Input is an output recorded for another step.
                    input_path = outs[source]
                    inputs[input_path] = Usage(
                        entity=entities[input_path],
                        role=alias,
                        id=usage_id,
                    )
                else:
                    # TODO check that it is not Path or Directory
                    pass

            subprocess_entity_commit = self.client.find_previous_commit(
                path, revision=revision)
            subprocess = process.create_run(
                commit=self.commit,
                client=self.client,
                part_of=self,
                process=process,
                path=path,
                inputs=inputs,
                id=subprocess_id,
            )

            subprocess.association = Association.from_activity(
                subprocess,
                commit=subprocess_entity_commit,
            )

            # Record this step's outputs; an existing key is kept.
            for output_path, source in subprocess.outputs.items():
                outs.setdefault(step.id + '/' + source, output_path)

            # Index generated entities (and collection members) by path.
            for generation in subprocess.generated:
                entity = generation.entity
                entities[entity.path] = entity

                if isinstance(entity, Collection):
                    entities.update(
                        **{member.path: member
                           for member in entity.members})

            subprocesses[step.id] = subprocess
            self._processes.append(subprocess)

        return subprocesses

    def iter_output_files(self, commit=None):
        """Yield tuples with output id and path.

        Reloads the step definitions into ``self.children`` before iterating.
        """
        # NOTE(review): ``commit`` is not used again below.
        commit = commit or self.commit

        tools = self.default_children()
        setattr(self, 'children', tools)

        for output in self.process.outputs:
            if output.type not in PATH_OBJECTS:
                continue

            if output.outputSource:
                # '<step id>/<source>': find the step's matching output.
                step_id, _, source = output.outputSource.partition('/')
                subprocess = self.subprocesses[step_id]
                for glob, output_id in subprocess.outputs.items():
                    if output.id == output_id:
                        yield output.id, glob
                        break
            elif output.outputBinding:
                glob = output.outputBinding.glob
                # TODO better support for Expression
                if glob.startswith('$(inputs.'):
                    # '$(inputs.<id>)': yield that input's default value.
                    input_id = glob[len('$(inputs.'):-1]
                    for input_ in self.inputs:
                        if input_.id == input_id:
                            yield output.id, input_.default
                else:
                    yield output.id, glob

    def default_generated(self):
        """Calculate default values.

        For every workflow output, finds the subprocess generation whose
        role equals the output source and re-labels a copy of it with the
        workflow output id and this run as activity.

        :raises KeyError: if no generation matches an output.
        """
        results = []
        for output in self.process.outputs:
            step_id, _, source = output.outputSource.partition('/')
            assert step_id in self.children

            for generated in self.subprocesses[step_id].generated:
                if generated.role == source:
                    results.append(
                        attr.evolve(
                            generated,
                            role=output.id,
                            activity=self,
                        ))
                    break
            else:
                # Loop ended without ``break``: nothing matched.
                raise KeyError(output)

        return results

    @property
    def nodes(self):
        """Yield all graph nodes."""
        for subprocess in reversed(self._processes):
            if subprocess.path is None:
                # skip nodes connecting directory to file
                continue
            yield from subprocess.nodes

    def __attrs_post_init__(self):
        """Attrs post initializations.

        Fills in id, inputs, subprocesses and generations (in this order)
        when they are empty, then runs the parent hook.
        """
        if not self._id:
            self._id = self.default_id()

        if not self.inputs:
            self.inputs = self.default_inputs()

        if not self.subprocesses:
            self.subprocesses = self.default_subprocesses()

        if not self.generated:
            self.generated = self.default_generated()

        super().__attrs_post_init__()
class Project(ReferenceMixin):
    """Represent a project.

    After initialization the creator is derived from the client when it was
    not given, and ``_id`` is set to the computed project URL whenever that
    URL can be computed.
    """

    name = jsonld.ib(default=None, context='schema:name')

    created = jsonld.ib(converter=parse_date, context='schema:dateCreated')

    updated = jsonld.ib(converter=parse_date, context='schema:dateUpdated')

    version = jsonld.ib(converter=str,
                        default=str(SUPPORTED_PROJECT_VERSION),
                        context='schema:schemaVersion')

    client = attr.ib(default=None, kw_only=True)

    creator = jsonld.ib(default=None, kw_only=True, context={
        '@id': 'schema:creator',
    }, type=Person)

    _id = jsonld.ib(context='@id', kw_only=True, default=None)

    @created.default
    @updated.default
    def _now(self):
        """Define default value for datetime fields."""
        return datetime.datetime.now(datetime.timezone.utc)

    def __attrs_post_init__(self):
        """Initialize computed attributes.

        :raises ValueError: if the project id cannot be computed and there
            is neither an explicit ``_id`` nor a client with a project set.
        """
        if not self.creator and self.client:
            if self.client.renku_metadata_path.exists():
                self.creator = Person.from_commit(
                    self.client.find_previous_commit(
                        self.client.renku_metadata_path, return_first=True),
                )
            else:
                # this assumes the project is being newly created
                self.creator = Person.from_git(self.client.repo)

        try:
            self._id = self.project_id
        except ValueError:
            # Fallback to old behaviour: keep an explicit id, else reuse the
            # id of the client's project, else propagate the error.
            if self._id:
                pass
            elif self.client and self.client.is_project_set():
                self._id = self.client.project._id
            else:
                raise

    @property
    def project_id(self):
        """Return the id for the project based on the repo origin remote.

        :raises ValueError: if the creator or the project name is not set.
        """
        # Import the submodules actually used: a bare ``import urllib`` does
        # not guarantee ``urllib.parse`` is loaded, and ``pathlib.posixpath``
        # is an implementation detail of ``pathlib``.
        import posixpath
        import urllib.parse

        # Determine the hostname for the resource URIs.
        # If RENKU_DOMAIN is set, it overrides the host from remote.
        # Default is localhost.
        host = 'localhost'

        if not self.creator:
            raise ValueError('Project Creator not set')

        owner = self.creator.email.split('@')[0]
        name = self.name

        if self.client:
            remote = self.client.remote
            host = remote.get('host') or host
            owner = remote.get('owner') or owner
            name = remote.get('name') or name
        host = os.environ.get('RENKU_DOMAIN') or host

        if not name:
            raise ValueError('Project name not set')
        name = urllib.parse.quote(name, safe='')

        project_url = urllib.parse.urljoin(
            'https://{host}'.format(host=host),
            posixpath.join(PROJECT_URL_PATH, owner, name))
        return project_url

    @classmethod
    def from_yaml(cls, path, client=None):
        """Return an instance from a YAML file.

        The source path is stored in ``__reference__`` so that ``to_yaml``
        writes back to the same file.
        """
        data = jsonld.read_yaml(path)

        self = cls.from_jsonld(data=data, client=client)
        self.__reference__ = path

        return self

    @classmethod
    def from_jsonld(cls, data, client=None):
        """Create an instance from JSON-LD data.

        :raises ValueError: if ``data`` is neither an instance of this class
            nor a dict.
        """
        if isinstance(data, cls):
            return data
        if not isinstance(data, dict):
            raise ValueError(data)

        return ProjectSchema(client=client).load(data)

    def to_yaml(self):
        """Write an instance to the referenced YAML file."""
        data = ProjectSchema().dump(self)
        jsonld.write_yaml(path=self.__reference__, data=data)

    def as_jsonld(self):
        """Create JSON-LD."""
        return ProjectSchema().dump(self)
class Run(CommitMixin):
    """Represents a `renku run` execution template.

    A run holds the base command together with its arguments, inputs and
    outputs, and may group other runs as ordered subprocesses.
    """

    command = jsonld.ib(
        default=None,
        context={
            '@id': 'renku:command',
            '@type': 'http://www.w3.org/2001/XMLSchema#string',
        },
        type=str,
        kw_only=True,
    )

    # Order of this run among the subprocesses of a parent run.
    process_order = jsonld.ib(
        default=None,
        context={
            '@id': 'renku:processOrder',
            '@type': 'http://www.w3.org/2001/XMLSchema#integer',
        },
        type=int,
        kw_only=True,
    )

    successcodes = jsonld.container.list(context='renku:successCodes',
                                         kw_only=True, type=int)

    subprocesses = jsonld.container.list('renku.core.models.workflow.run.Run',
                                         context='renku:hasSubprocess',
                                         kw_only=True)

    arguments = jsonld.container.list(context='renku:hasArguments',
                                      kw_only=True, type=CommandArgument)

    inputs = jsonld.container.list(context='renku:hasInputs', kw_only=True,
                                   type=CommandInput)

    outputs = jsonld.container.list(context='renku:hasOutputs', kw_only=True,
                                    type=CommandOutput)

    @classmethod
    def from_factory(cls, factory, client, commit, path):
        """Creates a ``Run`` from a ``CommandLineToolFactory``.

        Note that inputs which turn out to be outputs are removed from
        ``factory.inputs`` in place.
        """
        inputs = []
        arguments = []
        # Each converted output is an (output, input_to_remove) pair.
        outputs = [
            _convert_cmd_output(o, factory, client, commit)
            for o in factory.outputs
        ]  # TODO: handle stream!

        if outputs:
            # Split the pairs into outputs and the inputs they replace.
            outputs, inputs_to_remove = zip(*outputs)
            outputs = list(outputs)

            for i in inputs_to_remove:
                # remove inputs that are actually outputs
                # note: a single input can represent multiple outputs
                # in case of repetition in the cli
                if not i:
                    continue
                if i in factory.inputs:
                    factory.inputs.remove(i)

        # Remaining factory inputs become inputs or plain arguments,
        # depending on the type returned by the conversion.
        for i in factory.inputs:
            res = _convert_cmd_input(i, client, commit)

            if isinstance(res, CommandInput):
                inputs.append(res)
            else:
                arguments.append(res)

        return cls(
            client=client,
            commit=commit,
            path=path,
            command=' '.join(factory.baseCommand),
            successcodes=factory.successCodes,
            arguments=[_convert_cmd_binding(a)
                       for a in factory.arguments] + arguments,
            inputs=inputs,
            outputs=outputs)

    @property
    def activity(self):
        """Return the activity object."""
        # NOTE(review): ``_activity`` is not declared on this class; it is
        # presumably assigned externally as a weak reference - confirm.
        return self._activity()

    def to_argv(self):
        """Convert run into argv list.

        Parameters without a truthy ``position`` are left out; the others
        are emitted in ascending position after the command tokens.
        """
        argv = []

        if self.command:
            argv.extend(self.command.split(' '))

        arguments = self.inputs + self.outputs + self.arguments

        arguments = filter(lambda x: x.position, arguments)
        arguments = sorted(arguments, key=lambda x: x.position)
        argv.extend(e for a in arguments for e in a.to_argv())

        return argv

    def to_stream_repr(self):
        """Input/output stream representation.

        :returns: list with the stream representation of every input and
            output that is mapped to a stream.
        """
        stream_repr = []

        for input_ in self.inputs:
            if input_.mapped_to:
                stream_repr.append(input_.to_stream_repr())

        for output in self.outputs:
            if output.mapped_to:
                stream_repr.append(output.to_stream_repr())
        return stream_repr

    def update_id_and_label_from_commit_path(self, client, commit, path):
        """Updates the _id and _label using supplied commit and path.

        Only runs without a commit are updated; the client is always set
        and the call is propagated to all subprocesses.
        """
        self.client = client

        if not self.commit:
            self.commit = commit

            # Store the path relative to the client's root.
            path = Path(os.path.abspath(path)).relative_to(self.client.path)
            self.path = path
            self._id = self.default_id()
            self._label = self.default_label()

        if len(self.subprocesses) > 0:
            for s in self.subprocesses:
                s.update_id_and_label_from_commit_path(client, commit, path)

    def add_subprocess(self, subprocess, process_order=None):
        """Adds a subprocess to this run.

        Without an explicit ``process_order`` the position is found by
        bisecting the existing subprocesses (ordering via ``__lt__``).
        Inputs and outputs of the subprocess are merged into this run.

        :raises ValueError: if ``process_order`` is already taken.
        """
        if not process_order:
            process_order = 0
            if self.subprocesses:
                # sort subprocesses by dependencies
                process_order = bisect(self.subprocesses, subprocess)
                if process_order < len(self.subprocesses):
                    # inserted before end, recalculate orders or rest
                    for s in self.subprocesses:
                        if s.process_order >= process_order:
                            s.process_order += 1

        if any(s.process_order == process_order for s in self.subprocesses):
            raise ValueError(
                'process_order {} already exists'.format(process_order))

        subprocess.process_order = process_order

        input_paths = [i.consumes.path for i in self.inputs]
        output_paths = [o.produces.path for o in self.outputs]

        for input_ in subprocess.inputs:
            # Only consider paths this run neither consumes nor produces.
            if (input_.consumes.path not in input_paths and
                    input_.consumes.path not in output_paths):
                new_input = copy(input_)
                # Stream mappings are not carried over to this run.
                new_input.mapped_to = None

                matching_output = next(
                    (o for o in self.outputs
                     if o.produces.path == new_input.consumes.path),
                    None)

                if not matching_output:
                    self.inputs.append(new_input)
                    input_paths.append(new_input.consumes.path)

        for output in subprocess.outputs:
            if output.produces.path not in output_paths:
                new_output = copy(output)
                new_output.mapped_to = None
                self.outputs.append(new_output)
                output_paths.append(new_output.produces.path)

                # A path that is now produced here is no longer an input.
                matching_input = next(
                    (i for i in self.inputs
                     if i.consumes.path == new_output.produces.path),
                    None)
                if matching_input:
                    self.inputs.remove(matching_input)
                    input_paths.remove(matching_input.consumes.path)
        self.subprocesses.append(subprocess)

        self.subprocesses = sorted(self.subprocesses,
                                   key=lambda s: s.process_order)

    def __lt__(self, other):
        """Compares two subprocesses order based on their dependencies.

        Returns the set of paths produced by ``self`` and consumed by
        ``other``; it is truthy exactly when such a path exists.
        """
        a_inputs = set()
        b_outputs = set()

        # Paths consumed by ``other`` (including nested entities).
        for i in other.inputs:
            entity = i.consumes
            for subentity in entity.entities:
                a_inputs.add(subentity.path)

        # Paths produced by ``self`` (including nested entities).
        for i in self.outputs:
            entity = i.produces
            for subentity in entity.entities:
                b_outputs.add(subentity.path)

        return a_inputs & b_outputs
class Activity(CommitMixin):
    """Represent an activity in the repository.

    An activity is built from a commit: the paths changed in the commit
    become ``generated`` entries and the removed paths become
    ``invalidated`` entities.
    """

    _id = jsonld.ib(default=None, context='@id', kw_only=True)
    _message = jsonld.ib(context='rdfs:comment', kw_only=True)
    _was_informed_by = jsonld.ib(
        context='prov:wasInformedBy',
        kw_only=True,
    )

    part_of = attr.ib(default=None, kw_only=True)

    # Cache of parent collections created by ``_get_activity_entity``,
    # keyed by directory path.
    _collections = attr.ib(
        default=attr.Factory(OrderedDict), init=False, kw_only=True
    )
    generated = jsonld.container.list(
        Generation, context={
            '@reverse': 'prov:activity',
        }, kw_only=True
    )

    invalidated = jsonld.container.list(
        Entity,
        context={
            '@reverse': 'prov:wasInvalidatedBy',
        },
        kw_only=True
    )

    influenced = jsonld.ib(
        context='prov:influenced',
        kw_only=True,
    )

    started_at_time = jsonld.ib(
        context={
            '@id': 'prov:startedAtTime',
            '@type': 'http://www.w3.org/2001/XMLSchema#dateTime',
        },
        kw_only=True,
    )

    ended_at_time = jsonld.ib(
        context={
            '@id': 'prov:endedAtTime',
            '@type': 'http://www.w3.org/2001/XMLSchema#dateTime',
        },
        kw_only=True,
    )

    agent = jsonld.ib(
        context='prov:wasAssociatedWith',
        kw_only=True,
        default=renku_agent,
        type='renku.core.models.provenance.agents.SoftwareAgent'
    )
    person_agent = jsonld.ib(
        context='prov:wasAssociatedWith',
        kw_only=True,
        type='renku.core.models.provenance.agents.Person'
    )

    def default_generated(self):
        """Create default ``generated``."""
        generated = []

        for path in self.get_output_paths():
            entity = self._get_activity_entity(path)

            generated.append(
                Generation(activity=self, entity=entity, role=None)
            )
        return generated

    def get_output_paths(self):
        """Gets all output paths generated by this run.

        Without a commit the client's HEAD commit is used; without a commit
        and a client the result is an empty set.
        """
        index = set()

        commit = self.commit

        if not self.commit:
            if not self.client:
                return index
            commit = self.client.repo.head.commit

        for file_ in commit.diff(commit.parents or NULL_TREE):
            # ignore deleted files (note they appear as ADDED)
            # in this backwards diff
            if file_.change_type == 'A':
                continue
            path_ = Path(file_.a_path)

            is_dataset = self.client.DATASETS in str(path_)
            not_refs = LinkReference.REFS not in str(path_)
            does_not_exists = not path_.exists()

            # A dataset path that no longer exists is replaced by the
            # dataset's metadata path under the renku home.
            if all([is_dataset, not_refs, does_not_exists]):
                uid = uuid.UUID(path_.parent.name)
                path_ = (
                    Path(self.client.renku_home) / self.client.DATASETS /
                    str(uid) / self.client.METADATA
                )

            index.add(str(path_))

        return index

    def _get_activity_entity(self, path, deleted=False):
        """Gets the entity associated with this Activity and path.

        Parent directories are materialized as ``Collection`` objects
        (cached in ``self._collections``) and the entity is appended to the
        members of its direct parent collection.
        """
        client, commit, path = self.client.resolve_in_submodules(
            self.commit,
            path,
        )
        output_path = client.path / path
        parents = list(output_path.relative_to(client.path).parents)

        collection = None
        members = []
        # Walk from the top-most directory down; the last element of
        # ``parents`` is skipped.
        for parent in reversed(parents[:-1]):
            if str(parent) in self._collections:
                collection = self._collections[str(parent)]
            else:
                collection = Collection(
                    client=client,
                    commit=commit,
                    path=str(parent),
                    members=[],
                    parent=collection,
                )
                members.append(collection)
                self._collections[str(parent)] = collection

            members = collection.members

        entity_cls = Entity
        if (self.client.path / path).is_dir():
            entity_cls = Collection

        # TODO: use a factory method to generate the entity
        if str(path).startswith(
            os.path.join(client.renku_home, client.DATASETS)
        ) and not deleted:
            entity = client.load_dataset_from_path(path, commit=commit)
        else:
            entity = entity_cls(
                commit=commit,
                client=client,
                path=str(path),
                parent=collection,
            )

        if collection:
            collection.members.append(entity)

        return entity

    def default_invalidated(self):
        """Entities invalidated by this Action."""
        results = []
        for path in self.removed_paths:
            entity = self._get_activity_entity(path, deleted=True)

            results.append(entity)
        return results

    @influenced.default
    def default_influenced(self):
        """Calculate default values."""
        return list(self._collections.values())

    @property
    def parents(self):
        """Return parent commits (an empty list when there is no commit)."""
        # Always return a list so subclasses can safely concatenate to it.
        if self.commit:
            return list(self.commit.parents)
        return []

    @property
    def removed_paths(self):
        """Return all paths removed in the commit."""
        index = set()
        if not self.commit:
            return index

        for file_ in self.commit.diff(self.commit.parents or NULL_TREE):
            # only process deleted files (note they appear as ADDED)
            # in this backwards diff
            if file_.change_type != 'A':
                continue
            path_ = Path(file_.a_path)

            index.add(str(path_))

        return index

    @property
    def paths(self):
        """Return all paths in the commit."""
        index = set()

        for file_ in self.commit.diff(self.commit.parents or NULL_TREE):
            # ignore deleted files (note they appear as ADDED)
            # in this backwards diff
            if file_.change_type == 'A':
                continue
            path_ = Path(file_.a_path)

            is_dataset = self.client.DATASETS in str(path_)
            not_refs = LinkReference.REFS not in str(path_)
            does_not_exists = not (
                path_.exists() or
                (path_.is_symlink() and os.path.lexists(path_))
            )

            # A dataset path that no longer exists is replaced by the
            # dataset's metadata path under the renku home.
            if all([is_dataset, not_refs, does_not_exists]):
                uid = uuid.UUID(path_.parent.name)
                path_ = (
                    Path(self.client.renku_home) / self.client.DATASETS /
                    str(uid) / self.client.METADATA
                )

            index.add(str(path_))

        return index

    @classmethod
    def generate_id(cls, commitsha):
        """Calculate action ID.

        The host defaults to ``localhost``, may be taken from the client's
        remote and is overridden by the ``RENKU_DOMAIN`` environment
        variable when that is set.
        """
        host = 'localhost'
        if hasattr(cls, 'client'):
            host = cls.client.remote.get('host') or host
        # Fall back to the host determined above rather than a literal
        # 'localhost', which would discard the client-derived host.
        host = os.environ.get('RENKU_DOMAIN') or host

        # always set the id by the identifier
        return urllib.parse.urljoin(
            'https://{host}'.format(host=host),
            posixpath.join(
                '/activities', 'commit/{commit}'.format(commit=commitsha)
            )
        )

    def default_id(self):
        """Configure calculated ID."""
        if self.commit:
            return self.generate_id(self.commit.hexsha)
        # NOTE(review): spelled differently from the 'UNCOMMITTED' marker
        # used by ``CommitMixin``; kept as is because it is part of the id.
        return self.generate_id('UNCOMMITED')

    @_message.default
    def default_message(self):
        """Generate a default message."""
        if self.commit:
            return self.commit.message

    @_was_informed_by.default
    def default_was_informed_by(self):
        """List parent actions."""
        if self.commit:
            return [{
                '@id': self.generate_id(parent),
            } for parent in self.commit.parents]

    @started_at_time.default
    def default_started_at_time(self):
        """Configure calculated properties."""
        if self.commit:
            return self.commit.authored_datetime.isoformat()

    @ended_at_time.default
    def default_ended_at_time(self):
        """Configure calculated properties."""
        if self.commit:
            return self.commit.committed_datetime.isoformat()

    @person_agent.default
    def default_person_agent(self):
        """Set person agent to be the author of the commit."""
        if self.commit:
            return Person.from_commit(self.commit)
        return None

    @property
    def nodes(self):
        """Return topologically sorted nodes.

        Yields the nodes of every generated and invalidated entity, then
        the collected parent collections in reverse insertion order.
        """
        collections = OrderedDict()

        def _parents(node):
            """Yield the ancestors of ``node``, outermost first."""
            if node.parent:
                yield from _parents(node.parent)
                yield node.parent

        for output in self.generated:
            for parent in _parents(output.entity):
                collections[parent.path] = parent

            yield from _nodes(output)

        for removed in self.invalidated:
            for parent in _parents(removed):
                collections[parent.path] = parent

            yield from _nodes(removed)

        yield from reversed(collections.values())

    def __attrs_post_init__(self):
        """Sets ``generated`` and ``invalidated`` default values if needed."""
        super().__attrs_post_init__()
        if not self._id:
            self._id = self.default_id()
        if not self.generated:
            self.generated = self.default_generated()

        for g in self.generated:
            _set_entity_client_commit(g.entity, self.client, self.commit)

        if not self.invalidated:
            self.invalidated = self.default_invalidated()

        # Point every generation back at this activity (weakly).
        if self.generated:
            for g in self.generated:
                g._activity = weakref.ref(self)
class ProcessRun(Activity):
    """A process run is a particular execution of a Process description."""

    # Class instantiated as the plan by ``Association.from_activity``.
    __association_cls__ = Run

    generated = jsonld.container.list(
        Generation,
        context={
            '@reverse': 'prov:activity',
        },
        kw_only=True,
        default=None
    )

    association = jsonld.ib(
        context='prov:qualifiedAssociation',
        default=None,
        kw_only=True,
        type=Association
    )

    annotations = jsonld.container.list(
        context={
            '@reverse': 'oa:hasTarget',
        },
        kw_only=True,
        type=Annotation
    )

    qualified_usage = jsonld.container.list(
        Usage, context='prov:qualifiedUsage', kw_only=True, default=None
    )

    def __attrs_post_init__(self):
        """Calculate properties.

        Resolves a missing commit through the client, collects plugin
        annotations, links the plan back to this run, and attaches client
        and commit to usages that lack them.
        """
        super().__attrs_post_init__()
        if not self.commit and self.client:
            self.commit = self.client.find_previous_commit(self.path)

        if not self.annotations:
            self.annotations = self.plugin_annotations()

        if self.association:
            self.association.plan._activity = weakref.ref(self)
            plan = self.association.plan
            if not plan.commit:
                if self.client:
                    plan.client = self.client
                if self.commit:
                    plan.commit = self.commit

                if plan.inputs:
                    for i in plan.inputs:
                        _set_entity_client_commit(
                            i.consumes, self.client, self.commit
                        )
                if plan.outputs:
                    for o in plan.outputs:
                        _set_entity_client_commit(
                            o.produces, self.client, self.commit
                        )

        if self.qualified_usage and self.client and self.commit:
            usages = []
            revision = '{0}'.format(self.commit)
            for usage in self.qualified_usage:
                if not usage.commit and '@UNCOMMITTED' in usage._label:
                    # Re-create uncommitted usages at this run's revision.
                    usages.append(
                        Usage.from_revision(
                            client=self.client,
                            path=usage.path,
                            role=usage.role,
                            revision=revision,
                            id=usage._id,
                        )
                    )
                else:
                    if not usage.client:
                        usage.entity.set_client(self.client)
                    if not usage.commit:
                        # The label is '<path>@<hexsha>'; split on the last
                        # '@' only, since the path may contain '@' too. A
                        # separate name keeps ``revision`` intact for the
                        # remaining usages.
                        usage_revision = usage._label.rsplit('@', 1)[-1]
                        usage.entity.commit = self.client.repo.commit(
                            usage_revision
                        )

                    usages.append(usage)
            self.qualified_usage = usages

    def default_generated(self):
        """Create default ``generated``.

        One generation is created per output of the associated plan; the
        result is empty when there is no association or plan.
        """
        generated = []

        if not self.association or not self.association.plan:
            return generated

        for output in self.association.plan.outputs:
            entity = Entity.from_revision(
                self.client,
                output.produces.path,
                revision=self.commit,
                parent=output.produces.parent
            )

            generation = Generation(
                activity=self, role=output.sanitized_id, entity=entity
            )
            generated.append(generation)
        return generated

    def add_annotations(self, annotations):
        """Adds annotations from an external tool."""
        self.annotations.extend(annotations)

    def plugin_annotations(self):
        """Adds ``Annotation``s from plugins to a ``ProcessRun``.

        :returns: flat list of the annotations returned by all
            ``process_run_annotations`` hook implementations.
        """
        from renku.core.plugins.pluginmanager import get_plugin_manager
        pm = get_plugin_manager()

        results = pm.hook.process_run_annotations(run=self)
        return [a for r in results for a in r]

    @classmethod
    def from_run(
        cls,
        run,
        client,
        path,
        commit=None,
        is_subprocess=False,
        update_commits=False
    ):
        """Convert a ``Run`` to a ``ProcessRun``.

        :param run: the ``Run`` used as plan.
        :param client: client used to resolve commits and entities.
        :param path: path stored on the created process run.
        :param commit: commit of the run; defaults to the client's HEAD.
        :param is_subprocess: when true, the id gets a
            ``/steps/step_<process_order>`` suffix.
        :param update_commits: when true, each input entity is re-created
            from the previous commit of its path.
        """
        from .agents import SoftwareAgent

        if not commit:
            commit = client.repo.head.commit

        usages = []

        id_ = ProcessRun.generate_id(commit)

        if is_subprocess:
            id_ = '{}/steps/step_{}'.format(id_, run.process_order)

        for input_ in run.inputs:
            usage_id = id_ + '/inputs/' + input_.sanitized_id
            input_path = input_.consumes.path
            entity = input_.consumes
            if update_commits:
                revision = client.find_previous_commit(
                    input_path, revision=commit.hexsha
                )
                entity = Entity.from_revision(client, input_path, revision)

            dependency = Usage(
                entity=entity, role=input_.sanitized_id, id=usage_id
            )

            usages.append(dependency)

        agent = SoftwareAgent.from_commit(commit)
        association = Association(
            agent=agent, id=id_ + '/association', plan=run
        )

        process_run = cls(
            id=id_,
            qualified_usage=usages,
            association=association,
            client=client,
            commit=commit,
            path=path
        )

        generated = []

        for output in run.outputs:
            entity = Entity.from_revision(
                client,
                output.produces.path,
                revision=commit,
                parent=output.produces.parent
            )

            generation = Generation(
                activity=process_run,
                role=output.sanitized_id,
                entity=entity
            )
            generated.append(generation)

        process_run.generated = generated

        # NOTE(review): the returned annotations are discarded here; the
        # call is kept because it still invokes the plugin hooks - confirm
        # whether the result should be stored.
        process_run.plugin_annotations()
        return process_run

    @property
    def parents(self):
        """Return parent commits.

        These are the commits of all entities used by this run followed by
        the parents reported by the base class.
        """
        # The base class may report no parents as ``None``.
        inherited = super().parents or []
        return [
            member.commit for usage in self.qualified_usage
            for member in usage.entity.entities
        ] + inherited

    @property
    def nodes(self):
        """Return topologically sorted nodes."""
        # Outputs go first
        yield from super().nodes

        # Activity itself
        yield self.association.plan
class Person:
    """Represent a person."""

    # Name of the person; must be a ``str``.
    name = jsonld.ib(
        context='schema:name', kw_only=True, validator=instance_of(str)
    )

    # Optional e-mail address, checked by ``check_email``.
    email = jsonld.ib(context='schema:email', default=None, kw_only=True)

    # Label; defaults to the name.
    label = jsonld.ib(context='rdfs:label', kw_only=True)

    affiliation = jsonld.ib(
        default=None, kw_only=True, context='schema:affiliation'
    )

    alternate_name = jsonld.ib(
        default=None, kw_only=True, context='schema:alternateName'
    )

    _id = jsonld.ib(context='@id', kw_only=True)

    @_id.default
    def default_id(self):
        """Set the default id.

        :returns: ``mailto:<email>`` when an e-mail is set, otherwise
            ``_:<name>`` built from the lower-cased name with punctuation,
            non-printable characters and whitespace removed.
        """
        import string
        if self.email:
            return 'mailto:{email}'.format(email=self.email)

        # prep name to be a valid ntuple string
        name = self.name.translate(str.maketrans('', '', string.punctuation))
        name = ''.join(char for char in name if char in string.printable)
        return '_:{}'.format(''.join(name.lower().split()))

    @email.validator
    def check_email(self, attribute, value):
        """Check that the email is valid.

        :raises ValueError: if an e-mail is set but is not a string of the
            form ``<local>@<domain>.<tld>``.
        """
        if self.email and not (
            isinstance(value, str) and
            re.match(r'[^@]+@[^@]+\.[^@]+', value)
        ):
            raise ValueError('Email address is invalid.')

    @label.default
    def default_label(self):
        """Set the default label."""
        return self.name

    @classmethod
    def from_commit(cls, commit):
        """Create an instance from a Git commit."""
        return cls(
            name=commit.author.name,
            email=commit.author.email,
        )

    @property
    def short_name(self):
        """Gives full name in short form.

        All words but the last one are reduced to their initial, e.g.
        ``"John Ronald Doe"`` becomes ``"J.R.Doe"``. A name made of at most
        one word (including an empty name) is returned unchanged.
        """
        names = self.name.split()
        # Nothing to abbreviate; this also avoids indexing an empty list
        # when the name is empty or contains only whitespace.
        if len(names) <= 1:
            return self.name

        *given_names, last_name = names
        initials = '.'.join(part[0] for part in given_names)
        return '{0}.{1}'.format(initials, last_name)

    @property
    def full_identity(self):
        """Return name, email, and affiliation."""
        email = f' <{self.email}>' if self.email else ''
        affiliation = f' [{self.affiliation}]' if self.affiliation else ''
        return f'{self.name}{email}{affiliation}'

    @classmethod
    def from_git(cls, git):
        """Create an instance from a Git repo.

        :param git: object whose ``config_reader()`` gives access to the
            ``user.name`` and ``user.email`` values.
        :raises errors.ConfigurationError: if the option or section is
            missing from the configuration.
        :raises errors.MissingUsername: if the user name is empty.
        :raises errors.MissingEmail: if the e-mail is empty.
        """
        git_config = git.config_reader()
        try:
            name = git_config.get_value('user', 'name', None)
            email = git_config.get_value('user', 'email', None)
        except (
            configparser.NoOptionError, configparser.NoSectionError
        ):  # pragma: no cover
            raise errors.ConfigurationError(
                'The user name and email are not configured. '
                'Please use the "git config" command to configure them.\n\n'
                '\tgit config --global --add user.name "John Doe"\n'
                '\tgit config --global --add user.email '
                '"*****@*****.**"\n')

        # Check the git configuration.
        if not name:  # pragma: no cover
            raise errors.MissingUsername()
        if not email:  # pragma: no cover
            raise errors.MissingEmail()

        return cls(name=name, email=email)

    @classmethod
    def from_string(cls, string):
        """Create an instance from a 'Name <email>' string.

        An optional ``[affiliation]`` part after the e-mail is parsed too.
        """
        # Groups: name, e-mail inside <...>, affiliation inside [...].
        regex_pattern = (
            r'([^<>\[\]]*)'
            r'(?:<{1}\s*(\S+@\S+\.\S+){0,1}\s*>{1}){0,1}\s*'
            r'(?:\[{1}(.*)\]{1}){0,1}'
        )
        name, email, affiliation = re.search(regex_pattern, string).groups()
        if name:
            name = name.strip()
        if affiliation:
            affiliation = affiliation.strip()
        # An empty affiliation is stored as ``None``.
        affiliation = affiliation or None

        return cls(name=name, email=email, affiliation=affiliation)

    @classmethod
    def from_dict(cls, obj):
        """Create and instance from a dictionary."""
        return cls(**obj)

    @classmethod
    def from_jsonld(cls, data):
        """Create an instance from JSON-LD data.

        :returns: ``data`` itself if it already is an instance of this
            class, otherwise the result of ``PersonSchema().load(data)``.
        :raises ValueError: if ``data`` is neither an instance nor a dict.
        """
        if isinstance(data, cls):
            return data
        if not isinstance(data, dict):
            raise ValueError(data)

        return PersonSchema().load(data)

    def __attrs_post_init__(self):
        """Finish object initialization."""
        # handle the case where ids were improperly set
        if self._id == 'mailto:None' or self._id is None:
            self._id = self.default_id()

        if self.label is None:
            self.label = self.default_label()
class Project(object):
    """Represent a project."""

    name = jsonld.ib(default=None, context='schema:name')

    # Creation and update dates; both default to "now" in UTC.
    created = jsonld.ib(
        converter=parse_date,
        context='schema:dateCreated',
    )

    updated = jsonld.ib(
        converter=parse_date,
        context='schema:dateUpdated',
    )

    version = jsonld.ib(
        converter=str,
        default='2',
        context='schema:schemaVersion',
    )

    client = attr.ib(default=None, kw_only=True)

    creator = jsonld.ib(
        default=None,
        kw_only=True,
        context={
            '@id': 'schema:creator',
        },
        type=Person
    )

    _id = jsonld.ib(context='@id', kw_only=True, default=None)

    @created.default
    @updated.default
    def _now(self):
        """Define default value for datetime fields."""
        return datetime.datetime.now(datetime.timezone.utc)

    def __attrs_post_init__(self):
        """Initialize computed attributes.

        Determines the creator when it is missing and a client is
        available, then always recomputes the id from ``project_id``.
        """
        if not self.creator and self.client:
            if self.client.renku_metadata_path.exists():
                self.creator = Person.from_commit(
                    self.client.find_previous_commit(
                        self.client.renku_metadata_path, return_first=True
                    ),
                )
            else:
                # this assumes the project is being newly created
                self.creator = Person.from_git(self.client.repo)

        self._id = self.project_id

    @property
    def project_id(self):
        """Return the id for the project based on the repo origin remote.

        The id is ``https://<host>`` joined with
        ``PROJECT_URL_PATH/<owner>/<name>``. Values reported by the client
        remote take precedence over the local fallbacks (``localhost``, the
        local part of the creator e-mail, the project name); missing owner
        or name are replaced by ``NULL``.
        """
        # Import the modules that are really used: ``import urllib`` alone
        # does not guarantee that ``urllib.parse`` is loaded, and
        # ``pathlib.posixpath`` is not part of the public pathlib API.
        import posixpath
        import urllib.parse

        # Determine the hostname for the resource URIs.
        # If RENKU_DOMAIN is set, it overrides the host from remote.
        # Default is localhost.
        host = 'localhost'

        # The creator e-mail is optional, so guard before splitting it.
        creator_email = self.creator.email if self.creator else None
        owner = creator_email.split('@')[0] if creator_email else 'NULL'
        name = self.name

        if self.client:
            remote = self.client.remote
            host = remote.get('host') or host
            owner = remote.get('owner') or owner
            name = remote.get('name') or name

        host = os.environ.get('RENKU_DOMAIN') or host

        if name:
            # Quote everything, including '/', so the name stays a single
            # path segment.
            name = urllib.parse.quote(name, safe='')

        return urllib.parse.urljoin(
            'https://{host}'.format(host=host),
            posixpath.join(PROJECT_URL_PATH, owner, name or 'NULL')
        )
class CommitMixin:
    """Represent a commit mixin.

    Provides the ``commit``/``client``/``path`` attributes together with an
    id, a label and a project reference derived from them.
    """

    commit = attr.ib(default=None, kw_only=True)
    client = attr.ib(default=None, kw_only=True)
    path = jsonld.ib(
        context='prov:atLocation',
        default=None,
        kw_only=True,
        converter=_str_or_none
    )

    _id = jsonld.ib(default=None, context='@id', kw_only=True)
    _label = jsonld.ib(context='rdfs:label', kw_only=True)
    _project = jsonld.ib(
        context='schema:isPartOf', type=Project, kw_only=True, default=None
    )

    @property
    def submodules(self):
        """Proxy to client submodules."""
        return self.client.submodules

    def default_id(self):
        """Configure calculated ID.

        :returns: ``https://<host>/blob/<hexsha>/<path>`` where ``hexsha``
            is ``UNCOMMITTED`` when no commit is set.
        """
        hexsha = self.commit.hexsha if self.commit else 'UNCOMMITTED'

        # Determine the hostname for the resource URIs.
        # If RENKU_DOMAIN is set, it overrides the host from remote.
        # Default is localhost.
        host = 'localhost'
        if self.client:
            host = self.client.remote.get('host') or host
        host = os.environ.get('RENKU_DOMAIN') or host

        # always set the id by the identifier
        # (the path is passed as one ready-made string; no path joining is
        # needed, which also avoids the non-public ``pathlib.posixpath``)
        return urllib.parse.urljoin(
            'https://{host}'.format(host=host),
            '/blob/{hexsha}/{path}'.format(hexsha=hexsha, path=self.path)
        )

    @_label.default
    def default_label(self):
        """Generate a default label.

        :returns: ``<path>@<hexsha>`` when a path is set, else ``<hexsha>``;
            ``hexsha`` is ``UNCOMMITTED`` when no commit is set.
        """
        hexsha = self.commit.hexsha if self.commit else 'UNCOMMITTED'
        if self.path:
            return '{self.path}@{hexsha}'.format(hexsha=hexsha, self=self)
        return '{hexsha}'.format(hexsha=hexsha)

    def __attrs_post_init__(self):
        """Post-init hook."""
        if self.path:
            path = pathlib.Path(self.path)
            if path.is_absolute():
                # Absolute paths are stored relative to the client path.
                self.path = str(path.relative_to(self.client.path))

        # always force "project" to be the current project
        if self.client:
            self._project = self.client.project

        if not self._id:
            self._id = self.default_id()
class ProcessRun(Activity):
    """A process run is a particular execution of a Process description."""

    __association_cls__ = Process

    # Mappings filled in by ``__attrs_post_init__``: ``inputs`` maps an
    # input path to its usage, ``outputs`` maps an output path to its id.
    inputs = attr.ib(default=None, kw_only=True)
    outputs = attr.ib(default=None, kw_only=True)

    generated = jsonld.container.list(
        Generation,
        context={
            '@reverse': 'prov:activity',
        },
        kw_only=True,
        default=None
    )

    association = jsonld.ib(
        context='prov:qualifiedAssociation',
        default=None,
        kw_only=True,
        type=Association
    )

    annotations = jsonld.container.list(
        context={
            '@reverse': 'oa:hasTarget',
        },
        kw_only=True,
        type=Annotation
    )

    qualified_usage = jsonld.ib(
        default=None,
        context='prov:qualifiedUsage',
        kw_only=True,
        type=Usage
    )

    def __attrs_post_init__(self):
        """Calculate properties.

        Derives inputs, qualified usages, association and annotations when
        they were not supplied, and registers a missing output when the
        run has no path.
        """
        super().__attrs_post_init__()

        if not self.inputs:
            self.inputs = self.default_inputs()

        if not self.qualified_usage:
            self.qualified_usage = self.default_qualified_usage()

        if self.association is None:
            self.association = Association.from_activity(self)

        if not self.annotations:
            process_annotations = getattr(self.process, 'annotations', None)
            if process_annotations:
                self.annotations = process_annotations
            self.annotations.extend(self.plugin_annotations())

        if self.path is not None:
            return

        # FIXME only works for linking directory to file
        known_output_ids = set(self.outputs.values())
        for output_id, output_path in self.iter_output_files():
            if output_id in known_output_ids:
                continue

            # First input whose role is 'input_directory'.
            input_directory = next(
                input_path for input_path, usage in self.inputs.items()
                if usage.role == 'input_directory'
            )
            target = os.path.join(input_directory, output_path)
            self.outputs[target] = output_id
            break

    def plugin_annotations(self):
        """Adds ``Annotation``s from plugins to a ``ProcessRun``."""
        from renku.core.plugins.pluginmanager import get_plugin_manager

        hook_results = get_plugin_manager().hook.process_run_annotations(
            run=self
        )

        # Flatten the per-plugin results into a single list.
        collected = []
        for result in hook_results:
            for annotation in result:
                collected.append(annotation)
        return collected

    def default_inputs(self):
        """Guess default inputs from a process.

        :returns: dictionary mapping each input path to a ``Usage`` created
            from the parent revision of the commit; inputs for which a
            ``KeyError`` is raised are left out.
        """
        base_directory = os.path.dirname(self.path)
        current_commit = self.commit
        current_client = self.client
        current_process = self.process

        parent_revision = '{0}^'.format(current_commit)

        usages_by_path = {}
        input_files = current_process.iter_input_files(base_directory)
        for input_id, input_path in input_files:
            try:
                usages_by_path[input_path] = Usage.from_revision(
                    client=current_client,
                    path=input_path,
                    role=input_id,
                    revision=parent_revision,
                    id=self._id + '/inputs/' + input_id,
                )
            except KeyError:
                continue

        return usages_by_path

    def default_qualified_usage(self):
        """Generate list of used artifacts."""
        return [usage for usage in self.inputs.values()]

    def iter_output_files(self, commit=None):
        """Yield tuples with output id and path."""
        reference_prefix = '$(inputs.'
        current_process = self.process

        for output in current_process.outputs:
            if output.type in {'stdout', 'stderr'}:
                # Standard streams: the target is stored on the process.
                stream_target = getattr(current_process, output.type)
                if stream_target:
                    yield output.id, stream_target
                continue

            if output.type not in PATH_OBJECTS:
                continue

            pattern = output.outputBinding.glob
            # TODO better support for Expression
            if not pattern.startswith(reference_prefix):
                yield output.id, pattern
                continue

            # "$(inputs.<id>)": use the default of the referenced input.
            referenced_id = pattern[len(reference_prefix):-1]
            referenced_input = next(
                (
                    candidate for candidate in current_process.inputs
                    if candidate.id == referenced_id
                ),
                None,
            )
            if referenced_input is not None:
                yield output.id, referenced_input.default

    def default_outputs(self):
        """Guess default outputs from a process."""
        if self.path is None:
            return {}

        outputs_by_path = {}
        for output_id, output_path in self.iter_output_files():
            outputs_by_path[output_path] = output_id
        return outputs_by_path

    @property
    def parents(self):
        """Return parent commits."""
        usage_commits = []
        for usage in self.qualified_usage:
            for member in usage.entity.entities:
                usage_commits.append(member.commit)
        return usage_commits + super().parents

    @property
    def nodes(self):
        """Return topologically sorted nodes."""
        # Outputs go first
        for node in super().nodes:
            yield node

        # Activity itself
        yield self.association.plan