def _applyUnknownGraphs(self, delta, known_blobs):
    new_contexts = {}

    for entry in delta:
        for identifier, changeset in entry['delta'].items():
            if isinstance(identifier, BNode) or str(identifier) == 'default':
                continue  # TODO default graph use case
            if identifier not in new_contexts.keys():
                fileName = iri_to_name(identifier) + '.nt'

                if fileName in known_blobs:
                    reg = re.compile(re.escape(iri_to_name(identifier)) + "_([0-9]+).nt")
                    # n ~ numbers (in blobname), b ~ blobname, m ~ match
                    n = [int(m.group(1))
                         for b in known_blobs for m in [reg.search(b)] if m] + [0]
                    fileName = '{}_{}.nt'.format(iri_to_name(identifier), max(n) + 1)

                new_contexts[identifier] = FileReference(fileName, '')

            fileReference = new_contexts[identifier]
            applyChangeset(fileReference, changeset, identifier)

    return new_contexts
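
# Illustrative sketch only: the delta layout below is inferred from the loops
# above, not from a documented API. Each delta entry carries a 'delta' mapping
# from a graph identifier to a changeset of (operation, triples) pairs; graphs
# without a known blob get a fresh FileReference whose name is derived from
# the graph IRI via iri_to_name().
#
#   delta = [{
#       'delta': {
#           URIRef('http://example.org/graph1'): [
#               ('additions', [(subj, pred, obj)]),
#           ],
#       },
#   }]
#   new_contexts = self._applyUnknownGraphs(delta, known_blobs=set())
#   # new_contexts now maps the graph IRI to a FileReference such as
#   # '<iri_to_name(identifier)>.nt' (or a '_<n>.nt' variant on name clashes).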
def getFileReferenceAndContext(self, blob, commit):
    """Get the FileReference and Context for a given blob (name, oid) of a commit.

    On a cache miss this method also updates the commit cache.
    """
    if commit.id not in self._graphconfigs:
        self.updateGraphConfig(commit.id)

    if blob not in self._blobs:
        (name, oid) = blob
        content = commit.node(path=name).content
        graphUri = self._graphconfigs.get(commit.id).getgraphuriforfile(name)
        graph = Graph(identifier=URIRef(graphUri))
        graph.parse(data=content, format='nt')
        quitWorkingData = (FileReference(name, content), graph)
        self._blobs.set(blob, quitWorkingData)
        return quitWorkingData
    return self._blobs.get(blob)
def getFileReferenceAndContext(self, blob, commit):
    """Get the FileReference and Context for a given blob (name, oid) of a commit.

    On a cache miss this method also updates the commit cache.
    """
    uriFileMap = self.config.getgraphurifilemap()

    if blob not in self._blobs:
        (name, oid) = blob
        content = commit.node(path=name).content
        # content = self.repository._repository[oid].data
        graphUris = self.config.getgraphuriforfile(name)
        graphsFromConfig = set(Graph(identifier=i) for i in graphUris)
        tmp = ConjunctiveGraph()
        tmp.parse(data=content, format='nquads')
        contexts = set(
            context for context in tmp.contexts(None)
            if context.identifier in uriFileMap) | graphsFromConfig
        quitWorkingData = (FileReference(name, content), contexts)
        self._blobs.set(blob, quitWorkingData)
        return quitWorkingData
    return self._blobs.get(blob)
def changeset(self, commit):
    if (not self.config.hasFeature(Feature.Persistence)) and (
            not self.config.hasFeature(Feature.Provenance)):
        return

    g = self.store.store

    if self.config.hasFeature(Feature.Provenance):
        role_author_uri = QUIT['Author']
        role_committer_uri = QUIT['Committer']

        g.add((role_author_uri, is_a, PROV['Role']))
        g.add((role_committer_uri, is_a, PROV['Role']))

    # Create the commit
    i1, commitid = self.instance(commit.id, True)

    commit_uri = QUIT['commit-' + commit.id]

    if self.config.hasFeature(Feature.Provenance):
        g.add((commit_uri, is_a, PROV['Activity']))

        if 'Source' in commit.properties.keys():
            g.add((commit_uri, is_a, QUIT['Import']))
            sources = commit.properties['Source'].strip()
            for source in re.findall("<.*?>", sources):
                g.add((commit_uri, QUIT['dataSource'], URIRef(source.strip("<>"))))
        if 'Query' in commit.properties.keys():
            g.add((commit_uri, is_a, QUIT['Transformation']))
            g.add((commit_uri, QUIT['query'], Literal(commit.properties['Query'].strip())))

        g.add((commit_uri, QUIT['hex'], Literal(commit.id)))
        g.add((commit_uri, PROV['startedAtTime'],
               Literal(git_timestamp(commit.author.time, commit.author.offset),
                       datatype=XSD.dateTime)))
        g.add((commit_uri, PROV['endedAtTime'],
               Literal(git_timestamp(commit.committer.time, commit.committer.offset),
                       datatype=XSD.dateTime)))
        g.add((commit_uri, RDFS['label'], Literal(commit.message.strip())))

        # Author
        hash = pygit2.hash(commit.author.email).hex
        author_uri = QUIT['user-' + hash]
        g.add((commit_uri, PROV['wasAssociatedWith'], author_uri))

        g.add((author_uri, is_a, PROV['Agent']))
        g.add((author_uri, RDFS.label, Literal(commit.author.name)))
        g.add((author_uri, FOAF.mbox, Literal(commit.author.email)))

        q_author_uri = BNode()
        g.add((commit_uri, PROV['qualifiedAssociation'], q_author_uri))
        g.add((q_author_uri, is_a, PROV['Association']))
        g.add((q_author_uri, PROV['agent'], author_uri))
        g.add((q_author_uri, PROV['role'], role_author_uri))

        if commit.author.name != commit.committer.name:
            # Committer
            hash = pygit2.hash(commit.committer.email).hex
            committer_uri = QUIT['user-' + hash]
            g.add((commit_uri, PROV['wasAssociatedWith'], committer_uri))

            g.add((committer_uri, is_a, PROV['Agent']))
            g.add((committer_uri, RDFS.label, Literal(commit.committer.name)))
            g.add((committer_uri, FOAF.mbox, Literal(commit.committer.email)))

            q_committer_uri = BNode()
            g.add((commit_uri, PROV['qualifiedAssociation'], q_committer_uri))
            g.add((q_committer_uri, is_a, PROV['Association']))
            g.add((q_committer_uri, PROV['agent'], author_uri))
            g.add((q_committer_uri, PROV['hadRole'], role_committer_uri))
        else:
            g.add((q_author_uri, PROV['hadRole'], role_committer_uri))

        # Parents
        for parent in iter(commit.parents or []):
            parent_uri = QUIT['commit-' + parent.id]
            g.add((commit_uri, QUIT["preceedingCommit"], parent_uri))
            g.add((commit_uri, PROV["wasInformedBy"], parent_uri))

        # Diff
        parent = next(iter(commit.parents or []), None)

        i2, commitid = self.instance(parent.id, True) if parent else (None, None)

        delta = graphdiff(i2.store if i2 else None, i1.store)
        for index, (iri, changesets) in enumerate(delta.items()):
            update_uri = QUIT['update-{}-{}'.format(commit.id, index)]
            g.add((update_uri, QUIT['graph'], iri))
            g.add((commit_uri, QUIT['updates'], update_uri))
            for (op, triples) in changesets:
                op_uri = QUIT[op + '-' + commit.id]
                g.add((update_uri, QUIT[op], op_uri))
                g.addN((s, p, o, op_uri) for s, p, o in triples)

    # Entities
    if commit.id not in self._graphconfigs:
        self.updateGraphConfig(commit.id)

    map = self._graphconfigs.get(commit.id).getgraphurifilemap()

    for entity in commit.node().entries(recursive=True):
        # todo check if file was changed
        if entity.is_file:
            if entity.name not in map.values():
                continue

            graphUri = self._graphconfigs.get(commit.id).getgraphuriforfile(entity.name)
            blob = (entity.name, entity.oid)

            try:
                f, context = self.getFileReferenceAndContext(blob, commit)
            except KeyError:
                # Cache miss: parse the blob, cache it, and keep the parsed
                # graph as context for the provenance statements below.
                graph = Graph(identifier=graphUri)
                graph.parse(data=entity.content, format='nt')

                self._blobs.set(
                    blob, (FileReference(entity.name, entity.content), graph))
                context = graph

            private_uri = QUIT["graph-{}".format(entity.oid)]

            if (self.config.hasFeature(Feature.Provenance) or
                    self.config.hasFeature(Feature.Persistence)):
                g.add((private_uri, is_a, PROV['Entity']))
                g.add((private_uri, PROV['specializationOf'], context.identifier))
                g.add((private_uri, PROV['wasGeneratedBy'], commit_uri))
                q_usage = BNode()
                g.add((private_uri, PROV['qualifiedGeneration'], q_usage))
                g.add((q_usage, is_a, PROV['Generation']))
                g.add((q_usage, PROV['activity'], commit_uri))

                prev = next(entity.history(), None)
                if prev:
                    prev_uri = QUIT["graph-{}-{}".format(prev.oid, index)]
                    g.add((private_uri, PROV['wasDerivedFrom'], prev_uri))
                    g.add((commit_uri, PROV['used'], prev_uri))
                    q_derivation = BNode()
                    g.add((private_uri, PROV['qualifiedDerivation'], q_derivation))
                    g.add((q_derivation, is_a, PROV['Derivation']))
                    g.add((q_derivation, PROV['entity'], prev_uri))
                    g.add((q_derivation, PROV['hadActivity'], commit_uri))
            if self.config.hasFeature(Feature.Persistence):
                g.addN((s, p, o, private_uri)
                       for s, p, o in context.triples((None, None, None)))
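
# Illustrative sketch only: for a commit with id "abc123" and message "init",
# the provenance block above records roughly the following statements
# (rendered here with assumed quit:/prov:/rdfs: prefix bindings):
#
#   quit:commit-abc123  a prov:Activity ;
#       quit:hex "abc123" ;
#       prov:startedAtTime "..."^^xsd:dateTime ;
#       prov:endedAtTime "..."^^xsd:dateTime ;
#       rdfs:label "init" ;
#       prov:wasAssociatedWith quit:user-<sha1-of-author-email> .
#
# Each changed graph additionally gets an update node
# (quit:update-abc123-<n>) linked via quit:updates and quit:graph, and each
# file blob gets a private graph entity (quit:graph-<blob-oid>) linked via
# prov:specializationOf and prov:wasGeneratedBy.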
def commit(self, graph, delta, message, commit_id, ref, **kwargs):
    def build_message(message, kwargs):
        out = list()
        for k, v in kwargs.items():
            out.append('{}: "{}"'.format(k, v.replace('"', "\\\"")))
        if message:
            out.append('')
            out.append(message)
        return "\n".join(out)

    def _apply(f, changeset, identifier):
        """Update the FileReference of a graph's file according to the changeset."""
        for (op, triples) in changeset:
            if op == 'additions':
                for triple in triples:
                    # the internal _nq serializer appends '\n'
                    line = _nq(triple, identifier).rstrip()
                    f.add(line)
            elif op == 'removals':
                for triple in triples:
                    # the internal _nq serializer appends '\n'
                    line = _nq(triple, identifier).rstrip()
                    f.remove(line)

    if not delta:
        return

    commit = self.repository.revision(commit_id)
    index = self.repository.index(commit.id)

    blobs_new = set()
    try:
        blobs = self.getFilesForCommit(commit)
    except KeyError:
        blobs = []

    for blob in blobs:
        (fileName, oid) = blob
        try:
            file_reference, contexts = self.getFileReferenceAndContext(blob, commit)
            for context in contexts:
                for entry in delta:
                    changeset = entry.get(context.identifier, None)

                    if changeset:
                        _apply(file_reference, changeset, context.identifier)
                        del entry[context.identifier]

            index.add(file_reference.path, file_reference.content)

            self._blobs.remove(blob)
            blob = fileName, index.stash[file_reference.path][0]
            self._blobs.set(blob, (file_reference, contexts))
            blobs_new.add(blob)
        except KeyError:
            pass

    # Changesets that could not be assigned to a known file end up in the
    # global (unassigned) file.
    unassigned = set()
    f_name = self.config.getGlobalFile() or 'unassigned.nq'
    f_new = FileReference(f_name, "")

    for entry in delta:
        for identifier, changeset in entry.items():
            unassigned.add(graph.store.get_context(identifier))
            _apply(f_new, changeset, graph.store.identifier)

    index.add(f_new.path, f_new.content)

    blob = f_name, index.stash[f_new.path][0]
    self._blobs.set(blob, (f_new, unassigned))
    blobs_new.add(blob)

    message = build_message(message, kwargs)
    author = self.repository._repository.default_signature

    oid = index.commit(message, author.name, author.email, ref=ref)

    if self.config.hasFeature(Feature.GarbageCollection):
        self.garbagecollection()

    if oid:
        self._commits.set(oid.hex, blobs_new)
        commit = self.repository.revision(oid.hex)
        if not self.repository.is_bare:
            self.repository._repository.checkout(
                ref, strategy=pygit2.GIT_CHECKOUT_FORCE)
        self.syncSingle(commit, delta)
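
# Illustrative sketch only: the receiving object name, ref, and keyword key are
# made up for this example; the escaping and layout follow build_message above.
# A call such as
#
#   quit.commit(graph, delta, 'initial import', commit_id, 'refs/heads/master',
#               Query='INSERT DATA { <s> <p> "o" }')
#
# produces a commit message of the form
#
#   Query: "INSERT DATA { <s> <p> \"o\" }"
#
#   initial import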