Example #1
0
    def _applyUnknownGraphs(self, delta, known_blobs):
        """Create file references for the graphs in ``delta`` and apply their changes.

        One new, empty ``FileReference`` is created per graph identifier found
        in the delta and every changeset of that graph is applied to it.
        Blank-node identifiers and the identifier ``'default'`` are skipped.

        The file is named ``iri_to_name(identifier) + '.nt'``. If that name is
        already contained in ``known_blobs``, the numbered form
        ``<name>_<n>.nt`` is used instead, where ``n`` is one more than the
        highest number already used by such a name in ``known_blobs``
        (``1`` if there is none).

        Args:
            delta: iterable of entries; ``entry['delta']`` maps a graph
                identifier to its changeset.
            known_blobs: collection of file names that are already taken.

        Returns:
            dict mapping each handled graph identifier to its FileReference.
        """
        new_contexts = {}
        for entry in delta:
            for identifier, changeset in entry['delta'].items():
                if (isinstance(identifier, BNode)
                        or str(identifier) == 'default'):
                    continue  # TODO default graph use case

                if identifier not in new_contexts:
                    baseName = iri_to_name(identifier)
                    fileName = baseName + '.nt'

                    if fileName in known_blobs:
                        # The dot of the extension is escaped so that only a
                        # literal ".nt" counts as a numbered sibling file.
                        numbered = re.compile(
                            re.escape(baseName) + r"_([0-9]+)\.nt")
                        # Start with 0 so the first numbered file gets "_1".
                        taken = [0]
                        for blobName in known_blobs:
                            match = numbered.search(blobName)
                            if match:
                                taken.append(int(match.group(1)))
                        fileName = '{}_{}.nt'.format(baseName,
                                                     max(taken) + 1)

                    new_contexts[identifier] = FileReference(fileName, '')

                applyChangeset(new_contexts[identifier], changeset,
                               identifier)
        return new_contexts
Example #2
0
    def getFileReferenceAndContext(self, blob, commit):
        """Return the ``(FileReference, Graph)`` pair for a blob of a commit.

        ``blob`` is a ``(name, oid)`` tuple and is used as the cache key.
        The graph configuration of the commit is loaded first if it is not
        cached yet. On a blob cache miss the file content is read from
        ``commit``, parsed as N-Triples into a graph named after the graph URI
        configured for the file, and the resulting pair is stored in the cache.
        """
        if commit.id not in self._graphconfigs:
            self.updateGraphConfig(commit.id)

        # Cache hit: nothing has to be read or parsed.
        if blob in self._blobs:
            return self._blobs.get(blob)

        name, _oid = blob
        fileContent = commit.node(path=name).content
        graphConfig = self._graphconfigs.get(commit.id)
        context = Graph(
            identifier=URIRef(graphConfig.getgraphuriforfile(name)))
        context.parse(data=fileContent, format='nt')

        cacheEntry = (FileReference(name, fileContent), context)
        self._blobs.set(blob, cacheEntry)
        return cacheEntry
Example #3
0
    def getFileReferenceAndContext(self, blob, commit):
        """Return the ``(FileReference, contexts)`` pair for a blob of a commit.

        ``blob`` is a ``(name, oid)`` tuple and is used as the cache key.
        On a cache miss the file content is read from ``commit`` and parsed as
        N-Quads. The resulting set of contexts consists of the parsed contexts
        whose identifier is a key of the configured graph-URI/file map, plus
        one graph per graph URI configured for the file. The pair is stored
        in the blob cache before it is returned.
        """
        knownGraphUris = self.config.getgraphurifilemap()

        # Cache hit: nothing has to be read or parsed.
        if blob in self._blobs:
            return self._blobs.get(blob)

        name, _oid = blob
        fileContent = commit.node(path=name).content
        configuredGraphs = {
            Graph(identifier=uri)
            for uri in self.config.getgraphuriforfile(name)
        }

        parsed = ConjunctiveGraph()
        parsed.parse(data=fileContent, format='nquads')
        parsedGraphs = {
            context for context in parsed.contexts(None)
            if context.identifier in knownGraphUris
        }

        cacheEntry = (FileReference(name, fileContent),
                      parsedGraphs | configuredGraphs)
        self._blobs.set(blob, cacheEntry)
        return cacheEntry
Example #4
0
    def changeset(self, commit):
        """Write provenance and/or persistence data of ``commit`` to the store.

        Nothing happens unless the ``Provenance`` or the ``Persistence``
        feature is enabled in the configuration.

        With ``Provenance`` enabled the commit is described as a
        ``prov:Activity``: its id, author/committer timestamps, message,
        optional ``Source`` and ``Query`` commit properties, the author and
        committer agents with their qualified associations, the parent
        commits and the diff against the first parent (one update resource
        per changed graph, with the changed triples stored in a graph named
        after the operation and the commit id).

        For every file of the commit that is listed in the commit's graph
        configuration, a ``prov:Entity`` named ``graph-<oid>`` is described
        and linked to the commit and to the previous version of the file.
        With ``Persistence`` enabled the triples of the file's graph are
        additionally copied into that named graph.

        Args:
            commit: the commit object to describe.

        Returns:
            None.
        """
        provenance = self.config.hasFeature(Feature.Provenance)
        persistence = self.config.hasFeature(Feature.Persistence)
        if not (provenance or persistence):
            return

        g = self.store.store

        if provenance:
            role_author_uri = QUIT['Author']
            role_committer_uri = QUIT['Committer']

            g.add((role_author_uri, is_a, PROV['Role']))
            g.add((role_committer_uri, is_a, PROV['Role']))

        # Create the commit
        i1, _ = self.instance(commit.id, True)

        commit_uri = QUIT['commit-' + commit.id]

        if provenance:
            g.add((commit_uri, is_a, PROV['Activity']))

            if 'Source' in commit.properties:
                g.add((commit_uri, is_a, QUIT['Import']))
                sources = commit.properties['Source'].strip()
                # Every "<...>" enclosed token is recorded as one data source.
                for source in re.findall("<.*?>", sources):
                    g.add((commit_uri, QUIT['dataSource'],
                           URIRef(source.strip("<>"))))
            if 'Query' in commit.properties:
                g.add((commit_uri, is_a, QUIT['Transformation']))
                g.add((commit_uri, QUIT['query'],
                       Literal(commit.properties['Query'].strip())))

            g.add((commit_uri, QUIT['hex'], Literal(commit.id)))
            g.add((commit_uri, PROV['startedAtTime'],
                   Literal(git_timestamp(commit.author.time,
                                         commit.author.offset),
                           datatype=XSD.dateTime)))
            g.add((commit_uri, PROV['endedAtTime'],
                   Literal(git_timestamp(commit.committer.time,
                                         commit.committer.offset),
                           datatype=XSD.dateTime)))
            g.add((commit_uri, RDFS['label'], Literal(commit.message.strip())))

            # Author
            author_hash = pygit2.hash(commit.author.email).hex
            author_uri = QUIT['user-' + author_hash]
            g.add((commit_uri, PROV['wasAssociatedWith'], author_uri))

            g.add((author_uri, is_a, PROV['Agent']))
            g.add((author_uri, RDFS.label, Literal(commit.author.name)))
            g.add((author_uri, FOAF.mbox, Literal(commit.author.email)))

            q_author_uri = BNode()
            g.add((commit_uri, PROV['qualifiedAssociation'], q_author_uri))
            g.add((q_author_uri, is_a, PROV['Association']))
            g.add((q_author_uri, PROV['agent'], author_uri))
            # NOTE(review): the committer branches below use PROV['hadRole'];
            # 'role' is kept here to leave the emitted data unchanged -
            # confirm which property is intended.
            g.add((q_author_uri, PROV['role'], role_author_uri))

            if commit.author.name != commit.committer.name:
                # Committer
                committer_hash = pygit2.hash(commit.committer.email).hex
                committer_uri = QUIT['user-' + committer_hash]
                g.add((commit_uri, PROV['wasAssociatedWith'], committer_uri))

                g.add((committer_uri, is_a, PROV['Agent']))
                g.add((committer_uri, RDFS.label,
                       Literal(commit.committer.name)))
                g.add((committer_uri, FOAF.mbox,
                       Literal(commit.committer.email)))

                q_committer_uri = BNode()
                g.add((commit_uri, PROV['qualifiedAssociation'],
                       q_committer_uri))
                g.add((q_committer_uri, is_a, PROV['Association']))
                # The committer's association refers to the committer agent
                # (it previously pointed at the author by mistake).
                g.add((q_committer_uri, PROV['agent'], committer_uri))
                g.add((q_committer_uri, PROV['hadRole'], role_committer_uri))
            else:
                # Same name for author and committer: the author's
                # association carries the committer role as well.
                g.add((q_author_uri, PROV['hadRole'], role_committer_uri))

            # Parents
            for parent in iter(commit.parents or []):
                parent_uri = QUIT['commit-' + parent.id]
                g.add((commit_uri, QUIT["preceedingCommit"], parent_uri))
                g.add((commit_uri, PROV["wasInformedBy"], parent_uri))

            # Diff (against the first parent only; a commit without parents
            # is compared with nothing)
            parent = next(iter(commit.parents or []), None)

            i2, _ = self.instance(parent.id, True) if parent else (None,
                                                                   None)

            delta = graphdiff(i2.store if i2 else None, i1.store)

            for index, (iri, changesets) in enumerate(delta.items()):
                update_uri = QUIT['update-{}-{}'.format(commit.id, index)]
                g.add((update_uri, QUIT['graph'], iri))
                g.add((commit_uri, QUIT['updates'], update_uri))
                for (op, triples) in changesets:
                    op_uri = QUIT[op + '-' + commit.id]
                    g.add((update_uri, QUIT[op], op_uri))
                    g.addN((s, p, o, op_uri) for s, p, o in triples)

        # Entities
        if commit.id not in self._graphconfigs:
            self.updateGraphConfig(commit.id)

        graphconfig = self._graphconfigs.get(commit.id)
        file_map = graphconfig.getgraphurifilemap()

        for entity in commit.node().entries(recursive=True):
            # todo check if file was changed
            if not entity.is_file:
                continue

            # Only files that are registered as graph files are described.
            if entity.name not in file_map.values():
                continue

            blob = (entity.name, entity.oid)

            try:
                _, context = self.getFileReferenceAndContext(blob, commit)
            except KeyError:
                # Fallback: parse the file directly and cache the result.
                # ``context`` has to be bound here as well, it is used below.
                graphUri = graphconfig.getgraphuriforfile(entity.name)
                context = Graph(identifier=graphUri)
                context.parse(data=entity.content, format='nt')

                self._blobs.set(
                    blob,
                    (FileReference(entity.name, entity.content), context))

            private_uri = QUIT["graph-{}".format(entity.oid)]

            # At least one of the two features is enabled at this point, so
            # the entity description is always written.
            g.add((private_uri, is_a, PROV['Entity']))
            g.add((private_uri, PROV['specializationOf'],
                   context.identifier))
            g.add((private_uri, PROV['wasGeneratedBy'], commit_uri))

            q_usage = BNode()
            g.add((private_uri, PROV['qualifiedGeneration'], q_usage))
            g.add((q_usage, is_a, PROV['Generation']))
            g.add((q_usage, PROV['activity'], commit_uri))

            prev = next(entity.history(), None)
            if prev:
                # Same naming scheme as ``private_uri`` so that the link
                # resolves to the entity of the previous file version.
                prev_uri = QUIT["graph-{}".format(prev.oid)]
                g.add((private_uri, PROV['wasDerivedFrom'], prev_uri))
                g.add((commit_uri, PROV['used'], prev_uri))

                q_derivation = BNode()
                g.add((private_uri, PROV['qualifiedDerivation'],
                       q_derivation))
                g.add((q_derivation, is_a, PROV['Derivation']))
                g.add((q_derivation, PROV['entity'], prev_uri))
                g.add((q_derivation, PROV['hadActivity'], commit_uri))

            if persistence:
                g.addN((s, p, o, private_uri)
                       for s, p, o in context.triples((None, None, None)))
Example #5
0
    def commit(self, graph, delta, message, commit_id, ref, **kwargs):
        """Apply ``delta`` to the files of a commit and create a new commit.

        The changesets in ``delta`` are first applied to the files of
        ``commit_id`` whose contexts they belong to; each applied changeset is
        removed from its ``delta`` entry (the entries are modified in place).
        Whatever is left afterwards is written to the configured global file,
        or to ``unassigned.nq`` if none is configured. The staged files are
        committed to ``ref`` with the repository's default signature, the
        blob and commit caches are updated, a non-bare repository is checked
        out and ``syncSingle`` is called with the new commit.

        Args:
            graph: object whose ``store`` provides ``get_context`` and
                ``identifier`` for the unassigned changes.
            delta: list of dict-like entries mapping a graph identifier to a
                changeset, i.e. an iterable of ``(op, triples)`` pairs with
                ``op`` being ``'additions'`` or ``'removals'``. If empty,
                nothing is done.
            message: commit message; appended after the properties.
            commit_id: id of the commit the changes are based on.
            ref: reference that the new commit is written to.
            **kwargs: commit properties with string values, written as
                ``key: "value"`` lines in front of the message.

        Returns:
            None.
        """
        def build_message(message, kwargs):
            """Join the properties and the message into the commit message."""
            out = [
                '{}: "{}"'.format(k, v.replace('"', "\\\""))
                for k, v in kwargs.items()
            ]
            if message:
                out.append('')
                out.append(message)
            return "\n".join(out)

        def _apply(f, changeset, identifier):
            """Update the FileReference (graph uri) of a file with help of the changeset."""
            for (op, triples) in changeset:
                if op == 'additions':
                    update = f.add
                elif op == 'removals':
                    update = f.remove
                else:
                    continue
                for triple in triples:
                    # the internal _nq serializer appends '\n'
                    update(_nq(triple, identifier).rstrip())

        if not delta:
            return

        commit = self.repository.revision(commit_id)
        index = self.repository.index(commit.id)

        blobs_new = set()
        try:
            blobs = self.getFilesForCommit(commit)
        except KeyError:
            blobs = []

        for blob in blobs:
            (fileName, oid) = blob
            try:
                file_reference, contexts = self.getFileReferenceAndContext(
                    blob, commit)
                for context in contexts:
                    for entry in delta:
                        changeset = entry.get(context.identifier, None)

                        if changeset:
                            _apply(file_reference, changeset,
                                   context.identifier)
                            # Mark the changeset as handled so that it is not
                            # written to the unassigned file below.
                            del (entry[context.identifier])

                index.add(file_reference.path, file_reference.content)

                # The content changed, so the blob is cached under its new id.
                self._blobs.remove(blob)
                blob = fileName, index.stash[file_reference.path][0]
                self._blobs.set(blob, (file_reference, contexts))
                blobs_new.add(blob)
            except KeyError:
                # Best effort: a blob that cannot be resolved is skipped.
                pass

        unassigned = set()
        f_name = self.config.getGlobalFile() or 'unassigned.nq'
        f_new = FileReference(f_name, "")
        for entry in delta:
            for identifier, changeset in entry.items():
                unassigned.add(graph.store.get_context(identifier))
                # NOTE(review): the quads are written with the store's
                # identifier rather than ``identifier`` - confirm intended.
                _apply(f_new, changeset, graph.store.identifier)

        if unassigned:
            # Stage the file once, after all changesets were applied. Staging
            # it per changeset registered blob ids of intermediate contents
            # that never become part of the commit.
            index.add(f_new.path, f_new.content)

            blob = f_name, index.stash[f_new.path][0]
            self._blobs.set(blob, (f_new, unassigned))
            blobs_new.add(blob)

        message = build_message(message, kwargs)
        author = self.repository._repository.default_signature

        oid = index.commit(message, author.name, author.email, ref=ref)

        if self.config.hasFeature(Feature.GarbageCollection):
            self.garbagecollection()

        if oid:
            self._commits.set(oid.hex, blobs_new)
            commit = self.repository.revision(oid.hex)
            if not self.repository.is_bare:
                self.repository._repository.checkout(
                    ref, strategy=pygit2.GIT_CHECKOUT_FORCE)
            self.syncSingle(commit, delta)