Ejemplo n.º 1
0
    def test_ibnode(self):
        # Build an IdentityBNode (debug=True) for self.graph1 and self.graph2
        # and require their named, connected and free identities, and finally
        # the overall identity, to match.
        #
        # When the free identities differ, the differing unnamed subgraphs are
        # printed, an interactive embed() session is opened, and the original
        # AssertionError is re-raised.
        #
        # Local names (and the sbs/ds helpers) are left as they are because
        # embed() exposes the locals of this frame to the interactive session.

        def sbs(l1, l2):
            # side by side: head and tail (5 items each) of every paired entry
            for left, right in zip(l1, l2):
                print('', left[:5], left[-5:], '\n',
                      right[:5], right[-5:], '\n\n')

        def ds(d1, d2):
            # For item pairs (after sorting) whose keys differ, print the
            # paired value tuples with bytes members clipped to 5 bytes.
            def clip(row):
                return tuple(e[:5] if type(e) == bytes else e for e in row)

            paired_items = sorted(zip(sorted(d1.items()), sorted(d2.items())))
            for (k1, v1), (k2, v2) in paired_items:
                if k1 == k2:
                    continue

                # TODO len t1 != len t2
                for t1, t2 in sorted(zip(sorted(v1), sorted(v2))):
                    print(clip(t1))
                    print(clip(t2))
                    print()

        id1 = IdentityBNode(self.graph1, debug=True)
        id2 = IdentityBNode(self.graph2, debug=True)

        idni1 = sorted(id1.named_identities)
        idni2 = sorted(id2.named_identities)
        assert idni1 == idni2, 'named identities do not match'

        idli1 = sorted(id1.connected_identities)
        idli2 = sorted(id2.connected_identities)
        assert idli1 == idli2, 'linked identities do not match'

        idfi1 = sorted(id1.free_identities)
        idfi2 = sorted(id2.free_identities)
        try:
            assert idfi1 == idfi2, 'free identities do not match'
        except AssertionError as e:
            # show the first 10 items of each paired free identity
            for pair in zip(idfi1, idfi2):
                for ident in pair:
                    print(ident[:10])

                print()

            # reverse lookups: subgraph identity -> subject
            lu1 = {v:k for k, v in id1.unnamed_subgraph_identities.items()}
            lu2 = {v:k for k, v in id2.unnamed_subgraph_identities.items()}
            s1 = set(id1.unnamed_subgraph_identities.values())
            s2 = set(id2.unnamed_subgraph_identities.values())
            diff = (s1 | s2) - (s1 & s2)
            for d in diff:
                # identities present on exactly one side; report which one
                if d in lu1:
                    label, lookup, source = 'id1 extra', lu1, id1
                else:
                    label, lookup, source = 'id2 extra', lu2, id2

                s = lookup[d]
                p, o = next(source._thing[s])
                print(label)
                extra = sorted(yield_recursive(s, p, o, source._thing),
                               key=lambda triple: triple[::-1])
                for t in extra:
                    print(t)

            assert len(set(idfi1)) == len(idfi1), 'HRM 1'
            assert len(set(idfi2)) == len(idfi2), 'HRM 2'
            print(len(idfi1), len(idfi2))  # wow... terrifying that these don't match
            print(e)
            embed()
            raise e

        assert id1.identity == id2.identity, 'identities do not match'
Ejemplo n.º 2
0
 def _old(self):
     """Yield triples for every id in ``ids | more_ids`` and record dead ids.

     For each id (in sorted order) the predicate/object pairs of the
     corresponding subject in ``g`` are looked up:

     * No pairs found: the id is searched for as an
       ``oboInOwl.hasAlternativeId`` literal. If a subject carries it, the
       first such subject is treated as the replacement, its triples are
       yielded when it is not already in ``ids``, and the dead id is
       recorded in ``chebi_dead``. If none carries it and the id is not
       listed in ``depwor``, ``BaseException`` is raised.
     * Pairs found: every pair is expanded through ``yield_recursive``.
       For ``replacedBy`` pairs the subject is additionally recorded in
       ``chebi_dead`` as deprecated and given the labels of its
       replacement.

     NOTE(review): ``ids``, ``more_ids``, ``g`` and ``depwor`` are not
     bound anywhere in this method; they must come from an enclosing or
     module scope that is not visible here -- confirm before calling.
     """
     # the implementation below is a much slower equivalent that does needless checks
     # better simply to validate that there are no chebi ids that are missing an owl:Class
     #for id_ in sorted(set(ids_raw) | set((ug.g.namespace_manager.qname(_) for _ in mids))):
     print('more_ids', more_ids)
     for eid in sorted(ids | more_ids):
         #print(repr(eid))
         id_ = self._graph.qname(eid)
         s = rdflib.URIRef(eid)
         # materialized so it can be both truth-tested and iterated below
         po = list(g.predicate_objects(s))
         if not po:
             print(s, 'not in xml')
             # looks for the id_ as a literal
             alts = list(
                 g.subjects(oboInOwl.hasAlternativeId,
                            rdflib.Literal(id_,
                                           datatype=rdflib.XSD.string)))
             if alts:
                 # only the first subject carrying the alternative id is used
                 replaced_by = alts[0]
                 if replaced_by.toPython(
                 ) not in ids:  # we need to add any replacement classes to the bridge
                     print('REPLACED BY NEW CLASS', id_)
                     for p, o in g.predicate_objects(replaced_by):
                         yield from yield_recursive(replaced_by, p, o, g)
                 # record the dead id as a deprecated class pointing at its replacement
                 chebi_dead.addTrip(s, rdf.type, owl.Class)
                 chebi_dead.addTrip(s, replacedBy, replaced_by)
                 chebi_dead.addTrip(s, owl.deprecated, Literal(True))
             else:
                 # ids listed in depwor are tolerated; anything else is fatal
                 if self._graph.qname(eid) not in depwor:
                     raise BaseException('wtf error',
                                         self._graph.qname(eid))
         else:
             for p, o in po:
                 yield from yield_recursive(s, p, o, g)
                 if p == replacedBy:
                     chebi_dead.addTrip(s, rdfs.subClassOf,
                                        owl.DeprecatedClass)
                     oqname = self._graph.qname(o)
                     if (o, rdf.type, owl.Class) not in g:
                         print(
                             'WARNING: replaced but not in the xml subset',
                             o)
                     elif oqname not in ids and str(o) not in more_ids:
                         print(
                             'WARNING: replaced but not in ids or more_ids',
                             o)
                         # the replacement is unknown to ids/more_ids, so
                         # its triples are emitted here as well
                         for np, no in g[o]:
                             yield from yield_recursive(o, np, no, g)
                     # copy the replacement's labels onto the dead subject
                     for ro in g[o:rdfs.label]:
                         chebi_dead.addTrip(s, rdfs.label, ro)
Ejemplo n.º 3
0
    def _triples(self):
        """Yield this ontology's triples: date, provenance, then graph content.

        Yields, in order:

        1. ``(self.iri, oboInOwl + 'date', <date of the source ontology>)``.
        2. A ``prov:qualifiedDerivation`` structure built on three fresh
           blank nodes: a Derivation (entity = first member of
           ``self.wasDerivedFrom``), its Activity (start time
           ``self.start_time``, used ``self.wasGeneratedBy``) and the
           associated SoftwareAgent carrying the running Python version.
        3. Every triple produced by ``yield_recursive`` for each
           ``owl:ObjectProperty`` in the source graph other than
           ``hasRole`` and ``hasPart``.
        4. Every triple of the source graph whose subject is not the
           source ontology IRI, ``hasPart`` or ``hasRole``.

        Side effect: for every ``replacedBy`` triple seen in step 4,
        ``chebi_dead`` receives a ``subClassOf owl:DeprecatedClass`` triple
        for the subject, plus the labels and exact synonyms found on the
        replacement.
        """
        # Only the graph is used; the other members are still unpacked so a
        # malformed ``self.sources`` fails here exactly as before.
        (_ids_raw, _ids), (_more, _more_ids, g) = self.sources

        # NOTE: if either lookup below is empty, ``next`` raises
        # StopIteration, which inside a generator surfaces to the caller as
        # RuntimeError (PEP 479).
        chebiiri = next(g[:rdf.type:owl.Ontology])
        oiodate = rdflib.URIRef(
            str(oboInOwl) + 'date')  # this predicate doesn't actually exist...
        chebidate = next(g[chebiiri:oiodate])
        yield self.iri, oiodate, chebidate

        # wow prov is extremely heavy weight ...
        b0, b1, b2 = rdflib.BNode(), rdflib.BNode(), rdflib.BNode()
        e1, _e2 = self.wasDerivedFrom
        yield self.iri, prov.qualifiedDerivation, b0
        yield b0, rdf.type, prov.Derivation
        yield b0, prov.entity, e1
        yield b0, prov.hadActivity, b1
        yield b1, rdf.type, prov.Activity
        yield b1, prov.startedAtTime, rdflib.Literal(self.start_time)
        yield b1, prov.used, rdflib.URIRef(self.wasGeneratedBy)
        #yield b1, prov.generated, self.versionIRI
        # the fact that it isn't really possible to include a versionIRI
        # reveals that trying to include bound prov data rapidly encounters
        # significant issues
        yield b1, prov.wasAssociatedWith, b2
        yield b2, rdf.type, prov.SoftwareAgent
        yield b2, ilxtr.implementationOf, ilxtr['ProgrammingLanguage/Python']
        yield b2, ilxtr.versionString, rdflib.Literal(
            sys.version.replace('\n', ' '))
        # NOTE: b1 doesn't quite work if we want endedAtTime
        # because we don't actually know when the process will
        # end IF we consider the end of the process to be the
        # actual serialization of the file, in which case the
        # prov has to be external to the file itself, or the
        # last serialization cannot have bound provenance
        # or some real identifier has to be used to allow
        # a reference to the activity so that an end time can
        # be logged in another system

        # Object properties, except hasRole/hasPart, expanded recursively.
        # The exclusion strings are computed once instead of once per property.
        excluded_properties = (str(hasRole), str(hasPart))
        for prop in g[:rdf.type:owl.ObjectProperty]:
            if str(prop) in excluded_properties:
                continue
            for p, o in g[prop]:
                for ss, ps, so in yield_recursive(prop, p, o, g):
                    yield ss, ps, so

        # Loop invariants hoisted out of the per-triple loop.
        has_part_iri = rdflib.URIRef(hasPart)
        has_role_iri = rdflib.URIRef(hasRole)
        dead_predicates = (rdfs.label, oboInOwl.hasExactSynonym)

        for s, p, o in g:
            if s == chebiiri or s == has_part_iri or s == has_role_iri:
                continue

            if p == replacedBy:
                # the subject is dead: mark it deprecated and carry over the
                # replacement's labels and synonyms so it remains findable
                chebi_dead.addTrip(s, rdfs.subClassOf, owl.DeprecatedClass)
                for dead_predicate in dead_predicates:
                    for dead_object in g[o:dead_predicate]:
                        chebi_dead.addTrip(s, dead_predicate, dead_object)

            yield s, p, o
Ejemplo n.º 4
0
    def _triples(self):
        """Yield this ontology's triples: derivation record, then graph content.

        Yields, in order:

        1. A ``prov:qualifiedDerivation`` from ``self.iri`` to a fresh blank
           node typed ``prov:Derivation`` whose ``prov:atTime`` is the date
           recorded on the source ontology.
        2. Every triple produced by ``yield_recursive`` for each
           ``owl:ObjectProperty`` in the source graph other than
           ``hasRole`` and ``hasPart``.
        3. Every triple of the source graph whose subject is not the
           source ontology IRI, ``hasPart`` or ``hasRole``.

        Side effect: for every ``replacedBy`` triple seen in step 3,
        ``chebi_dead`` receives a ``subClassOf owl:DeprecatedClass`` triple
        for the subject, plus the labels and exact synonyms found on the
        replacement.
        """
        # Only the graph is used; the other members are still unpacked so a
        # malformed ``self.sources`` fails here exactly as before.
        (_ids_raw, _ids), (_more, _more_ids, g) = self.sources

        # NOTE: if either lookup below is empty, ``next`` raises
        # StopIteration, which inside a generator surfaces to the caller as
        # RuntimeError (PEP 479).
        chebiiri = next(g[:rdf.type:owl.Ontology])
        oiodate = rdflib.URIRef(
            str(oboInOwl) + 'date')  # this predicate doesn't actually exist...
        chebidate = next(g[chebiiri:oiodate])

        b0 = rdflib.BNode()
        yield self.iri, prov.qualifiedDerivation, b0
        yield b0, rdf.type, prov.Derivation
        yield b0, prov.atTime, chebidate

        # Object properties, except hasRole/hasPart, expanded recursively.
        # The exclusion strings are computed once instead of once per property.
        excluded_properties = (str(hasRole), str(hasPart))
        for prop in g[:rdf.type:owl.ObjectProperty]:
            if str(prop) in excluded_properties:
                continue
            for p, o in g[prop]:
                for ss, ps, so in yield_recursive(prop, p, o, g):
                    yield ss, ps, so

        # Loop invariants hoisted out of the per-triple loop.
        has_part_iri = rdflib.URIRef(hasPart)
        has_role_iri = rdflib.URIRef(hasRole)
        dead_predicates = (rdfs.label, oboInOwl.hasExactSynonym)

        for s, p, o in g:
            if s == chebiiri or s == has_part_iri or s == has_role_iri:
                continue

            if p == replacedBy:
                # the subject is dead: mark it deprecated and carry over the
                # replacement's labels and synonyms so it remains findable
                chebi_dead.addTrip(s, rdfs.subClassOf, owl.DeprecatedClass)
                for dead_predicate in dead_predicates:
                    for dead_object in g[o:dead_predicate]:
                        chebi_dead.addTrip(s, dead_predicate, dead_object)

            yield s, p, o

        # An older per-id implementation used to follow an unconditional
        # ``return`` at this point and could never run. It was a much slower
        # equivalent that did needless checks; it is better simply to validate
        # that there are no chebi ids that are missing an owl:Class.