Code example #1
from rdflib import ConjunctiveGraph


def verify_rdf(rdf_output):
    # parse the serialized Turtle and check the expected graph shape
    g = ConjunctiveGraph()
    g.parse(data=rdf_output, format="turtle")
    assert len(g) == 6                    # six triples in total
    assert len(set(g.subjects())) == 2    # two distinct subjects
    assert len(set(g.predicates())) == 3  # three distinct predicates
    assert len(set(g.objects())) == 6     # six distinct objects
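A minimal input that satisfies these assertions needs two subjects, three shared predicates, and six distinct objects; a hypothetical sketch (the ex: namespace is an assumption):

sample = """
@prefix ex: <http://example.org/> .
ex:a ex:p1 "1" ; ex:p2 "2" ; ex:p3 "3" .
ex:b ex:p1 "4" ; ex:p2 "5" ; ex:p3 "6" .
"""
verify_rdf(sample)  # 6 triples, 2 subjects, 3 predicates, 6 objects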
Code example #2
from rdflib import ConjunctiveGraph, Literal
from rdflib.namespace import XSD
from pycsvw import CSVW  # assumed import; the CSVW converter under test


def test_null_values_with_multiple_strings():
    csvw = CSVW(csv_path="tests/null1.csv",
                metadata_path="tests/null1.multiple.csv-metadata.json")
    rdf_contents = csvw.to_rdf()
    g = ConjunctiveGraph()
    g.parse(data=rdf_contents, format="turtle")

    all_objects = set(g.objects())

    # none of the declared null strings may surface as literal objects
    assert Literal('null_key', datatype=XSD.token) not in all_objects
    assert Literal('null_sector') not in all_objects
    assert Literal('null_id', datatype=XSD.token) not in all_objects
    for row_id in ['10', '11', '12', '13']:
        assert Literal(row_id, datatype=XSD.token) not in all_objects

    # id_uri is assumed to be defined at module level in the original tests
    all_preds = set(g.predicates())
    assert id_uri not in all_preds

    assert Literal('1', datatype=XSD.token) not in all_objects
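The metadata file itself is not shown; the behaviour under test comes from CSVW's "null" annotation, which may list several strings per column. A hypothetical sketch of the relevant column description (all names are assumptions):

# Possible shape of a column entry in null1.multiple.csv-metadata.json:
# any cell equal to one of the "null" strings produces no triple at all.
column = {
    "titles": "sector",
    "datatype": "token",
    "null": ["null_sector", "null_key"],  # multiple null strings
}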
Code example #3
File: swap_primer.py Project: mobilemadman2/rdflib-1
    # or:
    primer.add((myNS['pat'], myNS['age'], Literal(24)))

    # Now, with just that, let's see how the system
    # recorded *way* too many details about what
    # you just asserted as fact.
    #

    from pprint import pprint
    pprint(list(primer))

    # just think .whatever((s, p, o))
    # here we report on what we know

    pprint(list(primer.subjects()))
    pprint(list(primer.predicates()))
    pprint(list(primer.objects()))

    # and other things that make sense

    # what do we know about pat?
    pprint(list(primer.predicate_objects(myNS.pat)))

    # who is what age?
    pprint(list(primer.subject_objects(myNS.age)))

    # Okay, so let's now work with a bigger
    # dataset from the example, and start
    # with a fresh new graph.

    primer = ConjunctiveGraph()
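The primer snippet above is excerpted mid-script, so myNS and the initial primer were bound earlier in the file. A self-contained sketch of the pair-returning accessors (the namespace URI is an assumption):

from rdflib import ConjunctiveGraph, Literal, Namespace

myNS = Namespace('http://example.org/#')  # assumed URI
primer = ConjunctiveGraph()
primer.add((myNS['pat'], myNS['age'], Literal(24)))

# predicate_objects(s) yields (p, o) pairs; subject_objects(p) yields (s, o) pairs
print(list(primer.predicate_objects(myNS['pat'])))  # [(myNS['age'], Literal(24))]
print(list(primer.subject_objects(myNS['age'])))    # [(myNS['pat'], Literal(24))]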
Code example #4
import operator

from rdflib import ConjunctiveGraph

# remove_rel_triples, remove_ent_triples, schema_relations, get_merged_dataframe,
# get_unique_entities and update_amberg_ontology are project-level helpers
# imported elsewhere in the original module.


class PreProcessor(object):
    def __init__(self, kg_path):
        self.kg_path = kg_path
        self.ent_dict = dict()
        self.rel_dict = dict()
        self.g = ConjunctiveGraph()
        self.unique_msgs = self.ent_dict.copy()
        self.merged = None  # only populated when amberg_params is supplied

    def load_knowledge_graph(self,
                             format='xml',
                             exclude_rels=(),
                             clean_schema=True,
                             amberg_params=None,
                             excluded_entities=None):
        self.g.load(self.kg_path, format=format)  # Graph.load wraps parse in rdflib < 6
        # remove triples with excluded relation
        remove_rel_triples(self.g, exclude_rels)
        # remove triples with relations between class-level constructs
        if clean_schema:
            remove_rel_triples(self.g, schema_relations)
        if excluded_entities is not None:
            remove_ent_triples(self.g, excluded_entities)
        if amberg_params:
            path_to_events = amberg_params[0]
            max_events = amberg_params[1]
            self.merged = get_merged_dataframe(path_to_events, max_events)
            self.unique_msgs, unique_vars, unique_mods, unique_fes = get_unique_entities(
                self.merged)
            update_amberg_ontology(self.g, self.ent_dict, self.unique_msgs,
                                   unique_mods, unique_fes, unique_vars,
                                   self.merged)

        self.update_entity_relation_dictionaries()

    def update_entity_relation_dictionaries(self):
        """
        Update the entity and relation dictionaries from the current graph,
        assigning fresh integer ids that do not collide with ids fixed earlier.
        """
        ent_counter = 0
        fixed_ids = set(self.ent_dict.values())
        # sorting ensures equal random splits on equal seeds
        for h in sorted(
                set(self.g.subjects(None, None)).union(
                    set(self.g.objects(None, None)))):
            uni_h = str(h)
            if uni_h not in self.ent_dict:
                while ent_counter in fixed_ids:
                    ent_counter += 1
                self.ent_dict[uni_h] = ent_counter
                ent_counter += 1
        # add new relations to dict
        for r in sorted(set(self.g.predicates(None, None))):
            uni_r = str(r)
            if uni_r not in self.rel_dict:
                self.rel_dict[uni_r] = len(self.rel_dict)

    def load_unique_msgs_from_txt(self, path, max_events=None):
        """
        Load entity ids from a csv text file with two columns: name, id.
        Optionally truncate to the max_events entities with the lowest ids
        and return the names of the excluded ones.
        """
        with open(path, "r") as f:
            for line in f:
                split = line.split(',')
                try:
                    emb_id = int(split[1].strip())
                except (IndexError, ValueError):
                    print("Error reading id of {0} in given dictionary".format(
                        line))
                    # skip this event entity; treat it as a common entity later on
                    continue
                self.ent_dict[split[0]] = emb_id
        # sort ascending w.r.t. embedding id, in case of later stripping
        # self.ent_dict = sorted(self.ent_dict.items(), key=operator.itemgetter(1), reverse=False)
        self.unique_msgs = self.ent_dict.copy()
        if max_events is not None:
            all_msgs = sorted(self.unique_msgs.items(),
                              key=operator.itemgetter(1),
                              reverse=False)
            self.unique_msgs = dict(all_msgs[:max_events])
            excluded_events = dict(all_msgs[max_events:]).keys()
            return excluded_events

    def prepare_sequences(self, path_to_input, use_dict=True):
        """
        Read comma-separated id sequences from path_to_input and return them
        as lists of ints; if use_dict is set, keep only ids that occur in the
        known unique messages.
        """
        print("Preparing sequential data...")
        known_ids = set(self.unique_msgs.values())
        with open(path_to_input, "r") as f:
            result = []
            for line in f:
                entities = line.split(',')
                if use_dict:
                    result.append([
                        int(e.strip()) for e in entities
                        if int(e.strip()) in known_ids
                    ])
                else:
                    result.append([int(e.strip()) for e in entities])
        print("Processed {0} sequences".format(len(result)))
        return result

    def get_vocab_size(self):
        return len(self.unique_msgs)

    def get_ent_dict(self):
        return self.ent_dict

    def get_rel_dict(self):
        return self.rel_dict

    def get_kg(self):
        return self.g

    def get_unique_msgs(self):
        return self.unique_msgs

    def get_merged(self):
        return self.merged
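A hedged usage sketch of PreProcessor; the file paths are assumptions, and the project helpers noted above must be importable:

pp = PreProcessor("data/kg.xml")  # assumed path to an RDF/XML knowledge graph
pp.load_knowledge_graph(format='xml', clean_schema=True)
print("entities:", len(pp.get_ent_dict()))
print("relations:", len(pp.get_rel_dict()))
sequences = pp.prepare_sequences("data/sequences.csv", use_dict=False)  # assumed path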
Code example #5
def parse_workflow():
    # FIXME TODO these states should probably be compiled down to numbers???
    # (gt, OntCuries, makeGraph, wf, rdf, rdfs, owl, cmb, OntId, RRIDCUR, tc,
    #  write and unlist come from the surrounding module's imports)
    docs = Path(__file__).parent.absolute().resolve().parent / 'docs'
    rridpath = docs / 'workflow-rrid.graphml'
    paperpath = docs / 'workflow-paper-id.graphml'

    cgraph = ConjunctiveGraph()
    gt.WorkflowMapping(rridpath.as_posix()).graph(cgraph)
    gt.PaperIdMapping(paperpath.as_posix(), False).graph(cgraph)
    write(cgraph, '/tmp/workflow.ttl')
    predicates = set(cgraph.predicates())
    OntCuries({cp:str(ip) for cp, ip in cgraph.namespaces()})
    OntCuries({'RRID': 'https://scicrunch.org/resolver/RRID:',
               'DOI': 'https://doi.org/',
               'PMID': 'https://www.ncbi.nlm.nih.gov/pubmed/'})
    hg = makeGraph('', graph=cgraph)
    short = sorted(hg.qname(_) for _ in predicates)

    # no-op attribute accesses; these workflow predicates are used below
    wf.hasTag
    wf.hasReplyTag
    wf.hasTagOrReplyTag
    wf.hasOutputTag

    #if type isa wf.tag

    tag_types = set(cgraph.transitive_subjects(rdfs.subClassOf, wf.tag))
    tag_tokens = {tagType:sorted(set(t for t in cgraph.transitive_subjects(rdf.type, tagType)
                                     if t != tagType))
                  for tagType in tag_types}
    has_tag_types = set(cgraph.transitive_subjects(rdfs.subPropertyOf, wf.hasTagOrReplyTag))
    has_tag_types.add(wf.hasOutputTag)
    has_next_action_types = set(cgraph.transitive_subjects(rdfs.subPropertyOf, wf.hasOutput))
    has_next_action_types.add(wf.hasNextStep)
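    # has_tag_types / has_next_action_types gather every subproperty so the
    # scans below can ask "does this node have any outgoing edge of that kind?"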

    terminals = sorted(tag
                       for ttype in tag_types
                       if ttype != wf.tagScibot  # scibot is not 'terminal' for this part
                       for tag in cgraph[:rdf.type:ttype]
                       if not isinstance(tag, BNode)
                       and not any(o for httype in has_tag_types
                                   for o in cgraph[tag:httype]))

    endpoints = sorted(endpoint
                       for endpoint in cgraph[:rdf.type:wf.state]
                       if not isinstance(endpoint, BNode)
                       and not any(o for hnatype in has_next_action_types
                                   for o in cgraph[endpoint:hnatype]))

    complicated = sorted(a_given_tag
                         for tt in tag_types
                         for a_given_tag in cgraph[:rdf.type:tt]
                         if not isinstance(a_given_tag, BNode)
                         and not [successor_tag
                                  for htt in has_tag_types
                                  for successor_tag in cgraph[a_given_tag:htt]
                                  # successors hidden behind owl.oneOf lists are
                                  # not needed for terminals; they will be later
                                  ])

    def topList(node, g):
        # step back one link along an RDF list: subjects whose rdf:rest is node
        for s in g[:rdf.rest:node]:
            yield s

    def getLists(node, g):
        # yield the head cons cell of every RDF list that contains node
        for linker in g[:rdf.first:node]:
            top = None
            for top in g.transitiveClosure(topList, linker):
                pass

            if top:
                yield top
            else:
                yield linker

    def getIsTagOf(node, g):
        for htt in has_tag_types:
            for parent_tag in g[:htt:node]:
                yield parent_tag

    def getIsOneOfTagOf(node, g):
        for list_top in getLists(node, g):
            for linker in g[:owl.oneOf:list_top]:
                for parent_tag, _ in g[::linker]:
                    yield parent_tag

    def getPreviousTag(node, g):  # not quite what we need
        yield from getIsOneOfTagOf(node, g)
        yield from getIsTagOf(node, g)

    def getTagChains(node, g, seen=tuple()):
        # seen prevents recursion when tagging can occur
        # in either order e.g. PMID -> DOI
        #print(tc.red(repr(OntId(node))))  # tc.red(OntId(node)) does weird stuff O_o
        parent_tag = None
        out = None  # guard: out was referenced before assignment below
        for parent_tag in chain(getIsOneOfTagOf(node, g),
                                getIsTagOf(node, g)):
            if parent_tag in seen:
                parent_tag = None
                continue
            ptt = next(g[parent_tag:rdf.type])
            #if ptt in tag_types:
            for pchain in getTagChains(parent_tag, g, seen + (node,)):
                if ptt in tag_types:
                    out = parent_tag, *pchain
                else:
                    out = pchain
                yield out

            if not ptt and not out:
                parent_tag = None

        if not parent_tag:
            yield tuple()

    def getInitiatesAction(node, g):
        for action in g[:wf.initiatesAction:node]:
            yield action

    def getIsOneOfOutputOf(node, g):
        for list_top in getLists(node, g):
            for linker in g[:owl.oneOf:list_top]:
                for hot in has_next_action_types:
                    for parent_thing in g[:hot:linker]:
                        yield parent_thing

    def getActionChains(node, g):
        parent_action = None
        for parent_action in chain(getIsOneOfOutputOf(node, g),  # works for actions too
                                   getInitiatesAction(node, g)):
            for pchain in getActionChains(parent_action, g):  # NOTE may also be a tag...
                out = parent_action, *pchain
                #print(tuple(hg.qname(o) for o in out))
                yield out

        if not parent_action:
            yield tuple()

    def getRestSubjects(predicate, object, g):
        """ invert restriction """
        rsco = cmb.Restriction(rdfs.subClassOf)
        for rt in rsco.parse(graph=g):
            if rt.p == predicate and rt.o == object:
                yield from g.transitive_subjects(rdfs.subClassOf, rt.s)

    annoParts = list(getRestSubjects(wf.isAttachedTo, wf.annotation, cgraph))
    partInstances = {OntId(a):set(t if isinstance(t, BNode) else OntId(t)
                                  for t in cgraph.transitive_subjects(rdf.type, a)
                                  if not isinstance(t, BNode) and t != a)
                     for a in annoParts}

    _endpoint_chains = {OntId(endpoint):[[OntId(endpoint)] + [OntId(e) for e in chain]
                                            for chain in getActionChains(endpoint, cgraph)]
                        for endpoint in endpoints}

    #print([hg.qname(e) for e in endpoints])
    #print([print([hg.qname(c) for c in getActionChains(endpoint, cgraph) if c])
           #for endpoint in endpoints
           #if endpoint])

    #_ = [print(list(getActionChains(e, cgraph)) for e in endpoints)]
    #return

    wat = cgraph.transitiveClosure(getPreviousTag, RRIDCUR.Duplicate)
    wat = list(wat)
    #def invOneOf(tag, g):

    fake_chains = {hg.qname(terminal):
                   [hg.qname(c)
                    for c in cgraph.transitiveClosure(getPreviousTag, terminal)]
                   for terminal in terminals}

    def make_chains(things, getChains):
        return {OntId(thing):[[OntId(thing)] + [OntId(e) for e in chain]
                              for chain in getChains(thing, cgraph)]
                for thing in things
                #if not print(thing)
        }

    def print_chains(thing_chains):
        print('\nstart from beginning')

        print('\n'.join(sorted(' -> '.join(hg.qname(e) for e in reversed(chain))
                               for chains in thing_chains.values()
                               for chain in chains)))

        print('\nstart from end')

        print('\n'.join(sorted(' <- '.join(e.curie for e in chain)
                               for chains in thing_chains.values()
                               for chain in chains)))

    def valid_tagsets(all_chains):
        # not the most efficient way to do this ...
        transitions = defaultdict(set)
        for end, chains in all_chains.items():
            for chain in chains:
                valid = set()
                prior_state = None
                for element in reversed(chain):
                    valid.add(element)
                    state = frozenset(valid)
                    transitions[prior_state].add(state)
                    prior_state = state

        return {s:frozenset(n) for s, n in transitions.items()}

    endpoint_chains = make_chains(endpoints, getActionChains)
    #endpoint_transitions = valid_transitions(endpoint_chains)  # not the right structure
    print_chains(endpoint_chains)
    terminal_chains = make_chains(terminals, getTagChains)
    print_chains(terminal_chains)
    tag_transitions = valid_tagsets(terminal_chains)
    terminal_tags_to_endpoints = 'TODO'

    def printq(*things):
        print(*(OntId(t).curie for t in things))

    from pprint import pprint
    def get_linkers(s, o, g, linkerFunc):  # FIXME not right
        for p in g[s::o]:
            yield p

        for l in linkerFunc(o, g):
            #print(tc.blue(f'{OntId(s).curie} {l if isinstance(l, BNode) else OntId(l).curie}'))
            for p in g[s::l]:
                #print(tc.red(f'{s} {l} {o} {p}'))
                yield p
        return
        # unreachable alternative implementation, kept disabled
        linkers = set(l for l in g.transitiveClosure(linkerFunc, o))
        for p, o in g[s::]:
            if o in linkers:
                yield p

    def edge_to_symbol(p, rev=False):
        if p == wf.initiatesAction:
            return '<<' if rev else '>>'
        elif p == wf.hasReplyTag:
            return '<' if rev else '>'
        elif p == wf.hasTagOrReplyTag:
            return '<=' if rev else '=>'
        elif p == wf.hasOutputTag:
            return '-<-' if rev else '->-'
        else:
            return '<??' if rev else '??>'

    def chain_to_typed_chain(chain, g, func):
        # duh...
        #pprint(chain)
        for s, o in zip(chain, chain[1:]):
            # TODO deal with reversed case
            s, o = s.u, o.u
            p = None
            #print(s, o)
            printq(s, o)
            for p in get_linkers(s, o, g, func):
                #print(tc.yellow(p))
                #yield (s, edge_to_symbol(p), o)
                yield from (s, edge_to_symbol(p), o)

            if not p:
                for rp in get_linkers(o, s, g, func):
                    print(tc.blue(rp))
                    yield from (s, edge_to_symbol(rp, rev=True), o)

    def tchains(thing_chains, func):
        return sorted([OntId(e).curie if isinstance(e, URIRef) else e
                       for e in chain_to_typed_chain(list(reversed(chain)), cgraph, func)]
                      for chains in thing_chains.values()
                      for chain in chains)

    def getLinkers(node, g):
        for list_top in getLists(node, g):
            for linker in g[:owl.oneOf:list_top]:
                yield linker

    def allSubjects(object, graph):
        yield from (s for s, p in graph[::object])
        yield from getLinkers(object, graph)

    print()
    ttc = tchains(terminal_chains, allSubjects)
    tec = tchains(endpoint_chains, allSubjects)
    pprint(ttc)
    pprint(tec)

    # NOTE: this frozenset shadows the valid_tagsets function defined above
    valid_tagsets = frozenset(t for s in tag_transitions.values() for t in s)
    tts = valid_tagsets - frozenset(tag_transitions)
    endtype = 'TODO'
    tt = {}
    for endtype, chains in endpoint_chains.items():
        for *_chain, tag in chains:
            if _chain:
                next_thing = _chain[-1]
            for ets in tts:
                if tag in ets:
                    tt[ets] = next_thing

    terminal_tagsets = tt

    #[print(wat) for wat in terminal_chains.values()]
    #pprint(terminal_chains)
    return tag_types, tag_tokens, partInstances, valid_tagsets, terminal_tagsets, tag_transitions
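Throughout parse_workflow, triples are matched with rdflib's slice syntax on Graph.__getitem__, and Graph.transitiveClosure repeatedly applies a one-step generator function such as topList or getPreviousTag. A minimal illustration of the slice idiom:

from rdflib import RDF, Graph, URIRef

g = Graph()
s, C = URIRef('http://example.org/s'), URIRef('http://example.org/C')
g.add((s, RDF.type, C))

# g[:p:o] -> subjects, g[s::o] -> predicates, g[s:p] -> objects,
# g[s] -> (p, o) pairs, g[::o] -> (s, p) pairs
print(list(g[:RDF.type:C]))  # [s]
print(list(g[s::C]))         # [RDF.type]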
Code example #6
File: rdfdb.py Project: t00m/KB4IT
class KB4ITGraph:
    """
    This class creates an RDF graph based on the attributes of each doc.
    It also has convenience methods for querying the graph.
    """
    def __init__(self, path=None):
        """
        If no path is passed, build the graph in memory. Otherwise,
        create a persistent graph on disk.
        """
        if path is not None:
            # Create persistent Graph in disk
            self.path = path
            self.graph = ConjunctiveGraph('Sleepycat', URIRef("kb4it://"))
            graph_path = path + SEP + 'kb4it.graph'
            self.graph.store.open(graph_path)
        else:
            # Create Graph in Memory
            self.graph = ConjunctiveGraph('IOMemory')

        # Assign namespaces to the Namespace Manager of this graph
        namespace_manager = NamespaceManager(ConjunctiveGraph())
        for ns in NSBINDINGS:
            namespace_manager.bind(ns, NSBINDINGS[ns])
        self.graph.namespace_manager = namespace_manager


    def __uniq_sort(self, result):
        return sorted(set(result))


    def subjects(self, predicate, object):
        """
        Return a sorted list of unique subjects for the given predicate
        and object.
        """
        return self.__uniq_sort(self.graph.subjects(predicate, object))


    def predicates(self, subject=None, object=None):
        """
        Return a sorted list of unique predicates for the given subject
        and object.
        """
        return self.__uniq_sort(self.graph.predicates(subject, object))


    def objects(self, subject, predicate):
        """
        Return a sorted list of unique objects for the given subject
        and predicate.
        """
        return self.__uniq_sort(self.graph.objects(subject, predicate))


    def value(self, subject=None, predicate=None, object=None, default=None, any=True):
        """
        Return a single value matching the given subject, predicate and
        object, or the default if none is found.
        """
        return self.graph.value(subject, predicate, object, default, any)


    def add_document(self, doc):
        """
        Add a new document to the graph.
        """
        subject = URIRef(doc)
        predicate = RDF['type']
        object = URIRef(KB4IT['Document'])
        self.graph.add((subject, predicate, object))


    def add_document_attribute(self, doc, attribute, value):
        """
        Add a new attribute to a document.
        """
        predicate = 'has%s' % attribute
        subject = URIRef(doc)
        predicate = KB4IT[predicate]
        object = Literal(value)
        self.graph.add((subject, predicate, object))


    def get_attributes(self):
        """
        Get all predicates except RDF.type and KB4IT.hasTitle.
        """
        blacklist = {RDF['type'], KB4IT['hasTitle']}
        return sorted(set(self.graph.predicates(None, None)) - blacklist)


    def serialize(self):
        """
        Serialize the graph to pretty XML format.
        """
        return self.graph.serialize(format='pretty-xml')


    def close(self):
        """
        Close the graph if it is persistent.
        FIXME: check if it is open
        """
        self.graph.store.close()
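A hedged usage sketch of the in-memory variant; it relies on the module-level RDF, KB4IT and NSBINDINGS names from rdfdb.py, and the document URI is an assumption:

kb = KB4ITGraph()  # no path, so the graph lives in memory
kb.add_document('kb4it://docs/example')  # assumed document URI
kb.add_document_attribute('kb4it://docs/example', 'Author', 'Jane Doe')
print(kb.get_attributes())  # e.g. [URIRef('...hasAuthor')]
print(kb.serialize())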