def verify_rdf(rdf_output):
    """Assert that Turtle output has the expected overall graph shape.

    ``rdf_output`` is parsed as Turtle into a fresh ``ConjunctiveGraph``;
    the graph must then contain 6 triples made of 2 distinct subjects,
    3 distinct predicates and 6 distinct objects.

    :param rdf_output: Turtle serialisation handed to ``parse(data=...)``.
    :raises AssertionError: when any of the four counts differs.
    """
    graph = ConjunctiveGraph()
    graph.parse(data=rdf_output, format="turtle")

    triple_count = len(graph)
    assert triple_count == 6

    distinct_subjects = set(graph.subjects())
    assert len(distinct_subjects) == 2

    distinct_predicates = set(graph.predicates())
    assert len(distinct_predicates) == 3

    distinct_objects = set(graph.objects())
    assert len(distinct_objects) == 6
def test_null_values_with_multiple_strings():
    """Convert ``tests/null1.csv`` and check which terms are absent.

    The CSV is converted with the ``null1.multiple`` metadata file, the
    resulting Turtle is parsed back, and the test asserts that a fixed
    list of literals never occurs as an object and that ``id_uri`` never
    occurs as a predicate.
    """
    # NOTE(review): going by the test name these are presumably the
    # values the metadata declares as nulls -- confirm against the
    # metadata file.
    converter = CSVW(
        csv_path="tests/null1.csv",
        metadata_path="tests/null1.multiple.csv-metadata.json",
    )
    turtle = converter.to_rdf()

    graph = ConjunctiveGraph()
    graph.parse(data=turtle, format="turtle")

    objects_seen = set(graph.objects())
    assert Literal('null_key', datatype=XSD.token) not in objects_seen
    assert Literal('null_sector') not in objects_seen
    assert Literal('null_id', datatype=XSD.token) not in objects_seen
    for token in ('10', '11', '12', '13'):
        assert Literal(token, datatype=XSD.token) not in objects_seen

    predicates_seen = set(graph.predicates())
    assert id_uri not in predicates_seen

    assert Literal('1', datatype=XSD.token) not in objects_seen
# Alternative spelling, using item access on the namespace:
primer.add((myNS['pat'], myNS['age'], Literal(24)))


# Even with this little asserted, the graph can already be inspected
# in several ways.  Start by dumping every stored triple.

from pprint import pprint
pprint(list(primer))


# Every query below follows the same (s, p, o) pattern idea.
# First, the three projections of the graph:

pprint(list(primer.subjects()))
pprint(list(primer.predicates()))
pprint(list(primer.objects()))

# Then queries pinned to one term.

# All (predicate, object) pairs whose subject is pat:
pprint(list(primer.predicate_objects(myNS.pat)))

# All (subject, object) pairs whose predicate is age:
pprint(list(primer.subject_objects(myNS.age)))


# Continue with a larger dataset: rebind ``primer`` to a fresh,
# empty graph.

primer = ConjunctiveGraph()
try:  # Python 2: keep producing ``unicode`` dictionary keys
    _text_type = unicode
except NameError:  # Python 3: ``str`` already is the text type
    _text_type = str


class PreProcessor(object):
    """Load a knowledge graph and keep id dictionaries for its terms.

    ``ent_dict`` maps the text form of every subject/object to an integer
    id, ``rel_dict`` does the same for predicates, and ``unique_msgs`` is
    the (optionally truncated) copy of ``ent_dict`` that is used as the
    vocabulary when reading sequences.
    """

    def __init__(self, kg_path):
        """
        :param kg_path: location of the knowledge graph handed to the
            graph parser by ``load_knowledge_graph``
        """
        self.kg_path = kg_path
        self.ent_dict = dict()
        self.rel_dict = dict()
        self.g = ConjunctiveGraph()
        self.unique_msgs = self.ent_dict.copy()
        # Only filled by load_knowledge_graph(amberg_params=...); defined
        # here so that get_merged() never raises AttributeError.
        self.merged = None

    def load_knowledge_graph(self, format='xml', exclude_rels=None,
                             clean_schema=True, amberg_params=None,
                             excluded_entities=None):
        """
        Parse the graph at ``kg_path``, prune it and refresh the id
        dictionaries.

        :param format: serialisation format passed to the graph parser
        :param exclude_rels: relations whose triples are removed; ``None``
            (the default) means no relation is excluded
        :param clean_schema: when true, triples using ``schema_relations``
            are removed as well
        :param amberg_params: optional sequence; element 0 is the path to
            the events and element 1 the maximum number of events, both
            forwarded to ``get_merged_dataframe``
        :param excluded_entities: optional entities whose triples are
            removed
        """
        # ``parse`` is what the deprecated ``Graph.load`` delegated to.
        self.g.parse(self.kg_path, format=format)
        # remove triples with excluded relation
        remove_rel_triples(self.g,
                           [] if exclude_rels is None else exclude_rels)
        # remove triples with relations between class-level constructs
        if clean_schema:
            remove_rel_triples(self.g, schema_relations)
        if excluded_entities is not None:
            remove_ent_triples(self.g, excluded_entities)
        if amberg_params:
            path_to_events = amberg_params[0]
            max_events = amberg_params[1]
            self.merged = get_merged_dataframe(path_to_events, max_events)
            (self.unique_msgs, unique_vars,
             unique_mods, unique_fes) = get_unique_entities(self.merged)
            update_amberg_ontology(self.g, self.ent_dict, self.unique_msgs,
                                   unique_mods, unique_fes, unique_vars,
                                   self.merged)
        self.update_entity_relation_dictionaries()

    def update_entity_relation_dictionaries(self):
        """
        Extend ``ent_dict`` and ``rel_dict`` with every term of the graph.

        Ids that are already present are kept.  New entities receive the
        smallest counter values not used by entries that existed when the
        method was called; new relations receive ``len(rel_dict)``.
        """
        ent_counter = 0
        fixed_ids = set(self.ent_dict.values())
        entities = set(self.g.subjects(None, None)).union(
            set(self.g.objects(None, None)))
        # sorting ensures equal random splits on equal seeds
        for h in sorted(entities):
            uni_h = _text_type(h)
            if uni_h in self.ent_dict:
                continue
            while ent_counter in fixed_ids:
                ent_counter += 1
            self.ent_dict[uni_h] = ent_counter
            ent_counter += 1
        # add new relations to dict
        for r in sorted(set(self.g.predicates(None, None))):
            uni_r = _text_type(r)
            if uni_r not in self.rel_dict:
                self.rel_dict[uni_r] = len(self.rel_dict)

    def load_unique_msgs_from_txt(self, path, max_events=None):
        """
        Read ``name,id`` lines from a comma-separated text file into
        ``ent_dict`` and rebuild ``unique_msgs`` from it.

        Lines without a second column or with a non-integer id are
        reported on stdout and skipped.

        :param path: file to read
        :param max_events: when given, ``unique_msgs`` keeps only the
            ``max_events`` entries with the smallest ids
        :return: the keys of the entries cut off by ``max_events``, or
            ``None`` when ``max_events`` is ``None``
        """
        # Text mode: splitting on a ``str`` separator fails on the bytes
        # that binary mode yields under Python 3.
        with open(path, "r") as f:
            for line in f:
                split = line.split(',')
                try:
                    emb_id = int(split[1].strip())
                except (IndexError, ValueError):
                    print("Error reading id of {0} in given dictionary".format(
                        line))
                    # skip this event entity, treat it as common entity
                    # later on
                    continue
                self.ent_dict[split[0]] = emb_id
        self.unique_msgs = self.ent_dict.copy()
        if max_events is None:
            return None
        # sort ascending w.r.t. embedding id before cutting off
        all_msgs = sorted(self.unique_msgs.items(),
                          key=operator.itemgetter(1), reverse=False)
        self.unique_msgs = dict(all_msgs[:max_events])
        excluded_events = dict(all_msgs[max_events:]).keys()
        return excluded_events

    def prepare_sequences(self, path_to_input, use_dict=True):
        """
        Read one comma-separated sequence of integer ids per line.

        :param path_to_input: file to read
        :param use_dict: when true, ids that are not a value of
            ``unique_msgs`` are dropped from each sequence
        :return: list with one list of ints per input line
        :raises ValueError: if a token is not an integer
        """
        print("Preparing sequential data...")
        # Built once; the vocabulary does not change while reading.
        known_ids = set(self.unique_msgs.values()) if use_dict else None
        result = []
        with open(path_to_input, "r") as f:
            for line in f:
                ids = [int(e.strip()) for e in line.split(',')]
                if use_dict:
                    ids = [i for i in ids if i in known_ids]
                result.append(ids)
        print("Processed {0} sequences".format(len(result)))
        return result

    def get_vocab_size(self):
        """Return the number of entries in ``unique_msgs``."""
        return len(self.unique_msgs)

    def get_ent_dict(self):
        """Return the entity -> id dictionary."""
        return self.ent_dict

    def get_rel_dict(self):
        """Return the relation -> id dictionary."""
        return self.rel_dict

    def get_kg(self):
        """Return the underlying graph."""
        return self.g

    def get_unique_msgs(self):
        """Return the vocabulary dictionary ``unique_msgs``."""
        return self.unique_msgs

    def get_merged(self):
        """Return the merged event data, or ``None`` if none was loaded."""
        return self.merged
def parse_workflow():
    """Load the two workflow graphml files and derive the tag structures.

    Both ``docs/workflow-*.graphml`` files are mapped into a single
    ``ConjunctiveGraph``, which is also written to ``/tmp/workflow.ttl``.
    The graph prefixes plus RRID/DOI/PMID are registered with
    ``OntCuries``.  Along the way the computed chains are printed to
    stdout.

    :return: a 6-tuple
        ``(tag_types, tag_tokens, partInstances, valid_tagsets,
        terminal_tagsets, tag_transitions)`` where

        * ``tag_types`` is the set of transitive ``rdfs:subClassOf``
          subjects of ``wf.tag``;
        * ``tag_tokens`` maps each tag type to the sorted transitive
          ``rdf:type`` subjects of that type (the type itself excluded);
        * ``partInstances`` maps ``OntId(part)`` to the set of its
          non-blank transitive ``rdf:type`` subjects;
        * ``valid_tagsets`` is the frozenset of every state occurring as
          a successor in ``tag_transitions``;
        * ``terminal_tagsets`` maps states that have no successor to the
          element preceding the matching tag in an endpoint chain;
        * ``tag_transitions`` maps a state (frozenset of chain elements,
          ``None`` for the start) to the frozenset of states that may
          follow it.
    """
    # FIXME TODO these states should probably be compiled down to numbers???
    docs = Path(__file__).parent.absolute().resolve().parent / 'docs'
    rridpath = docs / 'workflow-rrid.graphml'
    paperpath = docs / 'workflow-paper-id.graphml'

    cgraph = ConjunctiveGraph()
    gt.WorkflowMapping(rridpath.as_posix()).graph(cgraph)
    gt.PaperIdMapping(paperpath.as_posix(), False).graph(cgraph)
    write(cgraph, '/tmp/workflow.ttl')
    predicates = set(cgraph.predicates())
    OntCuries({cp:str(ip) for cp, ip in cgraph.namespaces()})
    OntCuries({'RRID': 'https://scicrunch.org/resolver/RRID:',
               'DOI': 'https://doi.org/',
               'PMID': 'https://www.ncbi.nlm.nih.gov/pubmed/'})
    hg = makeGraph('', graph=cgraph)
    # Unused below; kept because it was evaluated unconditionally before.
    short = sorted(hg.qname(_) for _ in predicates)

    # Bare attribute accesses kept from the original; the values are not used.
    wf.hasTag
    wf.hasReplyTag
    wf.hasTagOrReplyTag
    wf.hasOutputTag

    #if type isa wf.tag

    tag_types = set(cgraph.transitive_subjects(rdfs.subClassOf, wf.tag))
    tag_tokens = {tagType:sorted(set(t for t in cgraph.transitive_subjects(rdf.type, tagType)
                                     if t != tagType))
                  for tagType in tag_types}
    has_tag_types = set(cgraph.transitive_subjects(rdfs.subPropertyOf, wf.hasTagOrReplyTag))
    has_tag_types.add(wf.hasOutputTag)
    has_next_action_types = set(cgraph.transitive_subjects(rdfs.subPropertyOf, wf.hasOutput))
    has_next_action_types.add(wf.hasNextStep)

    # Tags (of any type but tagScibot) that have no outgoing tag edge.
    terminals = sorted(tag
                       for ttype in tag_types
                       if ttype != wf.tagScibot  # scibot is not 'terminal' for this part
                       for tag in cgraph[:rdf.type:ttype]
                       if not isinstance(tag, BNode)
                       and not any(o for httype in has_tag_types
                                   for o in cgraph[tag:httype]))

    # States that have no outgoing next-action edge.
    endpoints = sorted(endpoint
                       for endpoint in cgraph[:rdf.type:wf.state]
                       if not isinstance(endpoint, BNode)
                       and not any(o for hnatype in has_next_action_types
                                   for o in cgraph[endpoint:hnatype]))

    # Unused below; kept because it was evaluated unconditionally before.
    # The generator handed to chain() is parenthesised: a bare generator
    # expression followed by a comma is a SyntaxError since Python 3.7.
    complicated = sorted(
        a_given_tag
        for tt in tag_types
        for a_given_tag in cgraph[:rdf.type:tt]
        if not isinstance(a_given_tag, BNode)
        and not [successor_tag
                 for htt in has_tag_types
                 for successor_tag in chain(
                     (t for t in cgraph[a_given_tag:htt]
                      #if not isinstance(t, BNode)
                      ),
                     # we don't actually need this for terminals
                     # we will need it later
                     #(t
                     #for b in cgraph[a_given_tag:htt]
                     #if isinstance(b, BNode)
                     #for listhead in cgraph[b:owl.oneOf]
                     #for t in unlist(listhead, cgraph)),
                 )])

    def topList(node, g):
        """Yield the list cells whose rdf:rest is ``node``."""
        for s in g[:rdf.rest:node]:
            yield s

    def getLists(node, g):
        """Yield, per list cell holding ``node``, the head of that list."""
        for linker in g[:rdf.first:node]:
            top = None
            # exhaust the closure; the last value is the outermost cell
            for top in g.transitiveClosure(topList, linker):
                pass

            if top:
                yield top
            else:
                yield linker

    def getIsTagOf(node, g):
        """Yield subjects linked to ``node`` by any has-tag predicate."""
        for htt in has_tag_types:
            for parent_tag in g[:htt:node]:
                yield parent_tag

    def getIsOneOfTagOf(node, g):
        """Yield subjects pointing at an owl:oneOf list containing ``node``."""
        for list_top in getLists(node, g):
            for linker in g[:owl.oneOf:list_top]:
                for parent_tag, _ in g[::linker]:
                    yield parent_tag

    def getPreviousTag(node, g):  # not quite what we need
        yield from getIsOneOfTagOf(node, g)
        yield from getIsTagOf(node, g)

    def getTagChains(node, g, seen=()):
        """Yield tuples of ancestor tags of ``node``, nearest first."""
        # seen to prevent recursion cases where
        # tagging can occur in either order e.g. PMID -> DOI
        #print(tc.red(repr(OntId(node))))  # tc.red(OntId(node)) does weird stuff O_o
        parent_tag = None
        for parent_tag in chain(getIsOneOfTagOf(node, g),
                                getIsTagOf(node, g)):
            if parent_tag in seen:
                parent_tag = None
                continue
            # Default of None: a bare next() raising StopIteration inside a
            # generator is turned into RuntimeError (PEP 479), and the
            # ``not ptt`` check below expects a missing type to be falsy.
            ptt = next(g[parent_tag:rdf.type], None)
            out = None
            #if ptt in tag_types:
            for pchain in getTagChains(parent_tag, g, seen + (node,)):
                if ptt in tag_types:
                    out = parent_tag, *pchain
                else:
                    out = pchain
                yield out

            if not ptt and not out:
                parent_tag = None

        if not parent_tag:
            yield tuple()

    def getInitiatesAction(node, g):
        """Yield subjects that ``wf.initiatesAction`` ``node``."""
        for action in g[:wf.initiatesAction:node]:
            yield action

    def getIsOneOfOutputOf(node, g):
        """Yield subjects whose next-action target is a list containing ``node``."""
        for list_top in getLists(node, g):
            for linker in g[:owl.oneOf:list_top]:
                for hot in has_next_action_types:
                    for parent_thing in g[:hot:linker]:
                        yield parent_thing

    def getActionChains(node, g):
        """Yield tuples of predecessors of ``node``, nearest first."""
        parent_action = None
        for parent_action in chain(getIsOneOfOutputOf(node, g),  # works for actions too
                                   getInitiatesAction(node, g)):
            for pchain in getActionChains(parent_action, g):  # NOTE may also be a tag...
                out = parent_action, *pchain
                #print(tuple(hg.qname(o) for o in out))
                yield out

        if not parent_action:
            yield tuple()

    def getRestSubjects(predicate, obj, g):
        """ invert restriction """
        rsco = cmb.Restriction(rdfs.subClassOf)
        for rt in rsco.parse(graph=g):
            if rt.p == predicate and rt.o == obj:
                yield from g.transitive_subjects(rdfs.subClassOf, rt.s)

    annoParts = list(getRestSubjects(wf.isAttachedTo, wf.annotation, cgraph))
    partInstances = {OntId(a):set(t if isinstance(t, BNode) else OntId(t)
                                  for t in cgraph.transitive_subjects(rdf.type, a)
                                  if not isinstance(t, BNode) and t != a)
                     for a in annoParts}

    # Unused below; kept because it was evaluated unconditionally before.
    _endpoint_chains = {OntId(endpoint):[[OntId(endpoint)] + [OntId(e) for e in links]
                                         for links in getActionChains(endpoint, cgraph)]
                        for endpoint in endpoints}

    #print([hg.qname(e) for e in endpoints])
    #print([print([hg.qname(c) for c in getActionChains(endpoint, cgraph) if c])
            #for endpoint in endpoints
            #if endpoint])

    #_ = [print(list(getActionChains(e, cgraph)) for e in endpoints)]
    #return

    # Unused below; kept because they were evaluated unconditionally before.
    wat = cgraph.transitiveClosure(getPreviousTag, RRIDCUR.Duplicate)
    wat = list(wat)
    #def invOneOf(tag, g):

    fake_chains = {hg.qname(terminal):
                   [hg.qname(c)
                    for c in cgraph.transitiveClosure(getPreviousTag, terminal)]
                   for terminal in terminals}

    def make_chains(things, getChains):
        """Map OntId(thing) to its chains, each starting with the thing."""
        return {OntId(thing):[[OntId(thing)] + [OntId(e) for e in links]
                              for links in getChains(thing, cgraph)]
                for thing in things
                #if not print(thing)
                }

    def print_chains(thing_chains):
        """Print every chain twice: first-to-last, then last-to-first."""
        print('\nstart from beginning')
        print('\n'.join(sorted(' -> '.join(hg.qname(e) for e in reversed(links))
                               for chains in thing_chains.values()
                               for links in chains)))

        print('\nstart from end')
        print('\n'.join(sorted(' <- '.join(e.curie for e in links)
                               for chains in thing_chains.values()
                               for links in chains)))

    def tagset_transitions(all_chains):
        """Map each accumulated tag set to the sets that may follow it."""
        # not the most efficient way to do this ...
        transitions = defaultdict(set)
        for chains in all_chains.values():
            for links in chains:
                valid = set()
                prior_state = None  # None is the state before any tag
                for element in reversed(links):
                    valid.add(element)
                    state = frozenset(valid)
                    transitions[prior_state].add(state)
                    prior_state = state

        return {s:frozenset(n) for s, n in transitions.items()}

    endpoint_chains = make_chains(endpoints, getActionChains)
    #endpoint_transitions = valid_transitions(endpoint_chains)  # not the right structure
    print_chains(endpoint_chains)
    terminal_chains = make_chains(terminals, getTagChains)
    print_chains(terminal_chains)
    tag_transitions = tagset_transitions(terminal_chains)
    # TODO terminal_tags_to_endpoints

    def printq(*things):
        """Print the curies of ``things`` on one line."""
        print(*(OntId(t).curie for t in things))

    from pprint import pprint

    def get_linkers(s, o, g, linkerFunc):  # FIXME not right
        """Yield predicates linking ``s`` to ``o`` directly or via a linker."""
        for p in g[s::o]:
            yield p

        for linker in linkerFunc(o, g):
            #print(tc.blue(f'{OntId(s).curie} {l if isinstance(l, BNode) else OntId(l).curie}'))
            for p in g[s::linker]:
                #print(tc.red(f'{s} {l} {o} {p}'))
                yield p
        # NOTE: an unreachable transitive-closure variant that followed an
        # early ``return`` here was removed; it never executed.

    def edge_to_symbol(p, rev=False):
        """Return the arrow used to print predicate ``p`` (mirrored if rev)."""
        if p == wf.initiatesAction:
            return '<<' if rev else '>>'
        elif p == wf.hasReplyTag:
            return '<' if rev else '>'
        elif p == wf.hasTagOrReplyTag:
            return '<=' if rev else '=>'
        elif p == wf.hasOutputTag:
            return '-<-' if rev else '->-'
        else:
            return '<??' if rev else '??>'

    def chain_to_typed_chain(links, g, func):
        """Yield a flat s, symbol, o, ... stream for adjacent chain pairs."""
        # duh...
        #pprint(links)
        for s, o in zip(links, links[1:]):
            # TODO deal with reversed case
            s, o = s.u, o.u
            p = None
            #print(s, o)
            printq(s, o)
            for p in get_linkers(s, o, g, func):
                #print(tc.yellow(p))
                #yield (s, edge_to_symbol(p), o)
                yield from (s, edge_to_symbol(p), o)

            if not p:
                # nothing links s -> o; try the reverse direction
                for rp in get_linkers(o, s, g, func):
                    print(tc.blue(rp))
                    yield from (s, edge_to_symbol(rp, rev=True), o)

    def tchains(thing_chains, func):
        """Return the sorted typed form of every chain in ``thing_chains``."""
        return sorted([OntId(e).curie if isinstance(e, URIRef) else e
                       for e in chain_to_typed_chain(list(reversed(links)), cgraph, func)]
                      for chains in thing_chains.values()
                      for links in chains)

    def getLinkers(node, g):
        """Yield subjects whose owl:oneOf list contains ``node``."""
        for list_top in getLists(node, g):
            for linker in g[:owl.oneOf:list_top]:
                yield linker

    def allSubjects(obj, graph):
        """Yield direct subjects of ``obj`` followed by its list linkers."""
        yield from (s for s, p in graph[::obj])
        yield from getLinkers(obj, graph)

    print()
    ttc = tchains(terminal_chains, allSubjects)
    tec = tchains(endpoint_chains, allSubjects)
    pprint(ttc)
    pprint(tec)

    valid_tagsets = frozenset((t for s in tag_transitions.values() for t in s))
    # states that never occur as a key, i.e. have no successor state
    tts = valid_tagsets - frozenset(tag_transitions)
    # TODO endtype
    tt = {}
    for chains in endpoint_chains.values():
        for *_chain, tag in chains:
            if not _chain:
                # single-element chain: there is no preceding element to
                # record (previously next_thing was unbound or stale here)
                continue
            next_thing = _chain[-1]
            for ets in tts:
                if tag in ets:
                    tt[ets] = next_thing

    terminal_tagsets = tt

    #[print(wat) for wat in terminal_chains.values()]
    #pprint(terminal_chains)
    return tag_types, tag_tokens, partInstances, valid_tagsets, terminal_tagsets, tag_transitions
class KB4ITGraph:
    """
    RDF graph holding the attributes of each document, plus a few
    convenience helpers to query it.
    """

    def __init__(self, path=None):
        """
        Without a path the graph lives in memory only; with a path a
        persistent graph is opened on disk below that path.
        """
        if path is None:
            # In-memory graph
            self.graph = ConjunctiveGraph('IOMemory')
        else:
            # Persistent graph on disk
            self.path = path
            self.graph = ConjunctiveGraph('Sleepycat', URIRef("kb4it://"))
            self.graph.store.open(path + SEP + 'kb4it.graph')

        # Bind every configured prefix on a dedicated namespace manager
        # and attach that manager to the graph.
        manager = NamespaceManager(ConjunctiveGraph())
        for prefix in NSBINDINGS:
            manager.bind(prefix, NSBINDINGS[prefix])
        self.graph.namespace_manager = manager

    def __uniq_sort(self, result):
        """
        Return the distinct items of ``result`` as a sorted list.
        """
        return sorted(set(list(result)))

    def subjects(self, predicate, object):
        """
        Sorted, de-duplicated subjects for the given predicate and object.
        """
        found = self.graph.subjects(predicate, object)
        return self.__uniq_sort(found)

    def predicates(self, subject=None, object=None):
        """
        Sorted, de-duplicated predicates for the given subject and object.
        """
        found = self.graph.predicates(subject, object)
        return self.__uniq_sort(found)

    def objects(self, subject, predicate):
        """
        Sorted, de-duplicated objects for the given subject and predicate.
        """
        found = self.graph.objects(subject, predicate)
        return self.__uniq_sort(found)

    def value(self, subject=None, predicate=None, object=None, default=None, any=True):
        """
        Forward to the graph's ``value`` lookup with the same arguments.
        """
        return self.graph.value(subject, predicate, object, default, any)

    def add_document(self, doc):
        """
        Register ``doc`` in the graph as a KB4IT Document.
        """
        triple = [URIRef(doc), RDF['type'], URIRef(KB4IT['Document'])]
        self.graph.add(triple)

    def add_document_attribute(self, doc, attribute, value):
        """
        Attach ``value`` to ``doc`` under the predicate ``has<attribute>``.
        """
        name = 'has%s' % attribute
        triple = [URIRef(doc), KB4IT[name], Literal(value)]
        self.graph.add(triple)

    def get_attributes(self):
        """
        Sorted list of all predicates in the graph, leaving out RDF type
        and the title predicate.
        """
        excluded = {RDF['type'], KB4IT['hasTitle']}
        present = set(list(self.graph.predicates(None, None)))
        return sorted(present - excluded)

    def serialize(self):
        """
        Return the graph serialized as pretty XML.
        """
        return self.graph.serialize(format='pretty-xml')

    def close(self):
        """
        Close the store behind the graph.
        FIXME: check if it is open
        """
        self.graph.store.close()