def test_generate():
    """End-to-end check: turtle -> HDT -> query -> cleanup.

    Generates an HDT file from tests/test2.turtle, saves it, reopens it
    with HDTDocument and asserts the expected cardinalities, then removes
    the generated file.
    """
    # FIX: dropped the unused local `path` and the stale commented-out
    # dir_path computation.
    dir_path = os.getcwd() + "/tests"
    src_path = dir_path + "/test2.turtle"
    dest_path = dir_path + "/test2.hdt"
    print("Source: %s" % src_path)
    print("Dest : %s" % dest_path)
    doc = hdt.generate_hdt(src_path, "aff4://foo")
    retcode = doc.save_to_hdt(dest_path)
    assert retcode == 0
    document = HDTDocument(dest_path)
    # A non-existent subject must match nothing.
    (triples, triplesCard) = document.search_triples("missingtriple", "", "")
    assert triplesCard == 0
    # The fixture contains exactly 12 triples in total...
    (triples, triplesCard) = document.search_triples("", "", "")
    assert triplesCard == 12
    # ...all of them about this single subject.
    (triples, triplesCard) = document.search_triples(
        "aff4://5aea2dd0-32b4-4c61-a9db-677654be6f83//test_images/AFF4-L/dream.txt",
        "", "")
    assert triplesCard == 12
    os.unlink(dest_path)
def extract_dbpedia(superclass):
    """Get edgelist for superclass and all its subclasses.

    Collects all instances of every subclass of ``superclass`` from the
    configured HDT knowledge graph (optionally capped by
    ``subject_limit``), then records one (entity, property) edge per
    non-blacklisted predicate on each instance (optionally capped by
    ``predicate_limit``). Returns the de-duplicated edge list.
    """
    edgelist = []
    instances = set()
    doc = HDTDocument(run.config["kg_source"])
    subject_limit = run.config["subject_limit"]
    predicate_limit = run.config["predicate_limit"]
    subclasses = query_subclasses(superclass)
    print("[Info] query instances for each subclass")
    for subclass in tqdm(subclasses):
        # A limit <= 0 means "no limit".
        if subject_limit > 0:
            (triples, count) = doc.search_triples("", rdf + "type", subclass,
                                                  limit=subject_limit)
        else:
            (triples, count) = doc.search_triples("", rdf + "type", subclass)
        for triple in triples:
            instances.add(triple[0])
    print("[Info] query predicates for each instance")
    for subject in tqdm(instances):
        # FIX: use the same (triples, count) unpacking in both branches;
        # the limited branch previously indexed the result tuple with [0].
        if predicate_limit > 0:
            (triples, count) = doc.search_triples(subject, "", "",
                                                  limit=predicate_limit)
        else:
            (triples, count) = doc.search_triples(subject, "", "")
        for triple in triples:
            # Either blacklist
            if triple[1] not in blacklist:
                edgelist.append((triple[0], triple[1]))
            # Or whitelist
            # if triple[1] in whitelist:
            #     edgelist.append((triple[0], triple[1]))
    return list(set(edgelist))  # Exclude duplicate entity-property relations
def extract_wikidata(classname, typeproperty):
    """Build the (entity, property) edge list for one Wikidata class."""
    doc = HDTDocument("kg/wikidata-20170313-all-BETA.hdt")
    wd = "http://www.wikidata.org/entity/"
    wdt = "http://www.wikidata.org/prop/direct/"
    wd_classes = {
        "BoxerWikidata": "Q11338576",
        "CyclistWikidata": "Q2309784",
        "CapitalWikidata": "Q5119",
        "CountryWikidata": "Q6256",
        "MetroAreaWikidata": "Q1907114",
        "GeographicRegionWikidata": "Q82794",
        "FilmFestivalWikidata": "Q220505",
    }
    # Every entity typed (via the given property) as the requested class.
    (members, _count) = doc.search_triples("", f"{wdt}{typeproperty}",
                                           f"{wd}{wd_classes[classname]}")
    instances = {member[0] for member in members}
    # One (entity, predicate) edge per non-blacklisted fact.
    edgelist = []
    for instance in tqdm(instances, total=len(instances)):
        (facts, _count) = doc.search_triples(instance, "", "")
        for subj, pred, _obj in facts:
            if pred not in blacklist:
                edgelist.append((subj, pred))
    return list(set(edgelist))  # Exclude duplicate entity-property relations
def extract_by_instance(fn, wdt_class, property, out=True):
    """Dump (entity, predicate) pairs for all instances of a Wikidata class.

    Args:
        fn: path to the HDT file.
        wdt_class: Wikidata class QID (e.g. "Q6256").
        property: key into the local mapping, "instance_of" or "occupation".
        out: True -> outgoing edges of each instance; False -> incoming.

    Writes one CSV row per matching triple to ``<wdt_class>.csv``.
    """
    doc = HDTDocument(fn)
    wd = "http://www.wikidata.org/entity/"
    wdt = "http://www.wikidata.org/prop/direct/"
    properties = {"instance_of": "P31", "occupation": "P106"}
    instances = set()
    (triples, count) = doc.search_triples("", f"{wdt}{properties[property]}",
                                          f"{wd}{wdt_class}")
    # FIX: the progress total was len(list(instances)) -- always 0 at this
    # point, and passed positionally as tqdm's *description*. Use the
    # cardinality reported by the search instead.
    for triple in tqdm(triples, total=count):
        instances.add(triple[0])
    # newline="" prevents blank rows in the CSV on Windows.
    with open(f'{wdt_class}.csv', "w", newline="") as csvfile:
        spamwriter = csv.writer(csvfile, delimiter=',')
        for instance in tqdm(instances, total=len(instances)):
            # Outgoing pattern (instance ? ?) or incoming pattern (? ? instance).
            if out:
                pattern = (instance, "", "")
            else:
                pattern = ("", "", instance)
            (triples, count) = doc.search_triples(*pattern)
            for triple in triples:
                if out:
                    spamwriter.writerow([triple[0], triple[1]])
                else:
                    spamwriter.writerow([triple[2], triple[1]])
class HDT(KG):
    """Knowledge-graph backend reading from a single HDT file."""

    def __init__(self, hdt_file_path: str):
        self.hdt = HDTDocument(hdt_file_path)

    def predicate_objects(self, subject: str) -> Iterator[Tuple[str, str]]:
        """Yield (predicate, object) pairs of all triples with this subject."""
        matches, _ = self.hdt.search_triples(subject, "", "")
        for _subj, pred, obj in matches:
            yield pred, obj

    def subjects(self, predicate: str, obj: str) -> Iterator[str]:
        """Yield the subjects of all triples matching (?, predicate, obj)."""
        matches, _ = self.hdt.search_triples("", predicate, obj)
        for subj, _pred, _obj in matches:
            yield subj

    def triples(self, subject: str, predicate: str, obj: str) -> Iterator[Tuple[str, str, str]]:
        """Yield all triples matching the given pattern ("" = wildcard)."""
        matches, _ = self.hdt.search_triples(subject, predicate, obj)
        for triple in matches:
            yield triple

    def objects(self, subject: str, predicate: str) -> Iterator[str]:
        """Yield the objects of all triples matching (subject, predicate, ?)."""
        matches, _ = self.hdt.search_triples(subject, predicate, "")
        for _subj, _pred, obj in matches:
            yield obj

    def count(self, subject: str, predicate: str, obj: str) -> int:
        """Return the cardinality reported for the given triple pattern."""
        _, cardinality = self.hdt.search_triples(subject, predicate, obj)
        return cardinality

    def total_triples(self) -> int:
        return self.hdt.total_triples

    def nb_subjects(self) -> int:
        return self.hdt.nb_subjects

    def nb_predicates(self) -> int:
        return self.hdt.nb_predicates

    def nb_objects(self) -> int:
        return self.hdt.nb_objects

    def nb_shared(self) -> int:
        return self.hdt.nb_shared

    def get_schema_description(self, resource: str) -> Optional[str]:
        """Get english description of the specified resource. Use the
        http://schema.org/description property. Trailing double quotes
        and @en are removed!"""
        for obj in self.objects(resource, "http://schema.org/description"):
            if not obj.endswith("@en"):
                continue
            # Strip the leading double quote and the trailing '"@en'.
            return obj[1:len(obj) - 4]
        return None
def generate():
    """Export every rdfs:subClassOf triple (with term IDs) to a CSV file.

    The source HDT file is chosen by the first CLI argument: 'lod' selects
    the LOD-a-lot file, anything else selects the DBpedia file.
    """
    # FIX: removed the redundant `= None` pre-assignments and the unused
    # local `id_subClassOf` (the query below uses the URI, not the ID).
    if sys.argv[1] == 'lod':
        hdt_file = HDTDocument(PATH_LOD)
        output_filename = 'all_lod_subClassOf.csv'
    else:
        hdt_file = HDTDocument(PATH_DBpedia)
        output_filename = 'all_dbpedia_subClassOf.csv'
    subClassOf = "http://www.w3.org/2000/01/rdf-schema#subClassOf"
    count = 0
    with open(output_filename, 'w', newline='') as file:
        (subclass_triples, cardinality) = hdt_file.search_triples("", subClassOf, "")
        writer = csv.writer(file)
        writer.writerow(["SUBJECT_ID", "SUBJECT", "OBJECT_ID", "OBJECT"])
        for (s, p, o) in subclass_triples:
            # Store each statement together with both term IDs.
            s_id = hdt_file.convert_term(s, IdentifierPosition.Subject)
            o_id = hdt_file.convert_term(o, IdentifierPosition.Object)
            writer.writerow([s_id, s, o_id, o])
            count += 1
    print('total entries = ', count)
def extract_classes(fn):
    """Map each rdf:type object in the HDT file to its instance count."""
    doc = HDTDocument(fn)
    rdf_type = "http://www.w3.org/1999/02/22-rdf-syntax-ns#type"
    # First pass: collect the distinct class URIs (objects of rdf:type).
    classes = set()
    (typing_triples, total) = doc.search_triples("", rdf_type, "")
    for _subj, _pred, cls in tqdm(typing_triples, total=total):
        classes.add(cls)
    # Second pass: one counting query per class.
    counts = {}
    for cls in tqdm(classes):
        (_instances, instance_count) = doc.search_triples("", rdf_type, cls)
        counts[cls] = instance_count
    return counts
def load_KG(path_file, predicate_string, orientation=True):
    """Load all edges for the given predicate from an HDT file into the
    module-level ``graph``: subject -> object when ``orientation`` is
    truthy, object -> subject otherwise."""
    hdt_file = HDTDocument(path_file)
    (matches, _cardinality) = hdt_file.search_triples('', predicate_string, '')
    for subj, _pred, obj in matches:
        edge = (subj, obj) if orientation else (obj, subj)
        graph.add_edge(*edge)
def get_nb_triples(file_path: str, format: str) -> int:
    """Return the number of triples in an RDF file ('nt' or 'hdt')."""
    if format == 'nt':
        # N-Triples is one triple per line, so a line count suffices.
        return wccount(file_path)
    if format == 'hdt':
        # No extra index needed just to read the total cardinality.
        document = HDTDocument(file_path, indexed=False)
        _iterator, nb_triples = document.search_triples("", "", "")
        return nb_triples
    raise Exception(f'Unsupported RDF format: "{format}"')
def parsefile(self, file_path):
    """Parse an HDT file as an N-Triples file."""
    from hdt import HDTDocument
    # Skip index creation: only the full "? ? ?" scan is needed here.
    document = HDTDocument(file_path, indexed=False)
    self.iterator, _cardinality = document.search_triples("", "", "")
    self.parse()
class HDTFileConnector(DatabaseConnector):
    """A HDTFileConnector search for RDF triples in a HDT file"""

    def __init__(self, file):
        # `file` is the path to the HDT file on disk.
        super(HDTFileConnector, self).__init__()
        self._hdt = HDTDocument(file)

    def search_triples(self, subject, predicate, obj, limit=0, offset=0):
        """
            Get an iterator over all RDF triples matching a triple pattern.
            Args:
                - subject ``string`` - Subject of the triple pattern
                - predicate ``string`` - Predicate of the triple pattern
                - object ``string`` - Object of the triple pattern
                - limit ``int=0`` ``optional`` -  LIMIT modifier, i.e., maximum number of RDF triples to read
                - offset ``int=0`` ``optional`` -  OFFSET modifier, i.e., number of RDF triples to skip
            Returns:
                A Python iterator over RDF triples matching the given triples pattern
        """
        # None or a "?variable" marks an unbound term; pyHDT expects ""
        # as the wildcard, so normalize each position before querying.
        subject = subject if (subject is not None) and (
            not subject.startswith('?')) else ""
        predicate = predicate if (predicate is not None) and (
            not predicate.startswith('?')) else ""
        obj = obj if (obj is not None) and (not obj.startswith('?')) else ""
        return self._hdt.search_triples(subject, predicate, obj,
                                        offset=offset, limit=limit)

    @property
    def nb_triples(self):
        # Total number of triples stored in the HDT file.
        return self._hdt.total_triples

    @property
    def nb_subjects(self):
        """Get the number of subjects in the database"""
        return self._hdt.nb_subjects

    @property
    def nb_predicates(self):
        """Get the number of predicates in the database"""
        return self._hdt.nb_predicates

    @property
    def nb_objects(self):
        """Get the number of objects in the database"""
        return self._hdt.nb_objects

    # NOTE(review): defined without `self` and without @staticmethod -- as
    # written it only works when called on the class itself
    # (HDTFileConnector.from_config(cfg)); calling it on an instance would
    # pass the instance as `config`. Confirm whether @staticmethod was
    # intended (or whether this was a module-level function originally).
    def from_config(config):
        """Build a HDTFileFactory from a config file"""
        if not os.path.isfile(config["file"]):
            raise Exception("Configuration file not found: {}".format(
                config["file"]))
        return HDTFileConnector(config["file"])
def get_rdf_reader(file_path, format='nt'):
    """Get an iterator over RDF triples from a file"""
    iterator = None
    nb_triples = 0
    if format in ('nt', 'ttl'):
        # Standard RDF serializations are parsed eagerly with rdflib.
        g = Graph()
        g.parse(file_path, format=format)
        nb_triples = len(g)
        iterator = map(__n3_to_str, g.triples((None, None, None)))
    elif format == 'hdt':
        # load HDTDocument without additional indexes
        # they are not needed since we only search by "?s ?p ?o"
        doc = HDTDocument(file_path, indexed=False)
        iterator, nb_triples = doc.search_triples("", "", "")
    # Unknown formats fall through to (None, 0), as in the original.
    return iterator, nb_triples
def generate_reduced():
    """Debug helper: dump URIs and term IDs for hand-recorded subClassOf cycles.

    Q1 : retrieve the subClassOf relations, then for each node ID in each
    hard-coded cycle print its URI, its subject-position term ID and its
    object-position term ID so the cycles can be inspected manually.
    """
    hdt_file = None
    output_filename = None
    output_selfloopClass_filename = None
    output_leafClass_filename = None
    output_intermediateClass_filename = None
    if sys.argv[1] == 'lod':
        hdt_file = HDTDocument(PATH_LOD)
        output_filename = 'reduced_lod_subClassOf.csv'
        output_selfloopClass_filename = 'lod_selfloop_classes.csv'
        output_leafClass_filename = 'lod_leaf_classes.csv'
        output_intermediateClass_filename = 'further_reduced_lod_subClassOf.csv'
    else:
        hdt_file = HDTDocument(PATH_DBpedia)
        output_filename = 'dbpedia_subClassOf.csv'
        output_selfloopClass_filename = 'dbpedia_selfloop_classes.csv'
        output_leafClass_filename = 'dbpedia_leaf_classes.csv'
        output_intermediateClass_filename = 'further_reduced_dbpedia_subClassOf.csv'
    # NOTE(review): the output filenames, id_subClassOf and subclass_triples
    # below are computed but never used in this (debug) variant of the
    # function -- presumably leftovers from the reduction variant.
    subClassOf = "http://www.w3.org/2000/01/rdf-schema#subClassOf"
    id_subClassOf = hdt_file.convert_term(
        "http://www.w3.org/2000/01/rdf-schema#subClassOf",
        IdentifierPosition.Predicate)
    (subclass_triples, cardinality) = hdt_file.search_triples("", subClassOf, "")
    # First batch of recorded cycles (each entry is a term ID as a string).
    c1 = ['196338233', '196338418', '196338419']
    c2 = ['196338233', '196338325', '196338412']
    c3 = ['196337995', '196338014', '196338013']
    c4 = ['196338014', '196338063', '196338410']
    cs = [c1, c2, c3, c4]
    for c in cs:
        print('\n\n this cycle = ', c)
        for n in c:
            print('id =', n)
            # Resolve the ID back to a URI, then look up the IDs the same
            # URI gets in subject and in object position.
            name = hdt_file.convert_id(int(n), IdentifierPosition.Subject)
            print('name = ', name)
            s_id = hdt_file.convert_term(name, IdentifierPosition.Subject)
            print('when its subject = ', s_id)
            o_id = hdt_file.convert_term(name, IdentifierPosition.Object)
            print('when its object = ', o_id)
    # NOTE(review): in the collapsed original the placement of this
    # separator is ambiguous; it appears once, between the two batches.
    print('==================================')
    # Second batch of recorded cycles.
    c1 = ['1193056652', '1193056593', '1193056657']
    c2 = ['1146303708', '1146299369', '1146331327']
    c3 = ['196338400', '196338312', '196338288']
    c4 = ['196338013', '196337995', '196338014']
    c5 = ['196338242', '196338410', '196337957']
    c6 = ['196338418', '196338419', '196338233']
    c7 = ['196338233', '196338325', '196338412']
    c8 = ['196338014', '196338063', '196338410']
    c9 = ['196338014', '196337975', '196338007']
    c10 = ['196338050', '196338049', '196337975']
    c11 = ['196338197', '196338462', '196338406']
    c12 = ['196338220', '196338217', '196338034']
    c13 = ['196338145', '196338152', '196338419']
    c14 = ['196338288', '196338116', '196337978']
    c15 = ['196338070', '196338360', '196338241']
    c16 = ['114657709', '114657713', '125181834']
    cs = [c1, c2, c3, c4, c5, c6, c7, c8, c9, c10, c11, c12, c13, c14, c15, c16]
    for c in cs:
        print('\n\n that cycle = ', c)
        for n in c:
            print('id =', n)
            name = hdt_file.convert_id(int(n), IdentifierPosition.Subject)
            print('name = ', name)
            s_id = hdt_file.convert_term(name, IdentifierPosition.Subject)
            print('when its subject = ', s_id)
            o_id = hdt_file.convert_term(name, IdentifierPosition.Object)
            print('when its object = ', o_id)
class HDTAssistedDataStore(MemoryDataStore):
    """MemoryDataStore augmented with a read-only HDT index of the
    archive's turtle metadata: queries consult the HDT file (when present)
    in addition to the in-memory store."""

    def __init__(self, lex=lexicon.standard):
        super(HDTAssistedDataStore, self).__init__(lex=lex)
        # Set by loadMetadata() once a cached HDT file exists.
        self.hdt = None

    def invalidateCachedMetadata(self, zip):
        """Delete the cached HDT file (and its index) for this archive."""
        aff4cache = os.path.join(expanduser("~"), ".aff4")
        # str(zip.urn)[7:] strips the "aff4://" scheme prefix.
        cached_turtle = os.path.join(aff4cache, "%s.hdt" % str(zip.urn)[7:])
        cached_turtle_index = cached_turtle + ".index.v1-1"
        for f in [cached_turtle, cached_turtle_index]:
            if os.path.exists(f):
                LOGGER.debug("Invalidating HDT index %s" % f)
                os.unlink(f)

    def createHDTviaLib(self, zip, cached_turtle):
        """Extract information.turtle from the archive and convert it to HDT."""
        # FIX: create the temp file *before* the try so the finally-unlink
        # below cannot hit an unbound `temp` if NamedTemporaryFile fails.
        temp = tempfile.NamedTemporaryFile(delete=False)
        try:
            LOGGER.debug("Creating HDT index %s" % cached_turtle)
            LOGGER.debug("Creating temp turtle file for import %s" % temp.name)
            try:
                with zip.OpenZipSegment("information.turtle") as fd:
                    streams.WriteAll(fd, temp)
                temp.close()
            except Exception:
                # no turtle yet
                return
            doc = hdt.generate_hdt(temp.name, "aff4://foo")
            retcode = doc.save_to_hdt(cached_turtle)
            if retcode != 0:
                print("rdf2hdt failed", -retcode, file=sys.stderr)
        # FIX: was a bare `except:`; catch Exception so KeyboardInterrupt
        # and friends still propagate untouched.
        except Exception:
            traceback.print_exc()
            raise Exception("rdf2dht failed. Please make data_store.HAS_HDT=False until this is fixed. ")
        finally:
            os.unlink(temp.name)

    def loadMetadata(self, zip):
        # Load the turtle metadata.
        aff4cache = os.path.join(expanduser("~"), ".aff4")
        if not os.path.exists(aff4cache):
            try:
                os.makedirs(aff4cache)
            except OSError as exc:
                # Guard against race condition
                if exc.errno != errno.EEXIST:
                    raise
        cached_turtle = os.path.join(aff4cache, "%s.hdt" % str(zip.urn)[7:])
        if not os.path.exists(cached_turtle):
            self.createHDTviaLib(zip, cached_turtle)
        if os.path.exists(cached_turtle):
            # assume we have a HDT cache of turtle at this point
            self.hdt = HDTDocument(cached_turtle)

    # this implementation currently not tested
    # and it is super ugly. We are materializing all triples just to
    # list all the subjects.
    # TODO: Implement subject iterator in pyHDT
    def QuerySubject(self, graph, subject_regex=None):
        if graph == transient_graph:
            # NOTE(review): this yields the super-call's *generator object*,
            # not its items (same pattern in QueryPredicate and below);
            # `yield from` was probably intended -- confirm before changing.
            yield super(HDTAssistedDataStore, self).QuerySubject(transient_graph, subject_regex)
        subject_regex = re.compile(utils.SmartStr(subject_regex))
        (triples, cardinality) = self.hdt.search_triples("", "?", "?")
        # FIX: was `seen_subject = []` -- lists have no .add(), so the
        # first match raised AttributeError. A set is also what the
        # dedup-by-membership logic wants.
        seen_subject = set()
        for (s, p, o) in triples:
            if subject_regex is not None and subject_regex.match(s):
                if s not in seen_subject:
                    seen_subject.add(s)
                    yield rdfvalue.URN().UnSerializeFromString(s)
        for s in super(HDTAssistedDataStore, self).QuerySubject(graph, subject_regex=subject_regex):
            if s not in seen_subject:
                seen_subject.add(s)
                yield s

    # not yet implemented
    def QueryPredicate(self, graph, predicate):
        if graph == transient_graph:
            yield super(HDTAssistedDataStore, self).QueryPredicate(transient_graph, predicate)
        yield super(HDTAssistedDataStore, self).QueryPredicate(graph, predicate)

    def QueryPredicateObject(self, graph, predicate, object):
        # HDT matches first, then whatever the in-memory store knows.
        (triples, cardinality) = self.hdt.search_triples("", predicate, object)
        for (s, p, o) in triples:
            yield rdfvalue.URN(s)
        for subject in super(HDTAssistedDataStore, self).QueryPredicateObject(graph, predicate, object):
            yield subject

    def Get(self, graph, subject, attribute):
        if self.hdt is None:
            return super(HDTAssistedDataStore, self).Get(graph, subject, attribute)
        # We use a set here as some implementations might pass up an object
        # from both the persisted graph and the transient graph; the set
        # removes the duplicates. (FIX: the original had two branches that
        # both returned list(res); collapsed into one return.)
        res = set(self.QuerySubjectPredicate(graph, subject, attribute))
        return list(res)

    def QuerySubjectPredicate(self, graph, subject, predicate):
        # In-memory results first.
        for o in super(HDTAssistedDataStore, self).QuerySubjectPredicate(graph, subject, predicate):
            yield o
        if self.hdt is None:
            return
        if graph == transient_graph:
            return
        # Normalize both terms to the string form HDT stores.
        if isinstance(subject, rdfvalue.URN):
            subject = subject.SerializeToString()
        else:
            subject = utils.SmartUnicode(subject)
        if isinstance(predicate, rdfvalue.URN):
            predicate = predicate.SerializeToString()
        else:
            predicate = utils.SmartUnicode(predicate)
        (triples, cardinality) = self.hdt.search_triples(subject, predicate, "")
        for (s, p, o) in triples:
            if o.startswith("\""):
                # RDF literal: split off the ^^<datatype> suffix and map the
                # datatype to a registered rdfvalue type.
                # NOTE(review): a plain literal without "^^" would make this
                # unpack raise ValueError -- confirm inputs always carry a
                # datatype.
                (v, t) = o.split("^^")
                v = v.replace("\"", "")
                t = t[1:len(t) - 1]
                datatype = rdflib.URIRef(t)
                if datatype in registry.RDF_TYPE_MAP:
                    o = registry.RDF_TYPE_MAP[datatype](v)
                else:
                    # Default to a string literal.
                    o = rdfvalue.XSDString(v)
            else:
                # "<...>", "aff4://..." and anything else were all mapped to
                # URN by three identical branches in the original.
                o = rdfvalue.URN(utils.SmartUnicode(o))
            yield o

    def SelectSubjectsByPrefix(self, graph, prefix):
        if graph == transient_graph:
            yield super(HDTAssistedDataStore, self).SelectSubjectsByPrefix(transient_graph, prefix)
        yield super(HDTAssistedDataStore, self).SelectSubjectsByPrefix(graph, prefix)

    def QueryPredicatesBySubject(self, graph, subject):
        if graph == transient_graph:
            yield super(HDTAssistedDataStore, self).QueryPredicatesBySubject(transient_graph, subject)
        yield super(HDTAssistedDataStore, self).QueryPredicatesBySubject(graph, subject)
# NOTE(review): this excerpt begins mid-statement -- the convert_term(...)
# call this closing argument belongs to (presumably the one that produced
# `id_type`, used below) started before the visible text.
IdentifierPosition.Predicate)
# Pre-resolve term IDs for predicates that are queried by ID below.
id_sameAs = hdt_lod.convert_term("http://www.w3.org/2002/07/owl#sameAs",
                                 IdentifierPosition.Predicate)
id_subClassOf = hdt_lod.convert_term(
    "http://www.w3.org/2000/01/rdf-schema#subClassOf",
    IdentifierPosition.Predicate)
id_equivalentClass = hdt_lod.convert_term(
    "http://www.w3.org/2002/07/owl#equivalentClass",
    IdentifierPosition.Predicate)
# output some stats of LOD-a-lot
# we can query the HDT file using the term IDs (e.g. rdf:type and
# equivalentClass) or the URIs (e.g. subClassOf and sameAs)
print("# subjects:", "{:,}".format(hdt_lod.nb_subjects))
print("# predicates:", "{:,}".format(hdt_lod.nb_predicates))
print("# objects:", "{:,}".format(hdt_lod.nb_objects))
(triples, cardinality) = hdt_lod.search_triples("", "", "")
print("# triples:", "{:,}".format(cardinality))
# In the ID-based search, 0 acts as the wildcard position.
(triples, cardinality) = hdt_lod.search_triples_ids(0, id_type, 0)
print("# rdf:type statements:", "{:,}".format(cardinality))
(triples, cardinality) = hdt_lod.search_triples(
    "", "http://www.w3.org/2000/01/rdf-schema#subClassOf", "")
print("# rdfs:subClassOf statements:", "{:,}".format(cardinality))
(triples, cardinality) = hdt_lod.search_triples_ids(0, id_equivalentClass, 0)
print("# owl:equivalentClass statements:", "{:,}".format(cardinality))
(triples, cardinality) = hdt_lod.search_triples("", "http://www.w3.org/2002/07/owl#sameAs", "")
print("# owl:sameAs statements:", "{:,}".format(cardinality))
# # triples, cardinality = hdt.search_triples("", subPropertyOf, t) # print ('There are ', cardinality, 'subPropertyOf of owl:transitive properties') # for (s,p ,o) in triples: # print ('subPropertyOf: ', s) # # # triples, cardinality = hdt.search_triples("", subClassOf, t) # print ('There are ', cardinality, 'subclass of owl:transitive properties') # for (s,p ,o) in triples: # print ('subClassOf: ', s) trans_collect = set() inv_collect = set() triples, direct_trans_relations = hdt.search_triples("", type, t) print('There are ', direct_trans_relations, 'as typed by owl:transitive properties') for (s, p, o) in triples: trans_collect.add(str(s)) # # # and another http://www.cyc.com/2003/04/01/cyc#EquivalenceRelation # cyc_eq = 'http://www.cyc.com/2003/04/01/cyc#EquivalenceRelation' # triples, cardinality = hdt.search_triples("", type, cyc_eq) # print ('There are ', cardinality, 'type of cyc#eq properties') # for (s,p ,o) in triples: # trans_collect.add(str(s)) # # print ('So in total that is ', len(trans_collect)) count_trans_rel_triples = 0 for trans_rel in trans_collect:
# NOTE(review): fragment -- `g`, `count`, `collect_triple_owl`,
# `collect_nodes` and `hdt_file` are created before the visible text.
for subj, pred, obj in g:
    count += 1
    collect_triple_owl.add((str(subj), str(pred), str(obj)))
    collect_nodes.add(subj)
    collect_nodes.add(obj)
print('**** In the original OWL scheme ****')
print('there are in total ', len(collect_triple_owl), ' Triples')
print('there are in total ', len(collect_nodes), ' Nodes')

# For every ordered pair of OWL-scheme nodes, look up connecting triples
# in the LOD HDT file (note: O(n^2) pairwise queries).
collect_triple_owl_lod = set()
count_relations_between_nodes = Counter()  # NOTE(review): never updated here
for s in collect_nodes:
    for o in collect_nodes:
        (triples, cardinality) = hdt_file.search_triples(s, '', o)
        # NOTE(review): this inner loop rebinds the outer loop variables
        # `s` and `o`; harmless because the outer `for`s reassign them on
        # the next iteration, but fragile.
        for (s, p, o) in triples:
            # if (s, p ,o) not in collect_triple_owl:
            collect_triple_owl_lod.add((str(s), str(p), str(o)))
print('# collect triple in LOD: ', len(collect_triple_owl_lod))
# Triples that LOD knows but the original OWL scheme does not.
collect_extra = collect_triple_owl_lod.difference(collect_triple_owl)
print('# extra: ', len(collect_extra))
for (s, p, o) in collect_extra:
    print('They are:', s, p, o)
#
# for subj, pred, obj in collect_triple_owl:
#     if pred == subClassOf:
#         print (subj, obj)
def generate_reduced():
    """Reduce the subClassOf hierarchy of LOD-a-lot.

    Q1 : retrieve the subClassOf relations, then:
      step 1 - record self-loop classes and leaf classes (classes with no
               subclass) to their own CSV files;
      step 2 - iteratively remove "near-leaf" intermediate classes whose
               subclasses are all leaves or already-removed intermediates;
      finally, write the surviving subClassOf statements to the
      "further reduced" CSV file.
    """
    hdt_file = HDTDocument(PATH_LOD)
    # FIX: output_filename was used below but its assignment was commented
    # out, so the function crashed with NameError; restored from the
    # commented line.
    output_filename = 'reduced_lod_subClassOf.csv'
    output_selfloopClass_filename = 'lod_reflexive_classes.csv'
    output_leafClass_filename = 'lod_leaf_classes.csv'
    output_intermediateClass_filename = 'further_reduced_lod_subClassOf.csv'
    subClassOf = "http://www.w3.org/2000/01/rdf-schema#subClassOf"
    id_subClassOf = hdt_file.convert_term(
        "http://www.w3.org/2000/01/rdf-schema#subClassOf",
        IdentifierPosition.Predicate)
    count = 0
    count_selfloop = 0
    count_leaf = 0
    count_left = 0  # NOTE(review): never incremented (increment is commented out below)
    count_output_after_further_reduced = 0  # count left of the further reduced
    (subclass_triples, cardinality) = hdt_file.search_triples("", subClassOf, "")
    to_explore_ids = set()          # to iterate through
    leaf_ids = set()
    removed_intermediate_ids = set()  # removed intermediate nodes
    all_ids = set()
    # The original nested the four `with open(...)` statements; a single
    # combined with-statement is equivalent.
    with open(output_filename, 'w', newline='') as file, \
            open(output_intermediateClass_filename, 'w', newline='') as inter_file, \
            open(output_selfloopClass_filename, 'w', newline='') as selfloop_file, \
            open(output_leafClass_filename, 'w', newline='') as leaf_file:
        writer = csv.writer(file)
        writer.writerow(["SUBJECT_ID", "SUBJECT", "OBJECT_ID", "OBJECT"])
        writer_inter = csv.writer(inter_file)
        writer_inter.writerow(["SUBJECT_ID", "SUBJECT", "OBJECT_ID", "OBJECT"])
        # Step 1: remove selfloops and leaf nodes
        writer_selfloop = csv.writer(selfloop_file)
        writer_selfloop.writerow(["ID", "URI"])
        writer_leaf = csv.writer(leaf_file)
        writer_leaf.writerow(["ID", "URI"])
        for (s, p, o) in subclass_triples:
            s_id = hdt_file.convert_term(s, IdentifierPosition.Subject)
            o_id = hdt_file.convert_term(o, IdentifierPosition.Object)
            all_ids.add(s_id)
            all_ids.add(o_id)
            count += 1
            if s == o:
                # self loop
                count_selfloop += 1
                writer_selfloop.writerow([s_id, s])
            else:
                # test if it is a leaf node
                (_, leaf_cardinality) = hdt_file.search_triples("", subClassOf, s)
                if leaf_cardinality == 0:
                    # there is no subclass: this is a leaf node/class
                    writer_leaf.writerow([s_id, s])
                    leaf_ids.add(s_id)
                    count_leaf += 1
                # else:
                #     # write what's left to the file
                #     # SKIP: find intermediate for now
                #     count_left += 1
                #     writer.writerow([s_id, s, o_id, o])
        print('count leaf statements = ', count_leaf)
        print('count leaf (as set) = ', len(leaf_ids))
        print('count total statements = ', count)
        print('count_total nodes (as set) = ', len(all_ids))
        print('NOW Part 2: Further Reduce ')
        # further reduce it
        visited_sup = set()
        count_one = 0
        count_loop = 0
        for l_id in leaf_ids:
            count_loop += 1
            # superclasses of this leaf (ID 0 = wildcard position)
            (leaf_triples, cardinality) = hdt_file.search_triples_ids(l_id, id_subClassOf, 0)
            finished_this_leaf = False
            if cardinality == 1:
                # FIX: iterator.next() is Python 2; use next(iterator).
                (l_id, lp_id, sup_id) = next(leaf_triples)
                (_, sub_cardinality) = hdt_file.search_triples_ids(0, id_subClassOf, sup_id)
                if sub_cardinality == 1:
                    # the superclass has this leaf as its only child: remove it
                    count_one += 1
                    removed_intermediate_ids.add(sup_id)
                    visited_sup.add(sup_id)
                    (supsup_triples, cardinality) = hdt_file.search_triples_ids(sup_id, id_subClassOf, 0)
                    for (sup_id, lp_id, supsup_id) in supsup_triples:
                        to_explore_ids.add(supsup_id)
                    finished_this_leaf = True
            # normal process
            if not finished_this_leaf:
                for (l_id, lp_id, sup_id) in leaf_triples:
                    if sup_id not in visited_sup:
                        (sup_triples, cardinality_back) = hdt_file.search_triples_ids(0, id_subClassOf, sup_id)
                        supflag = True
                        # if this superclass only has leaf nodes
                        if cardinality_back != 1:
                            for (child_id, lp_id, sup_id) in sup_triples:
                                if child_id not in leaf_ids:
                                    # FIX: was `sup_flag = False` -- a typo
                                    # for `supflag`, so the flag was never
                                    # cleared (and the name was undefined).
                                    supflag = False
                                    break
                        if supflag:
                            removed_intermediate_ids.add(sup_id)
                            (supsup_triples, cardinality) = hdt_file.search_triples_ids(sup_id, id_subClassOf, 0)
                            for (sup_id, lp_id, supsup_id) in supsup_triples:
                                to_explore_ids.add(supsup_id)
                        else:
                            to_explore_ids.add(sup_id)
                        visited_sup.add(sup_id)
            if count_loop % 100000 == 0:
                print('leaf nodes processed:', count_loop)
        print('count one = ', count_one)
        print('near-leaf nodes = ', len(removed_intermediate_ids))
        print('total visited nodes = ', len(visited_sup))
        print('non-near-leaf nodes = ', len(visited_sup) - len(removed_intermediate_ids))
        print('to explore = ', len(to_explore_ids))
        print('*********** after this data processing, we have only ', len(to_explore_ids), ' to explore for the next step')
        # finished data-preprocessing; iterate until no progress is made
        record_to_explore_size = len(to_explore_ids)
        record_iteration = 0
        continue_flag = True
        while (len(to_explore_ids) != 0 and continue_flag):
            record_iteration += 1
            n_id = to_explore_ids.pop()
            (triples_id, cardinality) = hdt_file.search_triples_ids(0, id_subClassOf, n_id)
            flag = True
            for (ns_id, np_id, no_id) in triples_id:
                # if each subclass is either a leaf or an already-removed
                # intermediate, then n_id itself can be removed
                if ns_id not in leaf_ids and ns_id not in removed_intermediate_ids:
                    # Keep it for now
                    flag = False
                    break
            if flag:
                # we are sure to remove it
                removed_intermediate_ids.add(n_id)
            else:
                to_explore_ids.add(n_id)  # add back :(
            # every 10000 iterations, stop if the frontier size is stable
            if record_iteration == 10000:
                if record_to_explore_size != len(to_explore_ids):
                    print('total leaf nodes = ', len(leaf_ids))
                    print('accummulated removed intermediate = ', len(removed_intermediate_ids))
                    print('still to explore = ', len(to_explore_ids))
                    print('record to explore = ', record_to_explore_size)
                    print('changed = ', record_to_explore_size - len(to_explore_ids))
                    record_iteration = 0
                    record_to_explore_size = len(to_explore_ids)
                else:
                    continue_flag = False
        # to write down the intermediate removed
        print('*****size of leaf:', len(leaf_ids))
        print('*****size of removed intermediate node :', len(removed_intermediate_ids))
        (subclass_triples, cardinality) = hdt_file.search_triples("", subClassOf, "")
        for (s, p, o) in subclass_triples:
            s_id = hdt_file.convert_term(s, IdentifierPosition.Subject)
            o_id = hdt_file.convert_term(o, IdentifierPosition.Object)
            if s != o:
                # keep the statement only if s survived the reduction
                if (s_id not in leaf_ids) and (s_id not in removed_intermediate_ids):
                    count_output_after_further_reduced += 1
                    writer_inter.writerow([s_id, s, o_id, o])
    print('total entries = ', count)
    print('total self-loops = ', count_selfloop)
    print('total leaf nodes/classes = ', count_leaf)
    print('total left = ', count_left)
    print('perfectage of reduction: ', count_left/count)
    print('=====AFTER FURTHER REDUCTION ======')
    print('There are only ', count_output_after_further_reduced)
    print('perfectage of reduction: ', count_output_after_further_reduced/count)
"Number of predicates": +document.nb_predicates, "Number of objects": +document.nb_objects, "Number of shared subject-object": +document.nb_shared } #Counts undeclared_classes_count, declared_classes_count = 0, 0 declared_properties_count, undeclared_properties_count = 0, 0 declared_individuals_count, reused_individuals_count, linked_individuals_count = 0, 0, 0 sameas_link_count, seeAlso_link_count, differentFrom_link_count, allDifferent_link_count = 0, 0, 0, 0 class_link_count, property_link_count = 0, 0 instanceTyping_link_count = 0 # Fetch all declared classes (triples, cardinality) = document.search_triples( "", "http://www.w3.org/1999/02/22-rdf-syntax-ns#type", "http://www.w3.org/2000/01/rdf-schema#Class") for triple in triples: if triple[0] not in unique_classes: unique_classes.add(triple[0]) declared_classes_count += 1 (triples, cardinality) = document.search_triples( "", "http://www.w3.org/1999/02/22-rdf-syntax-ns#type", "http://www.w3.org/2002/07/owl#Class") for triple in triples: if triple[0] not in unique_classes: unique_classes.add(triple[0]) declared_classes_count += 1 # Fetch all used classes # Fetch all instances of a class
# print("query cardinality", cardinality) # for triple in triples: # print(triple) # Option 1: Extract from mappings # tp_a = ("?s", "http://swrc.ontoware.org/ontology#url", "?o") # tp_b = ("?s", "?p", "http://dh2010.cch.kcl.ac.uk/academic-programme/abstracts/papers/pdf/ab-753.pdf") # iterator = document.search_join([tp_a, tp_b]) # print("estimated join cardinality : %i" % len(iterator)) # for mapping in iterator: # print(mapping) # Option 2: Sequential search for triples musicians = list() edge_list = list() (triples, card) = document.search_triples("", rdf + "type", dbo + "MusicalArtist") for triple in triples: musicians.append(triple[0]) for musician in musicians: (triples, card) = document.search_triples(musician, dbo + "birthPlace", dbr + "Karlsruhe", limit=10) for triple in triples: edge_list.append((musician, triple[1])) print(edge_list) # Construct Graph sequentially, Iterate over edge_list
# PATH_LOD = './broader.hdt'
# Frequently used RDF/OWL term URIs.
# (NOTE: the constant name `type` shadows the builtin; kept because other
# parts of the script reference it by this name.)
type = 'http://www.w3.org/1999/02/22-rdf-syntax-ns#type'
subClassOf = "http://www.w3.org/2000/01/rdf-schema#subClassOf"
equivalent = "http://www.w3.org/2002/07/owl#equivalentClass"
dbpediaPerson = 'http://dbpedia.org/ontology/Person'
foafperson = 'http://xmlns.com/foaf/0.1/Person'
purlHasEarlierVersion = 'http://purl.org/pav/hasEarlierVersion'

PATH_LOD = "/scratch/wbeek/data/LOD-a-lot/data.hdt"
hdt = HDTDocument(PATH_LOD)


def get_domain_and_label(t):
    """Return (registered domain, short label) for a URI.

    The label is the shorter of the last '/'-segment and the last
    '#'-segment of the URI; an empty candidate defers to the other one.
    """
    domain = tldextract.extract(t).domain
    slash_part = t.rsplit('/', 1)[-1]
    hash_part = t.rsplit('#', 1)[-1]
    if not slash_part:
        return (domain, hash_part)
    if not hash_part:
        return (domain, slash_part)
    if len(hash_part) < len(slash_part):
        return (domain, hash_part)
    return (domain, slash_part)


triples, cardinality = hdt.search_triples("", purlHasEarlierVersion, "")
print('There are ', cardinality, 'purlHasEarlierVersion properties')
class EntityLinker(Component, Serializable):
    """
    This class extracts from the knowledge base candidate entities for the entity mentioned in the question
    and then extracts triplets from Wikidata for the extracted entity. Candidate entities are searched in
    the dictionary where keys are titles and aliases of Wikidata entities and values are lists of tuples
    (entity_title, entity_id, number_of_relations). First candidate entities are searched in the dictionary
    by keys where the keys are entities extracted from the question, if nothing is found entities are
    searched in the dictionary using Levenstein distance between the entity and keys (titles) in the
    dictionary.
    """

    def __init__(self, load_path: str,
                 inverted_index_filename: str,
                 entities_list_filename: str,
                 q2name_filename: str,
                 save_path: str = None,
                 q2descr_filename: str = None,
                 rel_ranker: RelRankerBertInfer = None,
                 build_inverted_index: bool = False,
                 kb_format: str = "hdt",
                 kb_filename: str = None,
                 label_rel: str = None,
                 descr_rel: str = None,
                 aliases_rels: List[str] = None,
                 sql_table_name: str = None,
                 sql_column_names: List[str] = None,
                 lang: str = "en",
                 use_descriptions: bool = False,
                 lemmatize: bool = False,
                 use_prefix_tree: bool = False, **kwargs) -> None:
        """
        Args:
            load_path: path to folder with inverted index files
            inverted_index_filename: file with dict of words (keys) and entities containing these words
            entities_list_filename: file with the list of entities from the knowledge base
            q2name_filename: name of file which maps entity id to name
            save_path: path where to save inverted index files
            q2descr_filename: name of file which maps entity id to description
            rel_ranker: component deeppavlov.models.kbqa.rel_ranker_bert_infer
            build_inverted_index: if "true", inverted index of entities of the KB will be built
            kb_format: "hdt" or "sqlite3"
            kb_filename: file with the knowledge base, used for building of inverted index
            label_rel: relation in the KB which connects entity ids and entity titles
            descr_rel: relation in the KB which connects entity ids and entity descriptions
            aliases_rels: list of relations which connect entity ids and entity aliases
            sql_table_name: name of the table with the KB if the KB is in sqlite3 format
            sql_column_names: names of columns with subject, relation and object
            lang: language used
            use_descriptions: whether to use context and descriptions of entities for entity ranking
            lemmatize: whether to lemmatize tokens of extracted entity
            use_prefix_tree: whether to use prefix tree for search of entities with typos in entity labels
            **kwargs:
        """
        super().__init__(save_path=save_path, load_path=load_path)
        self.morph = pymorphy2.MorphAnalyzer()
        self.lemmatize = lemmatize
        self.use_prefix_tree = use_prefix_tree
        self.inverted_index_filename = inverted_index_filename
        self.entities_list_filename = entities_list_filename
        self.build_inverted_index = build_inverted_index
        self.q2name_filename = q2name_filename
        self.q2descr_filename = q2descr_filename
        self.kb_format = kb_format
        self.kb_filename = kb_filename
        self.label_rel = label_rel
        self.aliases_rels = aliases_rels
        self.descr_rel = descr_rel
        self.sql_table_name = sql_table_name
        self.sql_column_names = sql_column_names
        # NOTE(review): `entities_index` is initialized here but the rest of
        # the class reads/writes `entities_list` (see load/save) — this
        # attribute looks dead; confirm.
        self.inverted_index: Optional[Dict[str, List[Tuple[str]]]] = None
        self.entities_index: Optional[List[str]] = None
        self.q2name: Optional[List[Tuple[str]]] = None
        self.lang_str = f"@{lang}"
        if self.lang_str == "@en":
            self.stopwords = set(stopwords.words("english"))
        elif self.lang_str == "@ru":
            self.stopwords = set(stopwords.words("russian"))
        self.re_tokenizer = re.compile(r"[\w']+|[^\w ]")
        self.rel_ranker = rel_ranker
        self.use_descriptions = use_descriptions

        if self.use_prefix_tree:
            # Alphabet covering characters that occur in Wikidata titles.
            alphabet = "!#%\&'()+,-./0123456789:;?ABCDEFGHIJKLMNOPQRSTUVWXYZ_abcdefghijklmnopqrstuvwxyz½¿ÁÄ" + \
                       "ÅÆÇÉÎÓÖ×ÚßàáâãäåæçèéêëíîïðñòóôöøùúûüýāăąćČčĐėęěĞğĩīİıŁłńňŌōőřŚśşŠšťũūůŵźŻżŽžơưșȚțəʻ" + \
                       "ʿΠΡβγБМавдежикмностъяḤḥṇṬṭầếờợ–‘’Ⅲ−∗"
            # NOTE(review): self.inverted_index is still None at this point
            # (it is populated by load()/inverted_index_builder() below), so
            # `.keys()` would raise AttributeError when use_prefix_tree=True.
            # Confirm intended ordering.
            dictionary_words = list(self.inverted_index.keys())
            self.searcher = LevenshteinSearcher(alphabet, dictionary_words)

        if self.build_inverted_index:
            # Open the KB backend, build the index, then persist it.
            if self.kb_format == "hdt":
                self.doc = HDTDocument(str(expand_path(self.kb_filename)))
            if self.kb_format == "sqlite3":
                self.conn = sqlite3.connect(str(expand_path(self.kb_filename)))
                self.cursor = self.conn.cursor()
            self.inverted_index_builder()
            self.save()
        else:
            self.load()

    def load(self) -> None:
        # Load the three pickled artifacts produced by save().
        self.inverted_index = load_pickle(self.load_path / self.inverted_index_filename)
        self.entities_list = load_pickle(self.load_path / self.entities_list_filename)
        self.q2name = load_pickle(self.load_path / self.q2name_filename)

    def save(self) -> None:
        # Persist inverted index, entity list, id->name (and optionally
        # id->description) mappings as pickles.
        save_pickle(self.inverted_index, self.save_path / self.inverted_index_filename)
        save_pickle(self.entities_list, self.save_path / self.entities_list_filename)
        save_pickle(self.q2name, self.save_path / self.q2name_filename)
        if self.q2descr_filename is not None:
            save_pickle(self.q2descr, self.save_path / self.q2descr_filename)

    def __call__(self, entity_substr_batch: List[List[str]],
                 entity_positions_batch: List[List[List[int]]] = None,
                 context_tokens: List[List[str]] = None) -> Tuple[List[List[List[str]]], List[List[List[float]]]]:
        """Link each entity substring in the batch to candidate entity ids
        with confidences. `entity_positions_batch` gives token positions of
        each mention; it defaults to position [0] for every mention."""
        entity_ids_batch = []
        confidences_batch = []
        if entity_positions_batch is None:
            entity_positions_batch = [[[0] for i in range(len(entities_list))] for entities_list in entity_substr_batch]
        for entity_substr_list, entity_positions_list in zip(entity_substr_batch, entity_positions_batch):
            entity_ids_list = []
            confidences_list = []
            for entity_substr, entity_pos in zip(entity_substr_list, entity_positions_list):
                context = ""
                if self.use_descriptions:
                    # Replace the mention span with "[ENT]" so the ranker sees
                    # the surrounding context only.
                    context = ' '.join(context_tokens[:entity_pos[0]]+["[ENT]"]+context_tokens[entity_pos[-1]+1:])
                entity_ids, confidences = self.link_entity(entity_substr, context)
                entity_ids_list.append(entity_ids)
                confidences_list.append(confidences)
            entity_ids_batch.append(entity_ids_list)
            confidences_batch.append(confidences_list)
        return entity_ids_batch, confidences_batch

    def link_entity(self, entity: str, context: str = None) -> Tuple[List[str], List[float]]:
        """Return candidate entity ids and confidences for one mention."""
        confidences = []
        if not entity:
            entities_ids = ['None']
        else:
            candidate_entities = self.candidate_entities_inverted_index(entity)
            candidate_entities, candidate_names = self.candidate_entities_names(entity, candidate_entities)
            entities_ids, confidences, srtd_cand_ent = self.sort_found_entities(candidate_entities, candidate_names, entity, context)
        return entities_ids, confidences

    def candidate_entities_inverted_index(self, entity: str) -> List[Tuple[Any, Any, Any]]:
        """Look up candidate entities for the mention via the token-level
        inverted index, optionally falling back to lemmatized tokens and
        (if enabled) Levenshtein-distance-1 neighbours."""
        word_tokens = nltk.word_tokenize(entity.lower())
        candidate_entities = []
        for tok in word_tokens:
            if len(tok) > 1:  # single-character tokens are too noisy
                found = False
                if tok in self.inverted_index:
                    candidate_entities += self.inverted_index[tok]
                    found = True
                if self.lemmatize:
                    morph_parse_tok = self.morph.parse(tok)[0]
                    lemmatized_tok = morph_parse_tok.normal_form
                    if lemmatized_tok in self.inverted_index:
                        candidate_entities += self.inverted_index[lemmatized_tok]
                        found = True
                if not found and self.use_prefix_tree:
                    # Typo tolerance: accept index keys at edit distance 1.
                    words_with_levens_1 = self.searcher.search(tok, d=1)
                    for word in words_with_levens_1:
                        candidate_entities += self.inverted_index[word[0]]
        candidate_entities = list(set(candidate_entities))
        # Expand (entity_num, num_rels) pairs into (num, id, num_rels).
        candidate_entities = [(entity[0], self.entities_list[entity[0]], entity[1]) for entity in candidate_entities]
        return candidate_entities

    def sort_found_entities(self, candidate_entities: List[Tuple[int, str, int]],
                            candidate_names: List[List[str]],
                            entity: str, context: str = None) -> Tuple[List[str], List[float], List[Tuple[str, str, int, int]]]:
        """Rank candidates by fuzzy name match, then relation count; when
        descriptions are enabled, re-rank the top 30 with the BERT ranker."""
        entities_ratios = []
        for candidate, entity_names in zip(candidate_entities, candidate_names):
            entity_num, entity_id, num_rels = candidate
            fuzz_ratio = max([fuzz.ratio(name.lower(), entity) for name in entity_names])
            entities_ratios.append((entity_num, entity_id, fuzz_ratio, num_rels))
        srtd_with_ratios = sorted(entities_ratios, key=lambda x: (x[2], x[3]), reverse=True)
        if self.use_descriptions:
            num_to_id = {entity_num: entity_id for entity_num, entity_id, _, _ in srtd_with_ratios[:30]}
            entity_numbers = [entity_num for entity_num, _, _, _ in srtd_with_ratios[:30]]
            scores = self.rel_ranker.rank_rels(context, entity_numbers)
            top_rels = [score[0] for score in scores]
            entity_ids = [num_to_id[num] for num in top_rels]
            confidences = [score[1] for score in scores]
        else:
            entity_ids = [ent[1] for ent in srtd_with_ratios]
            # Map 0-100 fuzz ratio to a 0.0-1.0 confidence.
            confidences = [float(ent[2]) * 0.01 for ent in srtd_with_ratios]
        return entity_ids, confidences, srtd_with_ratios

    def candidate_entities_names(self, entity: str,
                                 candidate_entities: List[Tuple[int, str, int]]) -> Tuple[List[Tuple[int, str, int]], List[List[str]]]:
        """Fetch title + aliases for each candidate; drop candidates whose
        title is more than 6x longer than the mention (cheap pre-filter)."""
        entity_length = len(entity)
        candidate_names = []
        candidate_entities_filter = []
        for candidate in candidate_entities:
            entity_num = candidate[0]
            entity_id = candidate[1]
            entity_names = []
            entity_names_found = self.q2name[entity_num]
            if len(entity_names_found[0]) < 6 * entity_length:
                entity_name = entity_names_found[0]
                entity_names.append(entity_name)
                if len(entity_names_found) > 1:
                    for alias in entity_names_found[1:]:
                        entity_names.append(alias)
                candidate_names.append(entity_names)
                candidate_entities_filter.append(candidate)
        return candidate_entities_filter, candidate_names

    def inverted_index_builder(self) -> None:
        """Build the token -> [(entity_num, popularity)] inverted index plus
        the entity list, id->name and id->description mappings from the KB
        (HDT or sqlite3 backend)."""
        log.debug("building inverted index")
        entities_set = set()
        id_to_label_dict = defaultdict(list)
        # NOTE(review): this is a plain dict, but descriptions are added via
        # `id_to_descr_dict[...].append(...)` below, which raises KeyError on
        # a missing key — presumably this should be defaultdict(list); confirm.
        id_to_descr_dict = {}
        label_to_id_dict = {}
        label_triplets = []
        alias_triplets_list = []
        descr_triplets = []
        if self.kb_format == "hdt":
            label_triplets, c = self.doc.search_triples("", self.label_rel, "")
            if self.aliases_rels is not None:
                for alias_rel in self.aliases_rels:
                    alias_triplets, c = self.doc.search_triples("", alias_rel, "")
                    alias_triplets_list.append(alias_triplets)
            if self.descr_rel is not None:
                descr_triplets, c = self.doc.search_triples("", self.descr_rel, "")
        if self.kb_format == "sqlite3":
            subject, relation, obj = self.sql_column_names
            query = f'SELECT {subject}, {relation}, {obj} FROM {self.sql_table_name} WHERE {relation} = "{self.label_rel}";'
            res = self.cursor.execute(query)
            label_triplets = res.fetchall()
            if self.aliases_rels is not None:
                for alias_rel in self.aliases_rels:
                    query = f'SELECT {subject}, {relation}, {obj} FROM {self.sql_table_name} WHERE {relation} = "{alias_rel}";'
                    res = self.cursor.execute(query)
                    alias_triplets = res.fetchall()
                    alias_triplets_list.append(alias_triplets)
            if self.descr_rel is not None:
                query = f'SELECT {subject}, {relation}, {obj} FROM {self.sql_table_name} WHERE {relation} = "{self.descr_rel}";'
                res = self.cursor.execute(query)
                descr_triplets = res.fetchall()
        # Collect entity ids and language-filtered labels/aliases.
        for triplets in [label_triplets] + alias_triplets_list:
            for triplet in triplets:
                entities_set.add(triplet[0])
                if triplet[2].endswith(self.lang_str):
                    label = triplet[2].replace(self.lang_str, '').replace('"', '')
                    id_to_label_dict[triplet[0]].append(label)
                    label_to_id_dict[label] = triplet[0]
        for triplet in descr_triplets:
            entities_set.add(triplet[0])
            if triplet[2].endswith(self.lang_str):
                descr = triplet[2].replace(self.lang_str, '').replace('"', '')
                id_to_descr_dict[triplet[0]].append(descr)
        # Popularity of an entity = number of triples with it as subject.
        popularities_dict = {}
        for entity in entities_set:
            if self.kb_format == "hdt":
                all_triplets, number_of_triplets = self.doc.search_triples(entity, "", "")
                popularities_dict[entity] = number_of_triplets
            if self.kb_format == "sqlite3":
                subject, relation, obj = self.sql_column_names
                query = f'SELECT COUNT({obj}) FROM {self.sql_table_name} WHERE {subject} = "{entity}";'
                res = self.cursor.execute(query)
                popularities_dict[entity] = res.fetchall()[0][0]
        entities_dict = {entity: n for n, entity in enumerate(entities_set)}
        inverted_index = defaultdict(list)
        for label in label_to_id_dict:
            tokens = re.findall(self.re_tokenizer, label.lower())
            for tok in tokens:
                if len(tok) > 1 and tok not in self.stopwords:
                    inverted_index[tok].append((entities_dict[label_to_id_dict[label]], popularities_dict[label_to_id_dict[label]]))
        self.inverted_index = dict(inverted_index)
        self.entities_list = list(entities_set)
        self.q2name = [id_to_label_dict[entity] for entity in self.entities_list]
        self.q2descr = []
        if id_to_descr_dict:
            # NOTE(review): raises KeyError for entities without a
            # description since id_to_descr_dict is a plain dict; confirm.
            self.q2descr = [id_to_descr_dict[entity] for entity in self.entities_list]
# Export every narrowerTransitive triple in N-Triples syntax.
# IRIs are wrapped in angle brackets; literals (values starting with '"')
# are written verbatim. `hdt`, `narrowerTransitive` and `file_narrower`
# are module-level names defined elsewhere in this file.
triples, cardinality = hdt.search_triples("", narrowerTransitive, "")
print('There are ', cardinality, 'narrowerTransitive properties')
for (s, p, o) in triples:
    subject_term = s if s[0] == '"' else '<' + s + '>'
    object_term = o if o[0] == '"' else '<' + o + '>'
    predicate_term = '<' + p + '>'
    file_narrower.write(subject_term + ' ' + predicate_term + ' ' + object_term + '.\n')
    # file_integrated.write(s +' '+ p+ ' ' + o + '.\n' )
class WikiParser:
    """This class extract relations, objects or triplets from Wikidata HDT file"""

    def __init__(self, wiki_filename: str, lang: str = "@en", **kwargs) -> None:
        """
        Args:
            wiki_filename: hdt file with wikidata
            lang: Russian or English language (language-tag suffix, e.g. "@en")
            **kwargs:
        """
        log.debug(f'__init__ wiki_filename: {wiki_filename}')
        wiki_path = expand_path(wiki_filename)
        self.description_rel = "http://schema.org/description"
        self.lang = lang
        self.document = HDTDocument(str(wiki_path))

    def __call__(self, what_return: List[str],
                 query_seq: List[List[str]],
                 filter_info: List[Tuple[str]],
                 order_info: namedtuple) -> List[List[str]]:
        """Execute a sequence of triple patterns against the HDT document.

        Example: "What is the deepest lake in Russia?" becomes
        what_return=["?obj"], query_seq of three patterns binding ?ent/?obj,
        filter_info=[] and order_info(variable='?obj', sorting_order='asc').
        Patterns are evaluated left to right: the first pattern seeds the
        variable bindings ("combs"); each later pattern is instantiated with
        every existing binding and extends it with newly bound variables.
        """
        extended_combs = []
        combs = []
        for n, query in enumerate(query_seq):
            # Positions of variables ("?x") within this triple pattern.
            unknown_elem_positions = [(pos, elem) for pos, elem in enumerate(query) if elem.startswith('?')]
            if n == 0:
                combs = self.search(query, unknown_elem_positions)
                # combs = [{"?ent": "http://www.wikidata.org/entity/Q5513"}, ...]
            else:
                if combs:
                    known_elements = []
                    extended_combs = []
                    for elem in query:
                        if elem in combs[0].keys():
                            known_elements.append(elem)
                    for comb in combs:
                        known_values = [comb[known_elem] for known_elem in known_elements]
                        # NOTE(review): each filled_query substitutes only ONE
                        # known element at a time; with several known variables
                        # in a pattern the others stay as "?x" (wildcards).
                        # Preserved as-is — confirm this is intended.
                        for known_elem, known_value in zip(known_elements, known_values):
                            filled_query = [elem.replace(known_elem, known_value) for elem in query]
                            new_combs = self.search(filled_query, unknown_elem_positions)
                            for new_comb in new_combs:
                                extended_combs.append({**comb, **new_comb})
                    combs = extended_combs
        if combs:
            if filter_info:
                for filter_elem, filter_value in filter_info:
                    combs = [comb for comb in combs if filter_value in comb[filter_elem]]
            if order_info.variable is not None:
                reverse = True if order_info.sorting_order == "desc" else False
                sort_elem = order_info.variable
                # Sort numerically on the literal value before any "^^" type tag.
                combs = sorted(combs, key=lambda x: float(x[sort_elem].split('^^')[0].strip('"')), reverse=reverse)
                combs = [combs[0]]
            if what_return[-1].startswith("count"):
                combs = [[combs[0][key] for key in what_return[:-1]] + [len(combs)]]
            else:
                combs = [[elem[key] for key in what_return] for elem in combs]
        return combs

    def search(self, query: List[str],
               unknown_elem_positions: List[Tuple[int, str]]) -> List[Dict[str, str]]:
        """Run one triple pattern; variables become wildcards. Returns one
        {variable: value} dict per matching triple. Description triples are
        filtered to the configured language."""
        query = list(map(lambda elem: "" if elem.startswith('?') else elem, query))
        subj, rel, obj = query
        triplets, c = self.document.search_triples(subj, rel, obj)
        if rel == self.description_rel:
            triplets = [triplet for triplet in triplets if triplet[2].endswith(self.lang)]
        combs = [{elem: triplet[pos] for pos, elem in unknown_elem_positions} for triplet in triplets]
        return combs

    def find_label(self, entity: str) -> str:
        """Return a human-readable label for `entity`, which may be a
        Wikidata id ("Q5513"), an entity IRI, a tagged literal
        ('"Lake Baikal"@en'), a typed literal ("...^^<xsd:...>") or a plain
        number. Returns "Not Found" when no label can be derived."""
        entity = str(entity).replace('"', '')
        if entity.startswith("Q"):  # example: "Q5513"
            entity = "http://www.wikidata.org/entity/" + entity
        if entity.startswith("http://www.wikidata.org/entity/"):
            labels, cardinality = self.document.search_triples(
                entity, "http://www.w3.org/2000/01/rdf-schema#label", "")
            for label in labels:
                if label[2].endswith(self.lang):
                    # Fix: slice off the language-tag suffix instead of
                    # str.strip(), which strips a character SET and corrupted
                    # labels starting/ending with '@', 'e' or 'n'.
                    found_label = label[2][:-len(self.lang)].replace('"', '')
                    return found_label
        elif entity.endswith(self.lang):  # entity: 'Lake Baikal@en'
            # Same suffix-vs-charset fix as above.
            entity = entity[:-len(self.lang)]
            return entity
        elif "^^" in entity:
            # Typed literals, e.g. '"1799-06-06T00:00:00Z"^^<xsd:dateTime>'
            # (date) or '"+1642"^^<xsd:decimal>' (number).
            entity = entity.split("^^")[0]
            for token in ["T00:00:00Z", "+"]:
                entity = entity.replace(token, '')
            return entity
        elif entity.isdigit():
            return entity
        return "Not Found"

    def find_alias(self, entity: str) -> List[str]:
        """Return all skos:altLabel aliases of an entity IRI in the
        configured language."""
        aliases = []
        if entity.startswith("http://www.wikidata.org/entity/"):
            labels, cardinality = self.document.search_triples(
                entity, "http://www.w3.org/2004/02/skos/core#altLabel", "")
            # Fix: remove the language-tag suffix by slicing (strip() removed
            # a character set and mangled aliases ending in e/n/@).
            aliases = [label[2][:-len(self.lang)].strip('"')
                       for label in labels if label[2].endswith(self.lang)]
        return aliases

    def find_rels(self, entity: str, direction: str, rel_type: str = "no_type") -> List[str]:
        """Return Wikidata property IRIs attached to `entity`, outgoing
        ("forw") or incoming (any other value); optionally restricted to a
        property namespace ("direct", "statement", ...)."""
        if direction == "forw":
            triplets, num = self.document.search_triples(
                f"http://www.wikidata.org/entity/{entity}", "", "")
        else:
            triplets, num = self.document.search_triples(
                "", "", f"http://www.wikidata.org/entity/{entity}")
        if rel_type != "no_type":
            start_str = f"http://www.wikidata.org/prop/{rel_type}"
        else:
            start_str = "http://www.wikidata.org/prop/P"
        rels = [triplet[1] for triplet in triplets if triplet[1].startswith(start_str)]
        return rels
# if s[0] == '"': # s = s # else: # s = '<' + s + '>' # if o[0] == '"': # o = o # else: # o = '<' + o + '>' # p = '<' + p + '>' # file_narrower.write(s +' '+ p+ ' ' + o + '.\n' ) # # file_integrated.write(s +' '+ p+ ' ' + o + '.\n' ) # # # file.close() # narrowerGraph.serialize(destination='narrower.nt', format='nt') # triples, cardinality = hdt.search_triples("", broader, "") print('There are ', cardinality, 'isPartOf properties') for (s, p, o) in triples: if s[0] == '"': if s.count('"') > 2 or '%' in s or '\'' in s or '\\' in s: continue s = s else: s = '<' + s + '>' if o[0] == '"': if o.count('"') > 2 or '%' in o or '\'' in o or '\\' in o: continue o = o else: o = '<' + o + '>' p = '<' + p + '>'
class HDT_Dataset():
    """Thin wrapper around an HDT file exposing triple cardinality, random
    sampling of triples/subjects, and edge-export helpers.

    Relies on module-level names defined elsewhere in this file:
    `HDTDocument`, `tuple_to_triple`, `tuple_to_ntriple`, `URIRef`, `logger`.
    """

    def __init__(self, **kwargs):
        """Open the HDT file given as kwargs['file'].

        Raises:
            TypeError: if no 'file' keyword argument was supplied.
            Exception: re-raised (after logging) if the HDT file cannot load.
        """
        self.__source = kwargs.get("file", None)
        if self.__source is None:
            raise TypeError()
        try:
            self.document = HDTDocument(self.__source)
        except Exception as e:
            logger.exception("Could not load HDT File from {}.".format(
                self.__source))
            raise e
        self.card = None  # lazily cached total triple count

    def __str__(self):
        return str(self.__source)

    def __len__(self):
        """Total number of triples, queried once and then cached."""
        # Fix: compare against None (the old truthiness test re-queried
        # forever on an empty dataset) and return the cached value directly
        # instead of recursing.
        if self.card is None:
            (_, self.card) = self.document.search_triples("", "", "")
        return self.card

    @property
    def distinct_subjects(self):
        """Number of distinct subjects in the dataset."""
        return self.document.nb_subjects

    def random_subjects(self, size=100, weighted=True):
        """Sample `size` distinct URI subjects.

        Subjects are drawn by picking random triples, so with weighted=True
        high-degree subjects are proportionally more likely. With
        weighted=False, rejection sampling (accept with probability 1/degree)
        approximates a uniform choice over subjects.

        Raises:
            Exception: if `size` exceeds the number of triples.
        """
        logger.info(f"Generating a random sample, weighted = {weighted}")
        cardinality = len(self)
        sample = set()
        if size > len(self):
            raise Exception("Sample size exceeds dataset size")
        while len(sample) < size:
            offset = random.randint(0, cardinality - 1)
            (triples, res_card) = self.document.search_triples(
                "", "", "", limit=1, offset=offset)
            subject = tuple_to_triple(next(triples))[0]
            if weighted:
                if isinstance(subject, URIRef):
                    sample.add(subject)
            else:
                # Degree of the subject = number of its outgoing triples.
                (ts, subject_degree) = self.document.search_triples(
                    subject, "", "")
                # Assume the minimum degree of all subjects = 1; the minimum
                # probability of a subject being chosen is 1/nb_subjects.
                min_degree = 1
                min_probability = min_degree / self.document.nb_subjects
                # Acceptance probability of the current subject (= 1/degree).
                p = min_probability / (subject_degree / self.document.nb_subjects)
                r = random.random()
                if r < p:
                    if isinstance(subject, URIRef):
                        sample.add(subject)
        return sample

    def random_sample(self, size=100):
        """Return `size` distinct triples drawn at random by offset.

        Raises:
            Exception: if `size` exceeds the number of triples.
        """
        cardinality = len(self)
        sample = set()
        if size > len(self):
            raise Exception("Sample size exceeds dataset size")
        while len(sample) < size:
            offset = random.randint(0, cardinality - 1)
            (triples, res_card) = self.document.search_triples(
                "", "", "", limit=1, offset=offset)
            sample.add(tuple_to_triple(next(triples)))
        return list(sample)

    def outgoing_edges(self, terms, **kwargs):
        """Write all triples whose subject is in `terms` to kwargs['file']
        as N-Triples; return the total number of matching triples."""
        file = kwargs.get("file", None)
        total_cardinality = 0
        for term in terms:
            (triples, cardinality) = self.document.search_triples(str(term), "", "")
            for triple in triples:
                file.write(tuple_to_ntriple(triple))
            total_cardinality += cardinality
        return total_cardinality

    def random_edge(self, subject):
        """Return the URIRef object of a random outgoing edge of `subject`,
        or None if the subject has no edges (or the chosen edge's object is
        not a URI)."""
        (triples, cardinality) = self.document.search_triples(str(subject), "", "")
        if cardinality == 0:
            return None
        # Fix: randint is inclusive at both ends; an offset equal to
        # `cardinality` yielded an empty iterator and a spurious None.
        random_offset = random.randint(0, cardinality - 1)
        (triples, cardinality) = self.document.search_triples(
            str(subject), "", "", limit=1, offset=random_offset)
        for triple in triples:
            obj = tuple_to_triple(triple)[2]  # renamed: `object` shadowed the builtin
            if isinstance(obj, URIRef):
                return obj
        return None
class SubP:
    """Builds a directed graph of rdfs:subPropertyOf edges from an HDT dump,
    diagnoses cycles in it (self-loops, equivalence pairs, longer cycles) and
    exports suggestions for which edges to remove.

    NOTE(review): relies on module-level names defined elsewhere in this
    file: PATH_LOD, PATH_EQ, HDTDocument, IdentifierPosition, nx, csv,
    tldextract, Graph, URIRef and `file_name` (used in export_cycle) —
    confirm all are in scope before running.
    """

    # Initializer / Instance Attributes
    def __init__(self, path_hdt=PATH_LOD, path_eq=PATH_EQ):
        self.hdt = HDTDocument(path_hdt)
        # Term IRIs together with their pre-resolved HDT predicate ids.
        self.subClassOf = "http://www.w3.org/2000/01/rdf-schema#subClassOf"
        self.id_subClassOf = self.hdt.convert_term(
            "http://www.w3.org/2000/01/rdf-schema#subClassOf",
            IdentifierPosition.Predicate)
        self.equivalent = "http://www.w3.org/2002/07/owl#equivalentClass"
        self.id_equivalentClass = self.hdt.convert_term(
            "http://www.w3.org/2002/07/owl#equivalentClass",
            IdentifierPosition.Predicate)
        self.subPropertyOf = "http://www.w3.org/2000/01/rdf-schema#subPropertyOf"
        self.id_subPropertyOf = self.hdt.convert_term(
            "http://www.w3.org/2000/01/rdf-schema#subPropertyOf",
            IdentifierPosition.Predicate)
        self.equivalentProperty = "http://www.w3.org/2002/07/owl#equivalentProperty"
        self.id_equivalentProperty = self.hdt.convert_term(
            "http://www.w3.org/2002/07/owl#equivalentProperty",
            IdentifierPosition.Predicate)
        self.graph = nx.DiGraph()
        self.equi_graph_manager = None  # equiClassManager(path_eq)
        print('set up the equivalence class manager')
        self.diagnosed_relations = []  # the result
        # From the manual decision step and Joe's sameAs data (triples).
        self.suggestion_on_relations = []
        self.leaf_classes = set()
        print('finished initialization')

    def setup_graph(self):
        """Populate self.graph with one edge per subPropertyOf triple."""
        print('set up the graph')
        (subclass_triple_ids, cardinality) = self.enquiry(
            query=(0, self.id_subPropertyOf, 0), mode="default")
        collect_pairs = []
        for (s_id, _, o_id) in subclass_triple_ids:
            # add to the directed graph
            collect_pairs.append((s_id, o_id))
        print('there are ', len(collect_pairs), 'edges')
        self.graph.add_edges_from(collect_pairs)

    def convert_to_id(self, term):
        """Term IRI -> HDT subject id (special-casing akt742:Intangible-Thing)."""
        # akt742:Intangible-Thing is the only class in this dump with two
        # different ids (as subject and as object), hence the hard-coded id.
        if term == "akt742:Intangible-Thing":
            return 2601100675
        else:
            return self.hdt.convert_term(term, IdentifierPosition.Subject)

    def convert_to_term(self, id):
        """HDT subject id -> term IRI (inverse of convert_to_id)."""
        if id == 2601100675:
            # this is the only one that has two different ids (as subject and object)
            return "akt742:Intangible-Thing"
        else:
            return self.hdt.convert_id(id, IdentifierPosition.Subject)

    def enquiry(self, query, mode="subp"):
        """Run an id-based triple-pattern search.

        Only mode="default" is implemented; any other mode is an unfinished
        placeholder and returns None.
        """
        (s, p, o) = query
        if mode == "default":
            return self.hdt.search_triples_ids(s, p, o)
        else:
            # examine the filtered part first
            pass

    def print_info(self, sbj, obj):
        """Print labels/comments of both terms to support a manual decision."""
        predicate_names = [
            "http://sw.cyc.com/CycAnnotations_v1#label",
            "http://www.w3.org/2000/01/rdf-schema#comment",
            "http://www.w3.org/2000/01/rdf-schema#label"
        ]
        s_domain = tldextract.extract(sbj).domain
        o_domain = tldextract.extract(obj).domain
        # filter that domain
        # if (s_domain != DOMAIN and o_domain != DOMAIN):
        #     print (DOMAIN)
        print('SUBJECT: ', sbj)
        for p in predicate_names:
            (triples, cardinality) = self.hdt.search_triples(sbj, p, "")
            for (s, p, o) in triples:
                print('\tPREDICATE: ', p)
                print('\t\t Comments/labels :', o, '\n')
        print('OBJECT: ', obj)
        for p in predicate_names:
            (triples, cardinality) = self.hdt.search_triples(obj, p, "")
            for (s, p, o) in triples:
                print('\tPREDICATE: ', p)
                print('\t\t Comments/labels :', o, '\n')
        print('\n\n========================\n\n')

    def export_cycle(self):
        """Classify all simple cycles of the graph and export edge-removal
        suggestions to CSV.

        Decision codes written: 'o' self-loop, 'e' equivalentProperty pair,
        '2' plain 2-cycle (both directions), 'x' edge from a longer cycle.
        NOTE(review): `file_name` is not defined in this class — presumably
        a module-level constant; confirm it exists before calling.
        """
        simp_c = list(nx.simple_cycles(self.graph))
        print('find simple cycle in graph')
        print('there are ', len(simp_c), ' simple cycles')
        count1 = 0
        count_others = 0
        count_sameas = 0
        count_eqProp = 0
        count_bigger = 0
        collect_self_loop = []
        collect_eq = []
        collect_others = []
        collect_bigger = []
        for c in simp_c:
            if len(c) == 1:
                count1 += 1
                collect_self_loop.append(c)
            elif len(c) == 2:
                # print (c)
                # for n in c:
                #     t = self.convert_to_term(n)
                #     print ('\t', t)
                # print ('\n')
                l_term = self.convert_to_term(c[0])
                r_term = self.convert_to_term(c[1])
                # Is the 2-cycle explained by an explicit
                # owl:equivalentProperty statement between the two terms?
                (subclass_triple_ids, cardinality) = self.enquiry(
                    query=(c[0], self.id_equivalentProperty, c[1]),
                    mode="default")
                # if (self.equi_graph_manager.test_equivalent(l_term, r_term)):
                #     print ('There is a owl:sameAs relation in between')
                #     count_sameas += 1
                #     collect_eq.append(c)
                if (cardinality > 0):
                    print('There is a owl:equivalentProperty in between')
                    count_eqProp += 1
                    collect_eq.append(c)
                else:
                    # self.print_info(c[0], l_term, c[1], r_term)
                    # print ('a longer one for manual decision:',c )
                    # count_others += 1
                    collect_others.append(c)
                    count_others += 1
            else:
                count_bigger += 1
                # NOTE(review): only the first three node pairs are recorded
                # and (c[2], c[0]) closes the cycle only when len(c) == 3 —
                # for longer cycles these pairs are not all real edges and
                # the remaining edges are dropped. Confirm cycles here are
                # expected to have length 3.
                collect_bigger.append((c[0], c[1]))
                collect_bigger.append((c[1], c[2]))
                collect_bigger.append((c[2], c[0]))
        print('there are ', count1, ' reflexive cycles')
        print('there are ', count_sameas, ' sameAs relations')
        print('there are ', count_eqProp, ' eqProp relations')
        print('there are ', count_others, ' size-two cycles')
        print('there are ', count_bigger, ' bigger cycles')
        # export self-loop cycles:
        with open(file_name, 'w', newline='') as file:
            writer = csv.writer(file)
            writer.writerow([
                "SUBJECT_ID", "SUBJECT", "OBJECT_ID", "OBJECT", "SUGGESTION",
                "DECISION"
            ])
            # write to file
            # print ('collect self loop: ',collect_self_loop)
            for [s_id] in collect_self_loop:
                # convert
                s_term = self.convert_to_term(s_id)
                o_term = s_term
                # removed from automatic method
                writer.writerow([s_id, s_term, s_id, o_term, 'remove', 'o'])
            for (s_id, o_id) in collect_eq:
                # convert
                s_term = self.convert_to_term(s_id)
                o_term = self.convert_to_term(o_id)
                # removed from automatic method
                writer.writerow([s_id, s_term, o_id, o_term, 'remove', 'e'])
            for (s_id, o_id) in collect_others:
                s_term = self.convert_to_term(s_id)
                o_term = self.convert_to_term(o_id)
                self.print_info(s_term, o_term)
                # removed from manual step (both directions of the 2-cycle)
                writer.writerow([s_id, s_term, o_id, o_term, 'remove', '2'])
                writer.writerow([o_id, o_term, s_id, s_term, 'remove', '2'])
            for (s_id, o_id) in collect_bigger:
                s_term = self.convert_to_term(s_id)
                o_term = self.convert_to_term(o_id)
                # print ('===a longer cycle ===', c)
                # removed from manual step
                writer.writerow([s_id, s_term, o_id, o_term, 'remove', 'x'])

    def load_removed(self):
        """Remove from the graph every edge marked 'remove' in pre-subP.csv."""
        # 'pre-subP.csv'
        subp_file = open('pre-subP.csv', 'r')
        reader = csv.DictReader(subp_file)
        coll_removed = []
        for row in reader:
            s_id = int(row["SUBJECT_ID"])
            # s = row["SUBJECT"]
            o_id = int(row["OBJECT_ID"])
            sug = row["SUGGESTION"]
            # should be remove
            if (sug == 'remove'):
                coll_removed.append((s_id, o_id))
        print('number of removed edges:', len(coll_removed))
        self.graph.remove_edges_from(coll_removed)

    def test_cycle(self):
        """Report whether any cycle remains in the graph (prints result)."""
        try:
            c = nx.find_cycle(self.graph)
            # change to simple_cycles ??
            print('cycle = ', c)
        except Exception as e:
            # nx.find_cycle raises NetworkXNoCycle when the graph is acyclic
            # hint_not_working = True
            print('no cycle')

    def export_graph_nt(self, name):
        """Serialize the remaining subPropertyOf edges to an N-Triples file."""
        g = Graph()
        for (s_id, o_id) in self.graph.edges:
            s_term = self.convert_to_term(s_id)
            o_term = self.convert_to_term(o_id)
            bob = URIRef("http://www.w3.org/2000/01/rdf-schema#subPropertyOf")
            g.add((URIRef(s_term), bob, URIRef(o_term)))
        # print("--- printing raw triples ---")
        # for s, p, o in g:
        #     print((s, p, o))
        g.serialize(destination=name, format='nt')
import csv
import urllib
from hdt import HDTDocument
import pandas as pd
from constants import SEP
from settings import (HDT_FILE, DATASET_FILE, OUTPUT_DATASET_FILE, STATS_FILE,
                      PREDICATES_EXCLUDED, QUERY, RATIO)
from functions import get_sujeto_atr, get_predicado_atr, get_objeto_atr

# HDTDocument creation
document = HDTDocument(HDT_FILE)

# Query the triples by subject/predicate/object pattern: all triples whose
# object equals the configured QUERY value.
(triples, cardinality) = document.search_triples("", "", QUERY)


def query(query):
    # NOTE(review): the parameter shadows this function's own name and
    # `cardinality` is read from module scope — this only reports the count
    # of the module-level query above, whatever argument is passed.
    print("{}: {} objetos.".format(query, cardinality))


# %% Processing
# Each triple unpacks as (subject, predicate, object); collect the subject
# URIs (with surrounding angle brackets removed via [1:-1]).
lista_objetos = []
for triple in triples:
    s, p, o = triple
    sujeto_descripcion, sujeto_URI = get_sujeto_atr(s)
    lista_objetos.append(sujeto_URI[1:-1])

numero = 0
class KBEntityLinker(Component, Serializable):
    """
    This class extracts from the knowledge base candidate entities for the entity mentioned in the question and then
    extracts triplets from Wikidata for the extracted entity. Candidate entities are searched in the dictionary where
    keys are titles and aliases of Wikidata entities and values are lists of tuples (entity_title, entity_id,
    number_of_relations). First candidate entities are searched in the dictionary by keys where the keys are entities
    extracted from the question, if nothing is found entities are searched in the dictionary using Levenstein distance
    between the entity and keys (titles) in the dictionary.
    """

    def __init__(self, load_path: str,
                 inverted_index_filename: str,
                 entities_list_filename: str,
                 q2name_filename: str,
                 who_entities_filename: Optional[str] = None,
                 save_path: str = None,
                 q2descr_filename: str = None,
                 descr_rank_score_thres: float = 0.0,
                 freq_dict_filename: Optional[str] = None,
                 entity_ranker: RelRankerBertInfer = None,
                 build_inverted_index: bool = False,
                 kb_format: str = "hdt",
                 kb_filename: str = None,
                 label_rel: str = None,
                 descr_rel: str = None,
                 aliases_rels: List[str] = None,
                 sql_table_name: str = None,
                 sql_column_names: List[str] = None,
                 lang: str = "en",
                 use_descriptions: bool = False,
                 include_mention: bool = False,
                 lemmatize: bool = False,
                 use_prefix_tree: bool = False, **kwargs) -> None:
        """
        Args:
            load_path: path to folder with inverted index files
            inverted_index_filename: file with dict of words (keys) and entities containing these words
            entities_list_filename: file with the list of entities from the knowledge base
            q2name_filename: name of file which maps entity id to name
            who_entities_filename: file with the list of entities in Wikidata, which can be answers to questions
                with "Who" pronoun, i.e. humans, literary characters etc.
            save_path: path where to save inverted index files
            q2descr_filename: name of file which maps entity id to description
            descr_rank_score_thres: if the score of the entity description is less than threshold, the entity is not
                added to output list
            freq_dict_filename: filename with frequences dictionary of Russian words
            entity_ranker: component deeppavlov.models.kbqa.rel_ranker_bert_infer
            build_inverted_index: if "true", inverted index of entities of the KB will be built
            kb_format: "hdt" or "sqlite3"
            kb_filename: file with the knowledge base, which will be used for building of inverted index
            label_rel: relation in the knowledge base which connects entity ids and entity titles
            descr_rel: relation in the knowledge base which connects entity ids and entity descriptions
            aliases_rels: list of relations which connect entity ids and entity aliases
            sql_table_name: name of the table with the KB if the KB is in sqlite3 format
            sql_column_names: names of columns with subject, relation and object
            lang: language used
            use_descriptions: whether to use context and descriptions of entities for entity ranking
            include_mention: whether to leave or delete entity mention from the sentence before passing to BERT ranker
            lemmatize: whether to lemmatize tokens of extracted entity
            use_prefix_tree: whether to use prefix tree for search of entities with typos in entity labels
            **kwargs:
        """
        super().__init__(save_path=save_path, load_path=load_path)
        self.morph = pymorphy2.MorphAnalyzer()
        self.lemmatize = lemmatize
        self.use_prefix_tree = use_prefix_tree
        self.inverted_index_filename = inverted_index_filename
        self.entities_list_filename = entities_list_filename
        self.build_inverted_index = build_inverted_index
        self.q2name_filename = q2name_filename
        self.who_entities_filename = who_entities_filename
        self.q2descr_filename = q2descr_filename
        self.descr_rank_score_thres = descr_rank_score_thres
        self.freq_dict_filename = freq_dict_filename
        self.kb_format = kb_format
        self.kb_filename = kb_filename
        self.label_rel = label_rel
        self.aliases_rels = aliases_rels
        self.descr_rel = descr_rel
        self.sql_table_name = sql_table_name
        self.sql_column_names = sql_column_names
        # Lazily populated from pickles (load) or built from the KB (inverted_index_builder).
        self.inverted_index: Optional[Dict[str, List[Tuple[str]]]] = None
        self.entities_index: Optional[List[str]] = None
        self.q2name: Optional[List[Tuple[str]]] = None
        # Language tag suffix used to filter KB literals, e.g. "@en".
        self.lang_str = f"@{lang}"
        if self.lang_str == "@en":
            self.stopwords = set(stopwords.words("english"))
        elif self.lang_str == "@ru":
            self.stopwords = set(stopwords.words("russian"))
        self.re_tokenizer = re.compile(r"[\w']+|[^\w ]")
        self.entity_ranker = entity_ranker
        self.use_descriptions = use_descriptions
        self.include_mention = include_mention
        if self.use_descriptions and self.entity_ranker is None:
            raise ValueError("No entity ranker is provided!")

        if self.use_prefix_tree:
            alphabet = "!#%\&'()+,-./0123456789:;?ABCDEFGHIJKLMNOPQRSTUVWXYZ_abcdefghijklmnopqrstuvwxyz½¿ÁÄ" + \
                       "ÅÆÇÉÎÓÖ×ÚßàáâãäåæçèéêëíîïðñòóôöøùúûüýāăąćČčĐėęěĞğĩīİıŁłńňŌōőřŚśşŠšťũūůŵźŻżŽžơưșȚțəʻ" + \
                       "ʿΠΡβγБМавдежикмностъяḤḥṇṬṭầếờợ–‘’Ⅲ−∗"
            # NOTE(review): self.inverted_index is still None at this point unless it
            # was assigned elsewhere — `.keys()` here would raise; confirm call order.
            dictionary_words = list(self.inverted_index.keys())
            self.searcher = LevenshteinSearcher(alphabet, dictionary_words)

        if self.build_inverted_index:
            if self.kb_format == "hdt":
                self.doc = HDTDocument(str(expand_path(self.kb_filename)))
            elif self.kb_format == "sqlite3":
                self.conn = sqlite3.connect(str(expand_path(self.kb_filename)))
                self.cursor = self.conn.cursor()
            else:
                raise ValueError(
                    f'unsupported kb_format value {self.kb_format}')
            self.inverted_index_builder()
            self.save()
        else:
            self.load()

    def load_freq_dict(self, freq_dict_filename: str):
        """Load a tab-separated word/POS/frequency file and keep noun ("s") frequencies in self.nouns_dict."""
        with open(str(expand_path(freq_dict_filename)), 'r') as fl:
            lines = fl.readlines()
        pos_freq_dict = defaultdict(list)
        for line in lines:
            line_split = line.strip('\n').split('\t')
            # Only keep rows whose third column looks like a float frequency.
            if re.match("[\d]+\.[\d]+", line_split[2]):
                pos_freq_dict[line_split[1]].append(
                    (line_split[0], float(line_split[2])))
        # "s" is presumably the noun POS tag of the frequency dictionary — TODO confirm.
        nouns_with_freq = pos_freq_dict["s"]
        self.nouns_dict = {noun: freq for noun, freq in nouns_with_freq}

    def load(self) -> None:
        """Load the pickled inverted index, entity list and id->name map from load_path."""
        self.inverted_index = load_pickle(self.load_path / self.inverted_index_filename)
        self.entities_list = load_pickle(self.load_path / self.entities_list_filename)
        self.q2name = load_pickle(self.load_path / self.q2name_filename)
        if self.who_entities_filename:
            self.who_entities = load_pickle(self.load_path / self.who_entities_filename)
        if self.freq_dict_filename:
            self.load_freq_dict(self.freq_dict_filename)

    def save(self) -> None:
        """Pickle the inverted index, entity list, id->name map (and optionally id->description) to save_path."""
        save_pickle(self.inverted_index, self.save_path / self.inverted_index_filename)
        save_pickle(self.entities_list, self.save_path / self.entities_list_filename)
        save_pickle(self.q2name, self.save_path / self.q2name_filename)
        if self.q2descr_filename is not None:
            save_pickle(self.q2descr, self.save_path / self.q2descr_filename)

    def __call__(self, entity_substr_batch: List[List[str]],
                 entity_positions_batch: List[List[List[int]]] = None,
                 context_tokens: List[List[str]] = None
                 ) -> Tuple[List[List[List[str]]], List[List[List[float]]]]:
        """Link every entity mention in the batch; returns candidate entity ids and confidences per mention."""
        entity_ids_batch = []
        confidences_batch = []
        if entity_positions_batch is None:
            # Default: pretend every mention is at token position 0.
            entity_positions_batch = [[[0] for i in range(len(entities_list))]
                                      for entities_list in entity_substr_batch]
        for entity_substr_list, entity_positions_list in zip(
                entity_substr_batch, entity_positions_batch):
            entity_ids_list = []
            confidences_list = []
            for entity_substr, entity_pos in zip(entity_substr_list,
                                                 entity_positions_list):
                context = ""
                if self.use_descriptions:
                    # Build the ranking context: keep or drop the mention, marking
                    # its boundaries with the "[ENT]" special token.
                    if self.include_mention:
                        context = ' '.join(
                            context_tokens[:entity_pos[0]] + ["[ENT]"] +
                            context_tokens[entity_pos[0]:entity_pos[-1] + 1] +
                            ["[ENT]"] + context_tokens[entity_pos[-1] + 1:])
                    else:
                        context = ' '.join(context_tokens[:entity_pos[0]] +
                                           ["[ENT]"] +
                                           context_tokens[entity_pos[-1] + 1:])
                entity_ids, confidences = self.link_entity(
                    entity_substr, context)
                entity_ids_list.append(entity_ids)
                confidences_list.append(confidences)
            entity_ids_batch.append(entity_ids_list)
            confidences_batch.append(confidences_list)
        return entity_ids_batch, confidences_batch

    def link_entity(self, entity: str, context: Optional[str] = None,
                    template_found: Optional[str] = None,
                    cut_entity: bool = False) -> Tuple[List[str], List[float]]:
        """Return candidate KB entity ids and confidences for one mention string.

        If `cut_entity` and the best candidate matched only one token of a multi-word
        mention, retry with the mention reduced to its rarest noun.
        """
        confidences = []
        if not entity:
            entities_ids = ['None']
        else:
            candidate_entities = self.candidate_entities_inverted_index(entity)
            # candidate_entities[0][3] is the number of mention tokens matched.
            if cut_entity and candidate_entities and len(
                    entity.split()) > 1 and candidate_entities[0][3] == 1:
                entity = self.cut_entity_substr(entity)
                candidate_entities = self.candidate_entities_inverted_index(
                    entity)
            candidate_entities, candidate_names = self.candidate_entities_names(
                entity, candidate_entities)
            entities_ids, confidences, srtd_cand_ent = self.sort_found_entities(
                candidate_entities, candidate_names, entity, context)
            if template_found:
                entities_ids = self.filter_entities(entities_ids, template_found)
        return entities_ids, confidences

    def cut_entity_substr(self, entity: str):
        """Reduce a multi-word mention to its least frequent normalized noun (per self.nouns_dict)."""
        word_tokens = nltk.word_tokenize(entity.lower())
        word_tokens = [
            word for word in word_tokens if word not in self.stopwords
        ]
        normal_form_tokens = [
            self.morph.parse(word)[0].normal_form for word in word_tokens
        ]
        words_with_freq = [(word, self.nouns_dict.get(word, 0.0))
                           for word in normal_form_tokens]
        # Ascending by frequency: the rarest word is assumed the most informative.
        words_with_freq = sorted(words_with_freq, key=lambda x: x[1])
        return words_with_freq[0][0]

    def candidate_entities_inverted_index(
            self, entity: str) -> List[Tuple[Any, Any, Any]]:
        """Collect candidates whose label tokens overlap the mention tokens.

        Returns tuples (entity_num, entity_id, num_relations, tokens_matched),
        sorted by how many mention tokens hit the candidate.
        """
        word_tokens = nltk.word_tokenize(entity.lower())
        word_tokens = [
            word for word in word_tokens if word not in self.stopwords
        ]
        candidate_entities = []
        for tok in word_tokens:
            if len(tok) > 1:
                found = False
                if tok in self.inverted_index:
                    candidate_entities += self.inverted_index[tok]
                    found = True
                if self.lemmatize:
                    morph_parse_tok = self.morph.parse(tok)[0]
                    lemmatized_tok = morph_parse_tok.normal_form
                    if lemmatized_tok != tok and lemmatized_tok in self.inverted_index:
                        candidate_entities += self.inverted_index[
                            lemmatized_tok]
                        found = True
                # Fall back to a Levenshtein-distance-1 search for typos.
                if not found and self.use_prefix_tree:
                    words_with_levens_1 = self.searcher.search(tok, d=1)
                    for word in words_with_levens_1:
                        candidate_entities += self.inverted_index[word[0]]
        # Counter count == number of distinct mention tokens that hit the entity.
        candidate_entities = Counter(candidate_entities).most_common()
        candidate_entities = [(entity_num, self.entities_list[entity_num], entity_freq, count) for \
                              (entity_num, entity_freq), count in candidate_entities]
        return candidate_entities

    def sort_found_entities(
            self,
            candidate_entities: List[Tuple[int, str, int]],
            candidate_names: List[List[str]],
            entity: str,
            context: str = None
    ) -> Tuple[List[str], List[float], List[Tuple[str, str, int, int]]]:
        """Rank candidates by (tokens matched, best fuzzy label ratio, popularity);
        optionally re-rank the top 30 with the BERT description ranker."""
        entities_ratios = []
        for candidate, entity_names in zip(candidate_entities,
                                           candidate_names):
            entity_num, entity_id, num_rels, tokens_matched = candidate
            # Best fuzzy match between the mention and any label/alias of the candidate.
            fuzz_ratio = max(
                [fuzz.ratio(name.lower(), entity) for name in entity_names])
            entities_ratios.append(
                (entity_num, entity_id, tokens_matched, fuzz_ratio, num_rels))

        srtd_with_ratios = sorted(entities_ratios,
                                  key=lambda x: (x[2], x[3], x[4]),
                                  reverse=True)
        if self.use_descriptions:
            log.debug(f"context {context}")
            id_to_score = {
                entity_id: (tokens_matched, score)
                for _, entity_id, tokens_matched, score, _ in
                srtd_with_ratios[:30]
            }
            entity_ids = [
                entity_id for _, entity_id, _, _, _ in srtd_with_ratios[:30]
            ]
            scores = self.entity_ranker.rank_rels(context, entity_ids)
            entities_with_scores = [(entity_id, id_to_score[entity_id][0],
                                     id_to_score[entity_id][1], score)
                                    for entity_id, score in scores]
            entities_with_scores = sorted(entities_with_scores,
                                          key=lambda x: (x[1], x[2], x[3]),
                                          reverse=True)
            # Keep candidates above the description-score threshold, or with a
            # perfect (100.0) fuzzy label match.
            entities_with_scores = [entity for entity in entities_with_scores if \
                                    (entity[3] > self.descr_rank_score_thres or entity[2] == 100.0)]
            log.debug(f"entities_with_scores {entities_with_scores[:10]}")
            entity_ids = [entity for entity, _, _, _ in entities_with_scores]
            confidences = [score for _, _, _, score in entities_with_scores]
        else:
            entity_ids = [ent[1] for ent in srtd_with_ratios]
            # Fuzzy ratio (0-100) scaled to a 0-1 confidence.
            confidences = [float(ent[2]) * 0.01 for ent in srtd_with_ratios]

        return entity_ids, confidences, srtd_with_ratios

    def candidate_entities_names(
            self, entity: str, candidate_entities: List[Tuple[int, str, int]]
    ) -> Tuple[List[Tuple[int, str, int]], List[List[str]]]:
        """Attach labels/aliases to candidates; drop candidates whose main label is
        over 6x longer than the mention (cheap length-plausibility filter)."""
        entity_length = len(entity)
        candidate_names = []
        candidate_entities_filter = []
        for candidate in candidate_entities:
            entity_num = candidate[0]
            entity_names = []
            entity_names_found = self.q2name[entity_num]
            if len(entity_names_found[0]) < 6 * entity_length:
                entity_name = entity_names_found[0]
                entity_names.append(entity_name)
                if len(entity_names_found) > 1:
                    for alias in entity_names_found[1:]:
                        entity_names.append(alias)
                candidate_names.append(entity_names)
                candidate_entities_filter.append(candidate)
        return candidate_entities_filter, candidate_names

    def inverted_index_builder(self) -> None:
        """Build the token -> [(entity_num, popularity)] inverted index from the KB
        (HDT or sqlite3), along with entity list, id->label and id->description maps."""
        log.debug("building inverted index")
        entities_set = set()
        id_to_label_dict = defaultdict(list)
        # NOTE(review): this is a plain dict, but `.append` is called on its values
        # below and missing keys are indexed when building q2descr — looks like it
        # should be defaultdict(list); confirm before relying on descr output.
        id_to_descr_dict = {}
        label_to_id_dict = {}
        label_triplets = []
        alias_triplets_list = []
        descr_triplets = []
        if self.kb_format == "hdt":
            label_triplets, c = self.doc.search_triples("", self.label_rel, "")
            if self.aliases_rels is not None:
                for alias_rel in self.aliases_rels:
                    alias_triplets, c = self.doc.search_triples(
                        "", alias_rel, "")
                    alias_triplets_list.append(alias_triplets)
            if self.descr_rel is not None:
                descr_triplets, c = self.doc.search_triples(
                    "", self.descr_rel, "")

        if self.kb_format == "sqlite3":
            subject, relation, obj = self.sql_column_names
            query = f'SELECT {subject}, {relation}, {obj} FROM {self.sql_table_name} '\
                    f'WHERE {relation} = "{self.label_rel}";'
            res = self.cursor.execute(query)
            label_triplets = res.fetchall()
            if self.aliases_rels is not None:
                for alias_rel in self.aliases_rels:
                    query = f'SELECT {subject}, {relation}, {obj} FROM {self.sql_table_name} '\
                            f'WHERE {relation} = "{alias_rel}";'
                    res = self.cursor.execute(query)
                    alias_triplets = res.fetchall()
                    alias_triplets_list.append(alias_triplets)
            if self.descr_rel is not None:
                query = f'SELECT {subject}, {relation}, {obj} FROM {self.sql_table_name} '\
                        f'WHERE {relation} = "{self.descr_rel}";'
                res = self.cursor.execute(query)
                descr_triplets = res.fetchall()

        # Collect entity ids and language-filtered labels/aliases.
        for triplets in [label_triplets] + alias_triplets_list:
            for triplet in triplets:
                entities_set.add(triplet[0])
                if triplet[2].endswith(self.lang_str):
                    label = triplet[2].replace(self.lang_str, '').replace('"', '')
                    id_to_label_dict[triplet[0]].append(label)
                    label_to_id_dict[label] = triplet[0]

        for triplet in descr_triplets:
            entities_set.add(triplet[0])
            if triplet[2].endswith(self.lang_str):
                descr = triplet[2].replace(self.lang_str, '').replace('"', '')
                id_to_descr_dict[triplet[0]].append(descr)

        # Popularity = number of triples with the entity as subject.
        popularities_dict = {}
        for entity in entities_set:
            if self.kb_format == "hdt":
                all_triplets, number_of_triplets = self.doc.search_triples(
                    entity, "", "")
                popularities_dict[entity] = number_of_triplets
            if self.kb_format == "sqlite3":
                subject, relation, obj = self.sql_column_names
                query = f'SELECT COUNT({obj}) FROM {self.sql_table_name} WHERE {subject} = "{entity}";'
                res = self.cursor.execute(query)
                popularities_dict[entity] = res.fetchall()[0][0]

        entities_dict = {entity: n for n, entity in enumerate(entities_set)}

        inverted_index = defaultdict(list)
        for label in label_to_id_dict:
            tokens = re.findall(self.re_tokenizer, label.lower())
            for tok in tokens:
                if len(tok) > 1 and tok not in self.stopwords:
                    inverted_index[tok].append(
                        (entities_dict[label_to_id_dict[label]],
                         popularities_dict[label_to_id_dict[label]]))
        self.inverted_index = dict(inverted_index)
        self.entities_list = list(entities_set)
        self.q2name = [
            id_to_label_dict[entity] for entity in self.entities_list
        ]
        self.q2descr = []
        if id_to_descr_dict:
            self.q2descr = [
                id_to_descr_dict[entity] for entity in self.entities_list
            ]

    def filter_entities(self, entities: List[str],
                        template_found: str) -> List[str]:
        """Restrict candidates by question template: "who ..." keeps person-like
        entities (self.who_entities), "what ..." excludes them."""
        if template_found in ["who is xxx?", "who was xxx?"]:
            entities = [
                entity for entity in entities if entity in self.who_entities
            ]
        if template_found in ["what is xxx?", "what was xxx?"]:
            entities = [
                entity for entity in entities if entity not in self.who_entities
            ]
        return entities