def build(self, callback_get_doc_content, bulk_size=1000):
    """Builds the DBpedia index from the mongo collection.

    To speed up indexing, we index documents as a bulk. There is an optimum
    value for the bulk size; try to figure it out.

    :param callback_get_doc_content: a function that gets a document from
        mongo and returns the content for indexing (or None to skip the doc)
    :param bulk_size: number of documents to be added to the index as a bulk
    """
    PLOGGER.info("Building " + self.__index_name + " ...")
    elastic = Elastic(self.__index_name)
    elastic.create_index(self.__mappings, model=self.__model, force=True)
    i = 0
    docs = dict()
    for mdoc in self.__mongo.find_all(no_timeout=True):
        docid = Mongo.unescape(mdoc[Mongo.ID_FIELD])
        # get back document from mongo with keys and _id field unescaped
        doc = callback_get_doc_content(Mongo.unescape_doc(mdoc))
        if doc is None:
            continue
        docs[docid] = doc
        i += 1
        if i % bulk_size == 0:
            elastic.add_docs_bulk(docs)
            docs = dict()
            # integer division: avoids "1.0K"-style float output
            PLOGGER.info(str(i // 1000) + "K documents indexed")
    # indexing the last, possibly partial, bulk of documents; skip the
    # request entirely when i happened to be a multiple of bulk_size
    if docs:
        elastic.add_docs_bulk(docs)
    PLOGGER.info("Finished indexing (" + str(i) + " documents in total)")
class FACCToMongo(object):
    """Loads FACC surface forms (tsv dumps) into a Mongo collection."""

    def __init__(self, config):
        """Inserts FACC surface forms to Mongo.

        :param config: configuration dict; validated by __check_config
        """
        self.__check_config(config)
        self.__collection = config[KEY_COLLECTION]
        self.__path = config[KEY_PATH]
        self.__predicate = config[KEY_PREDICATE]
        self.__lowercase = config[KEY_LOWERCASE]
        self.__mongo = None

    @staticmethod
    def __check_config(config):
        """Checks config parameters and sets default values."""
        try:
            if KEY_COLLECTION not in config:
                raise Exception(KEY_COLLECTION + " is missing")
            if KEY_PATH not in config:
                raise Exception(KEY_PATH + " is missing")
            if KEY_PREDICATE not in config:
                raise Exception(KEY_PREDICATE + " is missing")
            if KEY_LOWERCASE not in config:
                config[KEY_LOWERCASE] = True
        except Exception as e:
            PLOGGER.error("Error in config file: ", e)
            sys.exit(1)

    def __add_surface_form(self, surface_form, freebase_uri, count):
        """Adds a surface form.

        Increases count; if the id is not associated with the surface form
        yet, it adds it with count.
        """
        if self.__lowercase:
            surface_form = surface_form.lower()
        freebase_id = self.__convert_to_fb_id(freebase_uri)
        self.__mongo.inc_in_dict(surface_form, self.__predicate, freebase_id, count)

    def __convert_to_fb_id(self, fb_uri):
        """Converts /m/047b9p0 to <fb:m.047b9p0>"""
        fb_id = fb_uri.replace("/", ".")
        return "<fb:" + fb_id[1:] + ">"

    def __add_file(self, tsv_filename):
        """Adds name variants from an FACC tsv file.

        Each line is expected to be: surface_form \\t freebase_uri \\t count.
        """
        PLOGGER.info("Adding name variants from '" + tsv_filename + "'...")
        # with-statement guarantees the file is closed even on parse errors
        with open(tsv_filename, "r") as infile:
            for line in infile:
                f = line.rstrip().split("\t")
                self.__add_surface_form(f[0], f[1], int(f[2]))

    def build(self):
        """Builds surface form collection from FACC annotations."""
        self.__mongo = Mongo(MONGO_HOST, MONGO_DB, self.__collection)
        self.__mongo.drop()
        for path, dirs, files in os.walk(self.__path):
            for fn in files:
                if fn.endswith(".tsv"):
                    self.__add_file(os.path.join(path, fn))
        PLOGGER.info("Collection " + self.__collection + " is built.")
def __init__(self, config):
    """Inserts DBpedia surface forms to Mongo.

    :param config: configuration dict; validated (and defaulted) before use
    """
    # Validate the config first; it may fill in default values.
    self.__check_config(config)
    self.__collection = config[KEY_COLLECTION]
    self.__lowercase = config[KEY_LOWERCASE]
    # Source collection with the DBpedia entity documents.
    self.__mongo_dbpedia = Mongo(MONGO_HOST, MONGO_DB, MONGO_COLLECTION_DBPEDIA)
    # Target collection handle; set when the collection is built.
    self.__mongo = None
def build(self):
    """Builds surface form collection from FACC annotations."""
    # Start from a clean target collection.
    self.__mongo = Mongo(MONGO_HOST, MONGO_DB, self.__collection)
    self.__mongo.drop()
    # Walk the FACC dump directory tree and load every .tsv file found.
    for dirpath, _dirs, filenames in os.walk(self.__path):
        tsv_names = [name for name in filenames if name.endswith(".tsv")]
        for name in tsv_names:
            self.__add_file(os.path.join(dirpath, name))
def __init__(self, config):
    """Sets up the Freebase-to-DBpedia mapping builder.

    :param config: configuration dict; validated before use
    """
    self.__check_config(config)
    # Name of the target Mongo collection for the mapping.
    self.__collection = config[KEY_COLLECTION]
    # 2015-10 mapping dump, plus the 3.9 dump used for removing duplicates.
    self.__fb2dbp_file = config[KEY_MAPPING_FILE]
    self.__fb2dbp_file_39 = config[KEY_MAPPING_FILE_39]
    self.__prefix = URIPrefix()
    self.__mongo_dbpedia = Mongo(MONGO_HOST, MONGO_DB, MONGO_COLLECTION_DBPEDIA)
class Entity(object):
    """Lookup facade over the entity-related Mongo collections.

    Connections are opened lazily, on the first call that needs them.
    """

    def __init__(self):
        # One handle per collection; None until first use.
        self.__coll_dbpedia = None
        self.__coll_sf_facc = None
        self.__coll_sf_dbpedia = None
        self.__coll_fb2dbp = None

    def __init_coll_dbpedia(self):
        """Makes connection to the entity (DBpedia) collection."""
        if self.__coll_dbpedia is not None:
            return
        self.__coll_dbpedia = Mongo(MONGO_HOST, MONGO_DB, MONGO_COLLECTION_DBPEDIA)

    def __init_coll_sf_facc(self):
        """Makes connection to the FACC surface form collection."""
        if self.__coll_sf_facc is not None:
            return
        self.__coll_sf_facc = Mongo(MONGO_HOST, MONGO_DB, MONGO_COLLECTION_SF_FACC)

    def __init_coll_sf_dbpedia(self):
        """Makes connection to the DBpedia surface form collection."""
        if self.__coll_sf_dbpedia is not None:
            return
        self.__coll_sf_dbpedia = Mongo(MONGO_HOST, MONGO_DB, MONGO_COLLECTION_SF_DBPEDIA)

    def __init_coll_fb2dbp(self):
        """Makes connection to Freebase2DBpedia collection."""
        if self.__coll_fb2dbp is not None:
            return
        self.__coll_fb2dbp = Mongo(MONGO_HOST, MONGO_DB, MONGO_COLLECTION_FREEBASE2DBPEDIA)

    def lookup_en(self, entity_id):
        """Looks up an entity by its identifier.

        :param entity_id: entity identifier ("<dbpedia:Audi_A4>")
        :return A dictionary with the entity document or None.
        """
        self.__init_coll_dbpedia()
        doc = self.__coll_dbpedia.find_by_id(entity_id)
        return doc

    def lookup_name_facc(self, name):
        """Looks up a name in the FACC surface form dictionary and returns all
        candidate entities (empty dict when the name is unknown)."""
        self.__init_coll_sf_facc()
        candidates = self.__coll_sf_facc.find_by_id(name)
        return candidates or {}

    def lookup_name_dbpedia(self, name):
        """Looks up a name in the DBpedia surface form dictionary and returns
        all candidate entities (empty dict when the name is unknown)."""
        self.__init_coll_sf_dbpedia()
        candidates = self.__coll_sf_dbpedia.find_by_id(name)
        return candidates or {}

    def fb_to_dbp(self, fb_id):
        """Converts Freebase id to DBpedia; it returns list of DBpedia IDs."""
        self.__init_coll_fb2dbp()
        mapping = self.__coll_fb2dbp.find_by_id(fb_id)
        if not mapping:
            return None
        return mapping["!<owl:sameAs>"]
def build(self):
    """Builds word2vec collection from GoogleNews 300-dim pre-trained corpus."""
    self.__mongo = Mongo(MONGO_HOST, MONGO_DB, self.__collection)
    self.__mongo.drop()
    i = 0
    # with-statement ensures the input file is closed even on errors
    with FileUtils.open_file_by_type(self.__w2v_fname) as infile:
        for line in infile:
            term, vector = self.__parse_line(line)
            self.__mongo.add(term, {'vector': vector})
            i += 1
            if i % 1000 == 0:
                # integer division: avoids "1.0K"-style float output
                PLOGGER.info(str(i // 1000) + "K lines are loaded.")
class Word2VecToMongo(object):
    """Loads a plain-text word2vec corpus into a Mongo collection."""

    def __init__(self, config):
        """Sets up the loader.

        :param config: configuration dict; validated by __check_config
        """
        self.__check_config(config)
        self.__collection = config[KEY_COLLECTION]
        self.__w2v_fname = config[KEY_MAPPING_FILE]
        self.__mongo = None

    @staticmethod
    def __check_config(config):
        """Checks params and set default values."""
        try:
            if KEY_COLLECTION not in config:
                raise Exception(KEY_COLLECTION + " is missing")
            if KEY_MAPPING_FILE not in config:
                raise Exception(KEY_MAPPING_FILE + " is missing")
            if not op.exists(config[KEY_MAPPING_FILE]):
                raise Exception("Mapping file path does not exist.")
        except Exception as e:
            print("Error in config file: ", e)
            # sys.exit instead of the interactive-only builtin exit()
            sys.exit(1)
        return config

    def __parse_line(self, line):
        """Parses a line of the plain-text GoogleNews 300-dim pre-trained corpus.

        :param line: a line of the form "word v1 v2 ... vN"
        :type line: string
        :return: a (word, vector) tuple, vector being a list of floats.
        """
        word, vec_str = line.rstrip().split(maxsplit=1)
        vector = [float(x) for x in vec_str.split()]
        return word, vector

    def build(self):
        """Builds word2vec collection from GoogleNews 300-dim pre-trained corpus."""
        self.__mongo = Mongo(MONGO_HOST, MONGO_DB, self.__collection)
        self.__mongo.drop()
        i = 0
        # with-statement ensures the input file is closed even on errors
        with FileUtils.open_file_by_type(self.__w2v_fname) as infile:
            for line in infile:
                term, vector = self.__parse_line(line)
                self.__mongo.add(term, {'vector': vector})
                i += 1
                if i % 1000 == 0:
                    print(str(i // 1000) + "K lines are loaded.")
def build_collection(self):
    """Adds all name variants from DBpedia.

    Iterates over every document in the DBpedia collection and registers the
    entity's surface forms: redirect targets, disambiguation targets, the
    entity name, and any <foaf:name> values.
    """
    self.__mongo = Mongo(MONGO_HOST, MONGO_DB, self.__collection)
    self.__mongo.drop()
    # iterate through all DBpedia entities
    i = 0
    for mdoc in self.__mongo_dbpedia.find_all():
        entity = EntityUtils(Mongo.unescape_doc(mdoc))
        # skips entities without names
        if not entity.has_name():
            continue
        surface_form = entity.get_name()
        # the entity is redirect page
        if entity.is_redirect():
            entity_id = entity.get_predicate(EntityUtils.PREDICATE_REDIRECT)[0]
            self.__add_surface_form(surface_form, EntityUtils.PREDICATE_REDIRECT, entity_id)
        # the entity is disambiguation page
        if entity.has_predicate(EntityUtils.PREDICATE_DISAMBIGUATE):
            entity_ids = entity.get_predicate(EntityUtils.PREDICATE_DISAMBIGUATE)
            for entity_id in entity_ids:
                self.__add_surface_form(surface_form, EntityUtils.PREDICATE_DISAMBIGUATE, entity_id)
        # entity is not a redirect/disambiguation page and has name and abstract
        if entity.is_entity():
            entity_id = entity.get_id()
            # adds entity name
            self.__add_surface_form(surface_form, EntityUtils.PREDICATE_NAME, entity_id)
            # adds other entity names
            foaf_name_predicate = "<foaf:name>"
            if entity.has_predicate(foaf_name_predicate):
                # NOTE(review): this loop rebinds surface_form; safe only
                # because it is not read again later in this iteration
                for surface_form in entity.get_predicate(foaf_name_predicate):
                    self.__add_surface_form(surface_form, foaf_name_predicate, entity_id)
        i += 1
        if i % 1000 == 0:
            print(str(i // 1000) + "K entities processed")
def field_counts2json(out_file):
    """Reads all documents in the Mongo DBpedia collection, calculates field
    frequencies (i.e. all entity fields), and dumps them to a JSON file.

    :param out_file: path of the JSON file the field counts are written to
    :return: None (the counts are written to out_file)
    """
    print("Counting fields ...")
    dbpedia_coll = Mongo(MONGO_HOST, MONGO_DB, MONGO_COLLECTION_DBPEDIA).find_all()
    i = 0
    field_counts = dict()
    for entity in dbpedia_coll:
        for field in entity:
            if field == Mongo.ID_FIELD:  # the document id is not a real field
                continue
            # idiomatic count-or-initialize
            field_counts[field] = field_counts.get(field, 0) + 1
        i += 1
        if i % 1000000 == 0:
            print("\t", str(i // 1000000), "M entity is processed!")
    # with-statement ensures the output file is flushed and closed
    with open(out_file, "w") as f:
        json.dump(field_counts, f, indent=4, sort_keys=True)
    print("\tField count file:", out_file)
def create_sample_file(self):
    """Creates a sample file from the content of the index.

    Dumps the index content of a few hand-picked example entities to
    output/example_docs.json for manual inspection.
    """
    mongo = Mongo(MONGO_HOST, MONGO_DB, MONGO_COLLECTION_DBPEDIA)
    example_docs = [
        "<dbpedia:Texhoma,_Oklahoma>", "<dbpedia:Karen_Spärck_Jones>",
        "<dbpedia:Audi_A4>", "<dbpedia:Barack_Obama>"
    ]
    doc_contents = {}
    for docid in example_docs:
        doc_contents[docid] = self.get_doc_content(mongo.find_by_id(docid))
    # with-statement ensures the output file is flushed and closed
    with open("output/example_docs.json", "w") as out:
        json.dump(doc_contents, out, indent=4, sort_keys=True, ensure_ascii=False)
def get_doc_content(self, doc): """create the index content for a given mongo document Here we keep both FSDM fields and individual fields for each document. :param doc: a Mongo document :return: a document ready for indexing """ # Ignores document if the ID does not start with "<dbpedia:" (just to speed up) doc_id = Mongo.unescape(doc[Mongo.ID_FIELD]) if not doc_id.startswith("<dbpedia:"): return None # Ignores document if it does not have must have fields for f in self._config["must_have"]: if f not in doc: return None self._doc_content = defaultdict(list) for f in doc: # Adds content for FSDM fields if f.lower() in self._config["names"]: self._doc_content["names"] += self.__get_field_value(doc[f]) elif f in self._config["categories"]: self._doc_content["categories"] += self.__get_field_value( doc[f]) elif f in self._config["similar_entity_names"]: self._doc_content[ "similar_entity_names"] += self.__get_field_value(doc[f]) elif f not in self._config["blacklist"]: if doc[f][0].startswith("<dbpedia:"): self._doc_content[ "related_entity_names"] += self.__get_field_value( doc[f], f) else: self._doc_content["attributes"] += self.__get_field_value( doc[f], f) # Adds content for each individual field if f in self.__top_fields: self._doc_content[f] += self.__get_field_value(doc[f]) # keeps only unique phrases for each field # Adds everything to the catchall field for field in self._fsdm_fields: self._doc_content[field] = list(set(self._doc_content[field])) self._doc_content[ Elastic.FIELD_CATCHALL] += self._doc_content[field] return self._doc_content
def build_collection(self, mappings):
    """Builds Mongo collection"""
    # Recreate the target collection from scratch.
    coll = Mongo(MONGO_HOST, MONGO_DB, self.__collection)
    coll.drop()
    predicate = "!<owl:sameAs>"
    # One sameAs entry per mapped DBpedia id, keyed by the Freebase id.
    for i, (fb_id, dbp_ids) in enumerate(mappings.items(), start=1):
        for dbp_id in dbp_ids:
            coll.append_set(fb_id, predicate, [dbp_id])
        if i % 1000 == 0:
            print(str(i // 1000) + "K entities are added!")
def main(args):
    """Demo entry point for word2vec functionalities.

    :param args: parsed CLI arguments with optional ``word`` and ``centroid``
    """
    # word2vec main __instances
    w2v_mongo = Mongo(MONGO_HOST, MONGO_DB, MONGO_COLLECTION_WORD2VEC)
    w2v = Word2Vec(w2v_mongo)
    print(
        "\t\t*** word2vec functionalities, with word vectors from GoogleNews 300-dim pre-trained corpus. ***\n"
    )
    # Testing some functionalities
    if args.word:
        word = args.word.strip()
        vector = w2v.get_vector(word)
        print("word = {}\nvector = {}\nvector dimension = {}\n".format(
            word, vector, vector.shape[0]))
    if args.centroid:
        # renamed from `str`, which shadowed the builtin of the same name
        expression = args.centroid.strip()
        centroid_v = w2v.get_centroid_vector(expression)
        print("expression = {}\ncentroid vector = {}\n".format(
            expression, centroid_v))
def compute_field_counts():
    """Reads all documents in the Mongo collection and calculates field
    frequencies, i.e. for the DBpedia collection it returns all entity fields.

    :return a dictionary of fields and their frequency
    """
    PLOGGER.info("Counting fields ...")
    dbpedia_coll = Mongo(MONGO_HOST, MONGO_DB, MONGO_COLLECTION_DBPEDIA).find_all()
    i = 0
    field_counts = dict()
    for entity in dbpedia_coll:
        for field in entity:
            if field == Mongo.ID_FIELD:  # the document id is not a real field
                continue
            # idiomatic count-or-initialize
            field_counts[field] = field_counts.get(field, 0) + 1
        i += 1
        if i % 1000000 == 0:
            PLOGGER.info("\t" + str(i // 1000000) + "M entity is processed!")
    return field_counts
def __init_coll_sf_facc(self):
    """Makes connection to the surface form collection (lazy, idempotent)."""
    if self.__coll_sf_facc is not None:
        return
    self.__coll_sf_facc = Mongo(MONGO_HOST, MONGO_DB, MONGO_COLLECTION_SF_FACC)
def __init__(self, host, db, collection):
    """Opens the target Mongo collection and resets the parsing state.

    :param host: Mongo host
    :param db: Mongo database name
    :param collection: Mongo collection name
    """
    self.__mongo = Mongo(host, db, collection)
    self.__prefix = URIPrefix()
    # Accumulator for the triple group currently being built (by subject).
    self.__m_id = self.__m_contents = None
    logging.basicConfig(level="ERROR")  # no warnings from the rdf parser
class NTriplesToMongoDB(object):
    """Loads NTriples files into MongoDB, grouping triples by subject into
    one document per subject (predicate -> list of objects)."""

    def __init__(self, host, db, collection):
        """Connects to the target Mongo collection.

        :param host: Mongo host
        :param db: Mongo database name
        :param collection: Mongo collection name
        """
        self.__mongo = Mongo(host, db, collection)
        self.__prefix = URIPrefix()
        # Accumulator state for the subject currently being grouped.
        self.__m_id = None
        self.__m_contents = None
        logging.basicConfig(level="ERROR")  # no warnings from the rdf parser

    def _next_triple(self, subj, pred, obj):
        """Processes a triple.

        - Appends to previous triple if it's the same subject.
        - Otherwise inserts last triple and creates a new one.
        """
        if (self.__m_id is not None) and (self.__m_id == subj):
            # Same subject as the previous triple: extend the current document.
            if pred not in self.__m_contents:
                self.__m_contents[pred] = []
            self.__m_contents[pred].append(obj)
        else:
            # New subject: flush the accumulated document, start a new one.
            self._write_to_mongo()
            self.__m_id = subj
            self.__m_contents = {pred: [obj]}

    def _write_to_mongo(self):
        """Writes triple (inserts or appends existing) to MongoDB collection."""
        if self.__m_id is not None:
            for field, value in self.__m_contents.items():
                self.__mongo.append_set(self.__m_id, field, value)
            # self.mongo.add(self.m_id, self.m_contents)
            self.__m_id = None
            self.__m_contents = None

    def drop(self):
        """Deletes the collection."""
        self.__mongo.drop()

    def add_file(self, filename, reverse_triple=False, predicate_prefix=None):
        """Adds contents from an NTriples file to MongoDB.

        :param filename: NTriples file.
        :param reverse_triple: if set True, the subject and object values are swapped.
        :param predicate_prefix: prefix to be added to predicates.
        """
        print("Processing " + filename + "...")
        t = Triple()
        p = NTriplesParser(t)
        self.__m_id = None  # document id for MongoDB -- subj
        self.__m_contents = None  # document contents for MongoDB -- pred, obj
        i = 0
        with FileUtils.open_file_by_type(filename) as f:
            for line in f:
                try:
                    p.parsestring(line.decode("utf-8"))
                except ParseError:  # skip lines that couldn't be parsed
                    continue
                if t.subject() is None:  # only if parsed as a triple
                    continue
                # subject prefixing
                subj = self.__prefix.get_prefixed(t.subject())
                # predicate prefixing
                pred = self.__prefix.get_prefixed(t.predicate())
                if predicate_prefix is not None:
                    pred = predicate_prefix + pred
                # Object prefixing
                if type(t.object()) is URIRef:
                    obj = self.__prefix.get_prefixed(t.object())
                else:
                    obj = t.object()
                    if len(obj) == 0:
                        continue  # skip empty objects
                # write or append
                if reverse_triple:  # reverse subj and obj
                    self._next_triple(obj, pred, subj)
                else:  # normal mode
                    self._next_triple(subj, pred, obj)
                i += 1
                if i % 100000 == 0:
                    print(
                        str(i // 1000) + "K lines processed from " + filename)
        # process last triple
        self._write_to_mongo()
def word2vec(self):
    """Lazily creates and returns the shared Word2Vec instance."""
    if self.__word2vec is not None:
        return self.__word2vec
    backing_store = Mongo(MONGO_HOST, MONGO_DB, MONGO_COLLECTION_WORD2VEC)
    self.__word2vec = Word2Vec(backing_store)
    return self.__word2vec
def __init_coll_sf_dbpedia(self):
    """Makes connection to the surface form collection (lazy, idempotent)."""
    if self.__coll_sf_dbpedia is not None:
        return
    self.__coll_sf_dbpedia = Mongo(MONGO_HOST, MONGO_DB, MONGO_COLLECTION_SF_DBPEDIA)
class Freebase2DBpedia2Mongo(object):
    """Builds a Mongo collection mapping Freebase IDs to DBpedia IDs,
    using the DBpedia 3.9 mapping file to resolve duplicates."""

    def __init__(self, config):
        """Sets up the mapping builder from config (validated first)."""
        self.__check_config(config)
        self.__collection = config[KEY_COLLECTION]
        self.__fb2dbp_file = config[KEY_MAPPING_FILE]
        self.__fb2dbp_file_39 = config[
            KEY_MAPPING_FILE_39]  # used for removing duplicates
        self.__prefix = URIPrefix()
        self.__mongo_dbpedia = Mongo(MONGO_HOST, MONGO_DB, MONGO_COLLECTION_DBPEDIA)

    @staticmethod
    def __check_config(config):
        """Checks params and set default values."""
        try:
            if KEY_COLLECTION not in config:
                raise Exception(KEY_COLLECTION + " is missing")
            if KEY_MAPPING_FILE not in config:
                raise Exception(KEY_MAPPING_FILE + " is missing")
            if KEY_MAPPING_FILE_39 not in config:
                raise Exception(KEY_MAPPING_FILE_39 + " is missing")
            if not (os.path.exists(config[KEY_MAPPING_FILE])) or not (
                    os.path.exists(config[KEY_MAPPING_FILE_39])):
                raise Exception("Mapping file path does not exist.")
        except Exception as e:
            print("Error in config file: ", e)
            sys.exit(1)
        return config

    def read_fb2dbp_file(self, is_39=False):
        """Reads the file and generates an initial mapping of Freebase to
        DBpedia IDs.

        Only proper DBpedia entities are considered; i.e. redirect and
        disambiguation pages are ignored.

        :param is_39: if True, reads the DBpedia 3.9 mapping file
        :return: dict mapping a Freebase ID to a set of DBpedia IDs
        """
        fb2dbp_file = self.__fb2dbp_file_39 if is_39 else self.__fb2dbp_file
        print("Processing " + fb2dbp_file + "...")
        t = Triple()
        p = NTriplesParser(t)
        i = 0
        fb2dbp_mapping = defaultdict(set)
        with FileUtils.open_file_by_type(fb2dbp_file) as f:
            for line in f:
                try:
                    p.parsestring(line.decode("utf-8"))
                except ParseError:  # skip lines that couldn't be parsed
                    continue
                if t.subject() is None:  # only if parsed as a triple
                    continue
                # prefixing
                dbp_id = self.__prefix.get_prefixed(t.subject())
                fb_id = self.__prefix.get_prefixed(t.object())
                # if reading 3.9 file, converts ID to 2015-10 version
                if is_39:
                    dbp_id = EntityUtils.convert_39_to_201510(dbp_id)
                    fb2dbp_mapping[fb_id].add(dbp_id)
                # if reading 2015-10 file, keeps only the proper DBpedia entities
                else:
                    entity_utils = EntityUtils(
                        self.__mongo_dbpedia.find_by_id(dbp_id))
                    if entity_utils.is_entity():
                        fb2dbp_mapping[fb_id].add(dbp_id)
                i += 1
                if i % 1000 == 0:
                    print(str(i // 1000) + "K lines are processed!")
        return fb2dbp_mapping

    def load_fb2dbp_mapping(self):
        """Checks Freebase IDs that are mapped to more than one entity and
        keeps only one of them (the one also present in the 3.9 mapping)."""
        mappings = defaultdict(list)
        fb2dbp_39 = self.read_fb2dbp_file(is_39=True)
        fb2dbp = self.read_fb2dbp_file()
        for fb_id, dbp_ids in fb2dbp.items():
            if len(dbp_ids) > 1:
                # Use the 3.9 mapping to pick one of the duplicate IDs.
                # NOTE(review): pop() mutates the cached 3.9 set; safe here
                # since each fb_id is visited once.
                dbp_ids_39 = fb2dbp_39.get(fb_id, None)
                dbp_id_39 = dbp_ids_39.pop() if dbp_ids_39 else None
                if dbp_id_39 in dbp_ids:
                    mappings[fb_id].append(dbp_id_39)
                else:
                    # No agreement with 3.9: keep all candidates and log them.
                    mappings[fb_id] = list(dbp_ids)
                    print(fb_id, "3.9", dbp_id_39, "2015", dbp_ids)
            else:
                mappings[fb_id] = list(dbp_ids)
        print(len(mappings))
        return mappings

    def build_collection(self, mappings):
        """Builds Mongo collection"""
        mongo = Mongo(MONGO_HOST, MONGO_DB, self.__collection)
        mongo.drop()
        predicate = "!<owl:sameAs>"
        i = 0
        for fb_id, dbp_ids in mappings.items():
            for dbp_id in dbp_ids:
                mongo.append_set(fb_id, predicate, [dbp_id])
            i += 1
            if i % 1000 == 0:
                print(str(i // 1000) + "K entities are added!")
def __init_coll_fb2dbp(self):
    """Makes connection to Freebase2DBpedia collection (lazy, idempotent)."""
    if self.__coll_fb2dbp is not None:
        return
    self.__coll_fb2dbp = Mongo(MONGO_HOST, MONGO_DB, MONGO_COLLECTION_FREEBASE2DBPEDIA)
def __init_coll_dbpedia(self):
    """Makes connection to the entity (DBpedia) collection (lazy, idempotent)."""
    if self.__coll_dbpedia is not None:
        return
    self.__coll_dbpedia = Mongo(MONGO_HOST, MONGO_DB, MONGO_COLLECTION_DBPEDIA)
def __init__(self, index_name, mappings, collection, model=Elastic.BM25):
    """Sets up the indexer.

    :param index_name: name of the Elastic index to build
    :param mappings: Elastic field mappings for index creation
    :param collection: name of the Mongo collection holding the documents
    :param model: similarity model passed to index creation (default: BM25)
    """
    self.__index_name = index_name
    self.__mappings = mappings
    self.__model = model
    # Source of the documents to be indexed.
    self.__mongo = Mongo(MONGO_HOST, MONGO_DB, collection)
def __init__(self):
    """Connects to the word2vec Mongo collection and wraps it in Word2Vec."""
    w2v_store = Mongo(MONGO_HOST, MONGO_DB, MONGO_COLLECTION_WORD2VEC)
    self.__word2vec = Word2Vec(w2v_store)
class DBpediaSurfaceforms2Mongo(object):
    """Builds a Mongo collection of DBpedia surface forms (name variants)
    mapped to candidate entity IDs."""

    def __init__(self, config):
        """Inserts DBpedia surface forms to Mongo."""
        self.__check_config(config)
        self.__collection = config[KEY_COLLECTION]
        self.__lowercase = config[KEY_LOWERCASE]
        self.__mongo_dbpedia = Mongo(MONGO_HOST, MONGO_DB, MONGO_COLLECTION_DBPEDIA)
        # Target collection handle; set in build_collection.
        self.__mongo = None

    @staticmethod
    def __check_config(config):
        """Checks config parameters and sets default values."""
        try:
            if KEY_COLLECTION not in config:
                raise Exception(KEY_COLLECTION + " is missing")
            if KEY_LOWERCASE not in config:
                config[KEY_LOWERCASE] = True
        except Exception as e:
            print("Error in config file: ", e)
            sys.exit(1)

    def __add_surface_form(self, surface_form, predicate, entity_id):
        """Adds a surface form (removes the disambiguation part from the surface form, if exists).

        :param surface_form: surface form for entity
        :param predicate: predicate that entity is extracted from e.g. <rdfs:label>
        :param entity_id: entity ID
        """
        if sys.getsizeof(surface_form) >= 1024:  # Mongo key limit
            return
        surface_form = surface_form.replace("(disambiguation)", "").strip()
        if self.__lowercase:
            surface_form = surface_form.lower()
        self.__mongo.inc_in_dict(surface_form, predicate, entity_id, 1)

    def build_collection(self):
        """Adds all name variants from DBpedia.

        Iterates over every document in the DBpedia collection and registers
        the entity's surface forms: redirect targets, disambiguation targets,
        the entity name, and any <foaf:name> values.
        """
        self.__mongo = Mongo(MONGO_HOST, MONGO_DB, self.__collection)
        self.__mongo.drop()
        # iterate through all DBpedia entities
        i = 0
        for mdoc in self.__mongo_dbpedia.find_all():
            entity = EntityUtils(Mongo.unescape_doc(mdoc))
            # skips entities without names
            if not entity.has_name():
                continue
            surface_form = entity.get_name()
            # the entity is redirect page
            if entity.is_redirect():
                entity_id = entity.get_predicate(
                    EntityUtils.PREDICATE_REDIRECT)[0]
                self.__add_surface_form(surface_form,
                                        EntityUtils.PREDICATE_REDIRECT,
                                        entity_id)
            # the entity is disambiguation page
            if entity.has_predicate(EntityUtils.PREDICATE_DISAMBIGUATE):
                entity_ids = entity.get_predicate(
                    EntityUtils.PREDICATE_DISAMBIGUATE)
                for entity_id in entity_ids:
                    self.__add_surface_form(
                        surface_form, EntityUtils.PREDICATE_DISAMBIGUATE,
                        entity_id)
            # entity is not a redirect/disambiguation page and has name and abstract
            if entity.is_entity():
                entity_id = entity.get_id()
                # adds entity name
                self.__add_surface_form(surface_form,
                                        EntityUtils.PREDICATE_NAME, entity_id)
                # adds other entity names
                foaf_name_predicate = "<foaf:name>"
                if entity.has_predicate(foaf_name_predicate):
                    # NOTE(review): this loop rebinds surface_form; safe only
                    # because it is not read again later in this iteration
                    for surface_form in entity.get_predicate(
                            foaf_name_predicate):
                        self.__add_surface_form(surface_form,
                                                foaf_name_predicate,
                                                entity_id)
            i += 1
            if i % 1000 == 0:
                print(str(i // 1000) + "K entities processed")