class Entity(object):
    """Lookup helper over several Mongo collections.

    The constructor only sets the collection handles to None; each handle is
    created the first time a lookup method needs it and is reused afterwards.
    """

    def __init__(self):
        # Collection handles, filled in lazily by the __init_coll_* helpers.
        self.__coll_dbpedia = None
        self.__coll_sf_facc = None
        self.__coll_sf_dbpedia = None
        self.__coll_fb2dbp = None

    def __init_coll_dbpedia(self):
        """Creates the entity (DBpedia) collection handle on first use."""
        if self.__coll_dbpedia is not None:
            return
        self.__coll_dbpedia = Mongo(MONGO_HOST, MONGO_DB,
                                    MONGO_COLLECTION_DBPEDIA)

    def __init_coll_sf_facc(self):
        """Creates the FACC surface form collection handle on first use."""
        if self.__coll_sf_facc is not None:
            return
        self.__coll_sf_facc = Mongo(MONGO_HOST, MONGO_DB,
                                    MONGO_COLLECTION_SF_FACC)

    def __init_coll_sf_dbpedia(self):
        """Creates the DBpedia surface form collection handle on first use."""
        if self.__coll_sf_dbpedia is not None:
            return
        self.__coll_sf_dbpedia = Mongo(MONGO_HOST, MONGO_DB,
                                       MONGO_COLLECTION_SF_DBPEDIA)

    def __init_coll_fb2dbp(self):
        """Creates the Freebase2DBpedia collection handle on first use."""
        if self.__coll_fb2dbp is not None:
            return
        self.__coll_fb2dbp = Mongo(MONGO_HOST, MONGO_DB,
                                   MONGO_COLLECTION_FREEBASE2DBPEDIA)

    def lookup_en(self, entity_id):
        """Looks up an entity by its identifier.

        :param entity_id: entity identifier ("<dbpedia:Audi_A4>")
        :return: whatever the collection's find_by_id gives back for that
            identifier (the entity document, or None if there is none).
        """
        self.__init_coll_dbpedia()
        entity_doc = self.__coll_dbpedia.find_by_id(entity_id)
        return entity_doc

    def lookup_name_facc(self, name):
        """Looks up a name in the FACC surface form dictionary.

        :param name: surface form used as the document identifier
        :return: the matching document, or an empty dict when the lookup
            yields nothing (or an empty result).
        """
        self.__init_coll_sf_facc()
        candidates = self.__coll_sf_facc.find_by_id(name)
        if not candidates:
            return {}
        return candidates

    def lookup_name_dbpedia(self, name):
        """Looks up a name in the DBpedia surface form dictionary.

        :param name: surface form used as the document identifier
        :return: the matching document, or an empty dict when the lookup
            yields nothing (or an empty result).
        """
        self.__init_coll_sf_dbpedia()
        candidates = self.__coll_sf_dbpedia.find_by_id(name)
        if not candidates:
            return {}
        return candidates

    def fb_to_dbp(self, fb_id):
        """Converts a Freebase id to DBpedia ids.

        :param fb_id: Freebase identifier used as the document identifier
        :return: the value stored under "!<owl:sameAs>" in the matching
            document, or None when no document is found.
        """
        self.__init_coll_fb2dbp()
        mapping_doc = self.__coll_fb2dbp.find_by_id(fb_id)
        if not mapping_doc:
            return None
        return mapping_doc["!<owl:sameAs>"]
def create_sample_file(self):
    """Writes the contents of a few example documents to a JSON file.

    Each example identifier is fetched from the DBpedia Mongo collection,
    passed through ``self.get_doc_content``, and the resulting
    ``{doc id: content}`` dictionary is dumped to
    "output/example_docs.json" (indented, keys sorted, non-ASCII characters
    written as-is).

    NOTE(review): the "output" directory is assumed to exist already; it is
    not created here.
    """
    mongo = Mongo(MONGO_HOST, MONGO_DB, MONGO_COLLECTION_DBPEDIA)
    example_docs = [
        "<dbpedia:Texhoma,_Oklahoma>",
        "<dbpedia:Karen_Spärck_Jones>",
        "<dbpedia:Audi_A4>",
        "<dbpedia:Barack_Obama>",
    ]
    doc_contents = {
        docid: self.get_doc_content(mongo.find_by_id(docid))
        for docid in example_docs
    }
    # The context manager guarantees the file is flushed and closed even if
    # serialization fails. The explicit UTF-8 encoding is needed because
    # ensure_ascii=False emits raw non-ASCII characters (e.g. "ä"), which the
    # platform default encoding may not be able to represent.
    with open("output/example_docs.json", "w", encoding="utf-8") as out_file:
        json.dump(doc_contents, out_file, indent=4, sort_keys=True,
                  ensure_ascii=False)
class Freebase2DBpedia2Mongo(object):
    """Builds a Freebase-to-DBpedia id mapping and stores it in Mongo.

    The mapping is read from two files given in the config: the main mapping
    file and a 3.9 mapping file that is consulted only to choose between
    several DBpedia candidates for the same Freebase id.
    """

    def __init__(self, config):
        self.__check_config(config)
        self.__collection = config[KEY_COLLECTION]
        self.__fb2dbp_file = config[KEY_MAPPING_FILE]
        # The 3.9 file is used for removing duplicates.
        self.__fb2dbp_file_39 = config[KEY_MAPPING_FILE_39]
        self.__prefix = URIPrefix()
        self.__mongo_dbpedia = Mongo(MONGO_HOST, MONGO_DB,
                                     MONGO_COLLECTION_DBPEDIA)

    @staticmethod
    def __check_config(config):
        """Validates the config; prints the problem and exits on failure.

        :param config: dictionary that must contain the collection name and
            both mapping file paths, and both paths must exist.
        :return: the config, unchanged.
        """
        try:
            # Required keys are checked in this fixed order so the first
            # missing one is the one reported.
            for required_key in (KEY_COLLECTION, KEY_MAPPING_FILE,
                                 KEY_MAPPING_FILE_39):
                if required_key not in config:
                    raise Exception(required_key + " is missing")
            both_exist = (os.path.exists(config[KEY_MAPPING_FILE])
                          and os.path.exists(config[KEY_MAPPING_FILE_39]))
            if not both_exist:
                raise Exception("Mapping file path does not exist.")
        except Exception as e:
            print("Error in config file: ", e)
            sys.exit(1)
        return config

    def read_fb2dbp_file(self, is_39=False):
        """Reads a mapping file into an initial Freebase-to-DBpedia mapping.

        For the 3.9 file every DBpedia id is converted to its 2015-10 form
        and kept. For the main file an id is kept only if the corresponding
        DBpedia document is reported as a proper entity by EntityUtils
        (i.e. redirect and disambiguation pages are ignored).

        :param is_39: if True, reads the 3.9 mapping file instead of the
            main one.
        :return: defaultdict mapping each Freebase id to a set of DBpedia ids.
        """
        if is_39:
            source_path = self.__fb2dbp_file_39
        else:
            source_path = self.__fb2dbp_file
        print("Processing " + source_path + "...")

        # The parser writes every parsed statement into this sink object.
        triple = Triple()
        parser = NTriplesParser(triple)
        num_triples = 0
        fb2dbp_mapping = defaultdict(set)
        with FileUtils.open_file_by_type(source_path) as f:
            for raw_line in f:
                try:
                    parser.parsestring(raw_line.decode("utf-8"))
                except ParseError:
                    # Lines that cannot be parsed are skipped.
                    continue
                if triple.subject() is None:
                    # Nothing was parsed as a triple.
                    continue

                dbp_id = self.__prefix.get_prefixed(triple.subject())
                fb_id = self.__prefix.get_prefixed(triple.object())

                if is_39:
                    # 3.9 ids are converted and always kept.
                    dbp_id = EntityUtils.convert_39_to_201510(dbp_id)
                    keep = True
                else:
                    # 2015-10 ids are kept only for proper DBpedia entities.
                    entity_doc = self.__mongo_dbpedia.find_by_id(dbp_id)
                    keep = EntityUtils(entity_doc).is_entity()
                if keep:
                    fb2dbp_mapping[fb_id].add(dbp_id)

                num_triples += 1
                if num_triples % 1000 == 0:
                    print(str(num_triples // 1000) + "K lines are processed!")
        return fb2dbp_mapping

    def load_fb2dbp_mapping(self):
        """Resolves Freebase ids that map to more than one DBpedia id.

        When a Freebase id has several DBpedia candidates, one id taken from
        the 3.9 mapping is used if it is among the candidates; otherwise all
        candidates are kept and the case is printed.

        :return: defaultdict mapping each Freebase id to a list of DBpedia ids.
        """
        resolved = defaultdict(list)
        legacy_mapping = self.read_fb2dbp_file(is_39=True)
        current_mapping = self.read_fb2dbp_file()
        for fb_id, dbp_ids in current_mapping.items():
            if len(dbp_ids) <= 1:
                # Unambiguous (or empty): nothing to resolve.
                resolved[fb_id] = list(dbp_ids)
                continue

            legacy_ids = legacy_mapping.get(fb_id, None)
            # Only a single (arbitrary) 3.9 id is considered.
            legacy_id = legacy_ids.pop() if legacy_ids else None
            if legacy_id in dbp_ids:
                resolved[fb_id].append(legacy_id)
            else:
                resolved[fb_id] = list(dbp_ids)
                print(fb_id, "3.9", legacy_id, "2015", dbp_ids)

        print(len(resolved))
        return resolved

    def build_collection(self, mappings):
        """Drops the target collection and fills it from the given mappings.

        :param mappings: dictionary from Freebase id to an iterable of
            DBpedia ids; each id is appended under "!<owl:sameAs>".
        """
        mongo = Mongo(MONGO_HOST, MONGO_DB, self.__collection)
        mongo.drop()

        predicate = "!<owl:sameAs>"
        # Progress is reported once per 1000 Freebase ids.
        for num_added, (fb_id, dbp_ids) in enumerate(mappings.items(), 1):
            for dbp_id in dbp_ids:
                mongo.append_set(fb_id, predicate, [dbp_id])
            if num_added % 1000 == 0:
                print(str(num_added // 1000) + "K entities are added!")