def index(mapper, bulk_size=10000):
    """Indexes the sentences of ``WP_ST_F`` into the ``WP_ST_INDEX_ID`` index.

    Every line of the file becomes one document whose ID is its 1-based line
    number. Mentions of the form ``[A|B]`` are replaced by the ID the mapper
    returns for person A (underscores in A are read as spaces). The profession
    IDs of all persons mentioned in the line are stored, without duplicates,
    in the "professions" field.

    The index is always created with ``force=True``.

    :param mapper: object providing ``get_id_from_person(name)`` and
        ``get_id_from_prof(prof)``
    :param bulk_size: number of documents sent to Elastic in one bulk request
    """
    pres_prof_mapping = gen_mappings()
    # Compiled once for all lines; the raw string avoids the invalid "\["
    # escape sequences of a plain string literal.
    mention_pattern = re.compile(r"\[(.*?)\]")

    # The context manager closes the file even if indexing fails midway.
    with open(WP_ST_F, "r") as file:
        index_name = WP_ST_INDEX_ID
        mappings = {
            "content": Elastic.analyzed_field(),
            "professions": Elastic.notanalyzed_field(),
        }
        elastic = Elastic(index_name)
        elastic.create_index(mappings, force=True)

        doc_id = 0
        docs = {}
        for line in file:
            doc_id += 1
            profs = []
            # Replace every [A|B] mention with the ID of A. Looping on the
            # match (instead of on '"[" in line') means a stray "[" without a
            # closing "]" is left in the text rather than raising
            # AttributeError on a failed search.
            match = mention_pattern.search(line)
            while match is not None:
                entity = match.group(1).split("|")[0]
                name = entity.replace("_", " ")
                entity_id = mapper.get_id_from_person(name)
                # NOTE(review): a name missing from the mapping raises
                # KeyError if gen_mappings() returns a plain dict -- confirm.
                profs += [
                    mapper.get_id_from_prof(prof)
                    for prof in pres_prof_mapping[name]
                ]
                # group(0) is the whole "[...]" span; all of its occurrences
                # in the line are replaced at once.
                line = line.replace(match.group(0), entity_id)
                match = mention_pattern.search(line)

            docs[doc_id] = {"content": line, "professions": list(set(profs))}
            if len(docs) == bulk_size:
                # bulk add bulk_size sentences into elastic
                elastic.add_docs_bulk(docs)
                docs = {}
                print(doc_id / 1000, "K documents indexed.")

    # index the last batch of sentences
    elastic.add_docs_bulk(docs)
def build(self, callback_get_doc_content, bulk_size=1000):
    """Builds the index from the mongo collection.

    Documents are sent to Elastic in bulks to speed up indexing; the best
    bulk size depends on the data, so it is worth experimenting with.

    :param callback_get_doc_content: function that takes a mongo document
        (keys and _id field unescaped) and returns the content to index;
        documents for which it returns None are skipped
    :param bulk_size: number of documents to be added to the index as a bulk
    """
    PLOGGER.info("Building " + self.__index_name + " ...")
    es = Elastic(self.__index_name)
    es.create_index(self.__mappings, model=self.__model, force=True)

    num_indexed = 0
    pending = {}
    for mongo_doc in self.__mongo.find_all(no_timeout=True):
        doc_id = Mongo.unescape(mongo_doc[Mongo.ID_FIELD])
        # hand the callback the mongo document with keys and _id unescaped
        content = callback_get_doc_content(Mongo.unescape_doc(mongo_doc))
        if content is not None:
            pending[doc_id] = content
            num_indexed += 1
            if num_indexed % bulk_size == 0:
                # a full bulk has been collected: flush it and start anew
                es.add_docs_bulk(pending)
                pending = {}
                PLOGGER.info(str(num_indexed / 1000) + "K documents indexed")

    # whatever is left over after the last full bulk
    es.add_docs_bulk(pending)
    PLOGGER.info("Finished indexing (" + str(num_indexed) +
                 " documents in total)")
def main():
    """Creates the "toy_index" index and fills it with five song documents."""
    # (title, content) pairs; document IDs are their 1-based positions.
    songs = [
        ("Rap God",
         "gonna, gonna, Look, I was gonna go easy on you and not to hurt your feelings"),
        ("Lose Yourself",
         "Yo, if you could just, for one minute Or one split second in time, forget everything Everything that bothers you, or your problems Everything, and follow me"),
        ("Love The Way You Lie",
         "Just gonna stand there and watch me burn But that's alright, because I like the way it hurts"),
        # multi-valued content field
        ("The Monster",
         [
             "gonna gonna I'm friends with the monster",
             "That's under my bed Get along with the voices inside of my head",
         ]),
        ("Beautiful",
         "Lately I've been hard to reach I've been too long on my own Everybody has a private world Where they can be alone"),
    ]

    field_mappings = {}
    for field in ("title", "content"):
        field_mappings[field] = Elastic.analyzed_field()

    toy_docs = {}
    for doc_id, (title, content) in enumerate(songs, start=1):
        toy_docs[doc_id] = {"title": title, "content": content}

    es = Elastic("toy_index")
    es.create_index(field_mappings, force=True)
    es.add_docs_bulk(toy_docs)
    print("index has been built")
class IndexerDBpediaTypes(object):
    """Builds an index with one document per DBpedia ontology type.

    The content of a type document is the concatenation of the abstracts of
    the entities assigned to that type. Entity abstracts and type assignments
    are loaded into memory when the object is created.
    """

    __DOC_TYPE = "doc"  # we don't make use of types
    __MAPPINGS = {
        "id": Elastic.notanalyzed_field(),
        "content": Elastic.analyzed_field(),
    }

    def __init__(self, config):
        """Loads entity abstracts and entity types from the DBpedia files.

        :param config: dict with the keys "index_name" and
            "dbpedia_files_path"
        """
        self.__elastic = None
        self.__config = config
        self.__index_name = config["index_name"]
        self.__dbpedia_path = config["dbpedia_files_path"]

        # For triple parsing
        self.__prefix = URIPrefix()
        self.__triple = Triple()
        self.__ntparser = NTriplesParser(self.__triple)

        # Entity abstract and type assignments kept in memory
        self.__entity_abstracts = {}
        self.__load_entity_abstracts()
        self.__types_entities = defaultdict(list)
        self.__load_entity_types()

    @property
    def name(self):
        """Name of the index."""
        return self.__index_name

    def __parse_line(self, line):
        """Parses a line from a ttl file and returns subject and object pair.

        It is used for parsing DBpedia abstracts and entity types.
        The subject is always prefixed.
        For object URIs, it is returned prefixed if from DBpedia otherwise
        None (i.e., types); literal objects are always returned, UTF-8
        encoded as bytes (i.e., abstracts).

        :param line: line of the file, as bytes or str
        :return: (subject, object) pair; (None, None) if the line could not
            be parsed as a triple
        """
        line = line.decode("utf-8") if isinstance(line, bytes) else line
        try:
            self.__ntparser.parsestring(line)
        except ParseError:  # skip lines that couldn't be parsed
            return None, None
        if self.__triple.subject() is None:  # only if parsed as a triple
            return None, None

        subj = self.__prefix.get_prefixed(self.__triple.subject())
        obj_term = self.__triple.object()
        obj = None
        if isinstance(obj_term, URIRef):
            if obj_term.startswith("http://dbpedia.org/ontology"):
                obj = self.__prefix.get_prefixed(obj_term)
        else:
            # Literals stay bytes: __load_entity_types relies on this to
            # tell them apart from (str) prefixed type URIs.
            obj = obj_term.encode("utf-8")
        return subj, obj

    def __load_entity_abstracts(self):
        """Loads the entity abstracts file into self.__entity_abstracts."""
        num_lines = 0
        filename = os.sep.join([self.__dbpedia_path, ENTITY_ABSTRACTS_FILE])
        PLOGGER.info("Loading entity abstracts from {}".format(filename))
        for line in FileUtils.read_file_as_list(filename):
            entity, abstract = self.__parse_line(line)
            if abstract:  # skip empty objects
                self.__entity_abstracts[entity] = abstract

            num_lines += 1
            if num_lines % 10000 == 0:
                PLOGGER.info(" {}K lines processed".format(num_lines // 1000))
        PLOGGER.info(" Done.")

    def __load_entity_types(self):
        """Loads the type -> entities assignments from the types files.

        Only assignments of "<dbpedia:" entities to "<dbo:" types are kept.
        """
        num_lines = 0
        for types_file in ENTITY_TYPES_FILES:
            filename = os.sep.join([self.__dbpedia_path, types_file])
            PLOGGER.info("Loading entity types from {}".format(filename))
            for line in FileUtils.read_file_as_list(filename):
                entity, entity_type = self.__parse_line(line)
                # None (unparsed line / non-DBpedia URI) and bytes (literal)
                # are not type URIs.
                if not isinstance(entity_type, str):
                    continue
                if not entity_type.startswith("<dbo:"):
                    PLOGGER.info(" Non-DBpedia type: {}".format(entity_type))
                    continue
                if not entity.startswith("<dbpedia:"):
                    PLOGGER.info(" Invalid entity: {}".format(entity))
                    continue
                self.__types_entities[entity_type].append(entity)

                num_lines += 1
                if num_lines % 10000 == 0:
                    PLOGGER.info(
                        " {}K lines processed".format(num_lines // 1000))
        PLOGGER.info(" Done.")

    def __abstract_text(self, entity):
        """Returns the stored abstract of an entity as text ("" if none)."""
        abstract = self.__entity_abstracts.get(entity, b"")
        # __parse_line yields str (not bytes) for DBpedia ontology URIs; if
        # such an object ended up among the abstracts, .decode would fail.
        if isinstance(abstract, bytes):
            return abstract.decode("utf-8")
        return abstract

    def __make_type_doc(self, type_name):
        """Gets the document representation of a type to be indexed, from its
        entity short abstracts.

        If the abstracts of all entities together exceed MAX_BULKING_DOC_SIZE
        characters, a random sample of the entities is used instead.

        :param type_name: (prefixed) type
        :return: dict with the single field "content"
        """
        entities = self.__types_entities[type_name]
        content = "\n".join([self.__abstract_text(e) for e in entities])

        if len(content) > MAX_BULKING_DOC_SIZE:
            PLOGGER.info("Type {} has content larger than allowed: {}.".format(
                type_name, len(content)))

            # we randomly sample a subset of Y entity abstracts, s.t.
            # Y * AVG_SHORT_ABSTRACT_LEN <= MAX_BULKING_DOC_SIZE
            num_entities = len(entities)
            amount_abstracts_to_sample = min(
                floor(MAX_BULKING_DOC_SIZE / AVG_SHORT_ABSTRACT_LEN),
                num_entities)
            entities_sample = [
                entities[i] for i in sample(range(num_entities),
                                            amount_abstracts_to_sample)
            ]

            # Collect the pieces and join once: avoids re-copying the whole
            # content for every abstract and the leading "\n" that joining
            # onto an initially empty string would produce.
            selected = []
            size = 0
            for entity in entities_sample:
                text = self.__abstract_text(entity)
                # one extra character for the "\n" between two abstracts
                new_size = size + len(text) + (1 if selected else 0)
                # we add an abstract only if by doing so it will not exceed
                # MAX_BULKING_DOC_SIZE
                if new_size > MAX_BULKING_DOC_SIZE:
                    break
                selected.append(text)
                size = new_size
            content = "\n".join(selected)

        return {"content": content}

    def build_index(self, force=False):
        """Builds the index.

        Note: since DBpedia only has a few hundred types, no bulk indexing is
        needed.

        :param force: True iff it is required to overwrite the index (i.e. by
        creating it by force); False by default.
        :type force: bool
        :return:
        """
        PLOGGER.info("Building type index {}".format(self.__index_name))
        self.__elastic = Elastic(self.__index_name)
        self.__elastic.create_index(mappings=self.__MAPPINGS, force=force)

        for type_name in self.__types_entities:
            PLOGGER.info(" Adding {} ...".format(type_name))
            contents = self.__make_type_doc(type_name)
            self.__elastic.add_doc(type_name, contents)

        PLOGGER.info(" Done.")
class IndexerDBpediaTypes(object):
    """Builds an index with one document per DBpedia ontology type.

    The content of a type document is the concatenation of the abstracts of
    the entities listed for that type in the type2entity file. Entity
    abstracts are loaded into memory when the object is created; the
    type2entity file is streamed while indexing.
    """

    __DOC_TYPE = "doc"  # we don't make use of types
    __MAPPINGS = {
        # ID_KEY: Elastic.notanalyzed_field(),
        CONTENT_KEY: Elastic.analyzed_field(),
    }

    def __init__(self, config):
        """Loads the entity abstracts.

        :param config: dict with the keys "index_name", "type2entity_file",
            "entity_abstracts_file" and, optionally, "model"
        """
        self.__elastic = None
        self.__config = config
        # NOTE(review): the model is stored but never handed to
        # create_index() in build_index -- confirm whether that is intended.
        self.__model = config.get("model", Elastic.BM25)
        self.__index_name = config["index_name"]
        self.__type2entity_file = config["type2entity_file"]
        self.__entity_abstracts = {}
        self.__load_entity_abstracts(config["entity_abstracts_file"])

    @property
    def name(self):
        """Name of the index."""
        return self.__index_name

    def __load_entity_abstracts(self, filename):
        """Loads entity abstracts into self.__entity_abstracts.

        Abstracts are stored as UTF-8 encoded bytes, keyed by the prefixed
        subject. Lines that cannot be parsed as triples, triples with a URI
        object and empty abstracts are skipped.

        :param filename: name of the abstracts file
        """
        prefix = URIPrefix()
        t = Triple()
        p = NTriplesParser(t)
        lines_counter = 0
        PLOGGER.info("Loading entity abstracts from {}".format(filename))
        for line in FileUtils.read_file_as_list(filename):
            # basic line parsing
            line = line.decode("utf-8") if isinstance(line, bytes) else line
            try:
                p.parsestring(line)
            except ParseError:  # skip lines that couldn't be parsed
                continue
            if t.subject() is None:  # only if parsed as a triple
                continue

            # Subject and object identification
            subj = prefix.get_prefixed(t.subject())
            if isinstance(t.object(), URIRef):
                continue  # an abstract is a literal, never a URI
            obj = t.object().encode("utf-8")
            if not obj:
                continue  # skip empty objects
            self.__entity_abstracts[subj] = obj

            lines_counter += 1
            if lines_counter % 10000 == 0:
                PLOGGER.info(
                    "\t{}K lines processed".format(lines_counter // 1000))

        PLOGGER.info("\n### Loading entity abstracts... Done.")

    def __make_type_doc(self, entities, last_type):
        """Gets the document representation of a type to be indexed, from its
        entity short abstracts.

        If the abstracts of all entities together exceed MAX_BULKING_DOC_SIZE
        characters, a random sample of the entities is used instead.

        :param entities: list of the (prefixed) entities of the type
        :param last_type: the type; only used for logging
        :return: dict with the single field CONTENT_KEY
        """
        content = ABSTRACTS_SEPARATOR.join(
            [self.__entity_abstracts.get(e, b"").decode("utf-8")
             for e in entities])

        if len(content) > MAX_BULKING_DOC_SIZE:
            PLOGGER.info("Type {} has content larger than allowed: {}.".format(
                last_type, len(content)))

            # we randomly sample a subset of Y entity abstracts, s.t.
            # Y * AVG_SHORT_ABSTRACT_LEN <= MAX_BULKING_DOC_SIZE
            amount_abstracts_to_sample = min(
                floor(MAX_BULKING_DOC_SIZE / AVG_SHORT_ABSTRACT_LEN),
                len(entities))
            entities_sample = [
                entities[i] for i in sample(range(len(entities)),
                                            amount_abstracts_to_sample)
            ]

            # Collect the pieces and join once: avoids re-copying the whole
            # content for every abstract and the leading separator that
            # appending to an initially empty string would produce.
            selected = []
            size = 0
            for entity in entities_sample:
                text = self.__entity_abstracts.get(entity, b"").decode("utf-8")
                new_size = size + len(text)
                if selected:
                    new_size += len(ABSTRACTS_SEPARATOR)
                # we add an abstract only if by doing so it will not exceed
                # MAX_BULKING_DOC_SIZE
                if new_size > MAX_BULKING_DOC_SIZE:
                    break
                selected.append(text)
                size = new_size
            content = ABSTRACTS_SEPARATOR.join(selected)

        return {CONTENT_KEY: content}

    def build_index(self, force=False):
        """Builds the index.

        The type2entity file is read line by line; each usable line holds a
        type followed by an entity. A type document is created whenever the
        type changes from one line to the next, so the lines of a type are
        expected to be contiguous: a type that shows up again later yields
        another document under the same ID. Documents are indexed in bulks
        of BULK_LEN.

        :param force: True iff it is required to overwrite the index (i.e. by
        creating it by force); False by default.
        :type force: bool
        :return:
        """
        self.__elastic = Elastic(self.__index_name)
        self.__elastic.create_index(mappings=self.__MAPPINGS, force=force)

        prefix = URIPrefix()

        # For indexing types in bulk
        types_bulk = {}  # dict from type id to type(=doc)

        # process type2entity file
        last_type = None
        entities = []
        types_counter = 0
        with FileUtils.open_file_by_type(self.__type2entity_file) as f:
            for line in f:
                # Binary-mode files yield bytes, text-mode files yield str.
                if isinstance(line, bytes):
                    line = line.decode()
                if not line.startswith("<"):  # bad-formed lines in dataset
                    continue
                subj, obj = line.rstrip().split()

                type_id = prefix.get_prefixed(subj)  # subject prefixed
                entity = prefix.get_prefixed(obj)

                # use only DBpedia Ontology native types (no bibo, foaf,
                # schema, etc.)
                if not type_id.startswith(DBO_PREFIX):
                    continue

                if last_type is not None and type_id != last_type:
                    # moving to new type, so:
                    # create a doc for this type, with all the abstracts for
                    # its entities, and store it in a bulk
                    types_counter += 1
                    types_bulk[last_type] = self.__make_type_doc(entities,
                                                                 last_type)
                    entities = []  # important to reset it

                    if types_counter % BULK_LEN == 0:
                        # index the bulk of BULK_LEN docs
                        self.__elastic.add_docs_bulk(types_bulk)
                        types_bulk.clear()  # NOTE: important to reset it
                        PLOGGER.info(
                            "\tIndexing a bulk of {} docs (types)... OK. "
                            "{} types already indexed.".format(BULK_LEN,
                                                               types_counter))

                last_type = type_id
                entities.append(entity)

        # index the last type; if the file had no usable line there is none,
        # and no document must be created under the ID None
        if last_type is not None:
            types_counter += 1
            PLOGGER.info(
                "\n\tFound {}-th (last) type: {}\t\t with # of entities: {}"
                .format(types_counter, last_type, len(entities)))
            types_bulk[last_type] = self.__make_type_doc(entities, last_type)
            self.__elastic.add_docs_bulk(types_bulk)  # a tiny bulk :)
            # no need to reset neither entities nor types_bulk :P

        PLOGGER.info("\n### Indexing all {} found docs (types)... Done."
                     .format(types_counter))