def build(self, callback_get_doc_content, bulk_size=1000):
    """Builds the DBpedia index from the mongo collection.

    To speed up indexing, documents are added to the index in bulks.
    There is an optimum value for the bulk size; try to figure it out.

    :param callback_get_doc_content: a function that gets a document from
        mongo (with keys and _id field unescaped) and returns the content
        for indexing, or None if the document is to be skipped
    :param bulk_size: number of documents to be added to the index as a
        bulk; must be positive
    :raises ValueError: if bulk_size is not positive
    """
    # Fail fast, before the index is touched: create_index() below is called
    # with force=True (presumably replacing any existing index -- confirm
    # against Elastic.create_index), and a zero bulk size would otherwise
    # only surface as a ZeroDivisionError on the first indexed document.
    if bulk_size <= 0:
        raise ValueError(
            "bulk_size must be positive, got " + repr(bulk_size))

    PLOGGER.info("Building " + self.__index_name + " ...")
    elastic = Elastic(self.__index_name)
    elastic.create_index(self.__mappings, model=self.__model, force=True)

    num_indexed = 0  # documents accepted by the callback so far
    pending = {}  # doc_id -> content, waiting to be sent as one bulk
    for mdoc in self.__mongo.find_all(no_timeout=True):
        # The ID is read before the document is handed to unescape_doc().
        doc_id = Mongo.unescape(mdoc[Mongo.ID_FIELD])

        # get back document from mongo with keys and _id field unescaped
        doc = callback_get_doc_content(Mongo.unescape_doc(mdoc))
        if doc is None:
            # the callback rejected this document
            continue
        pending[doc_id] = doc

        num_indexed += 1
        if num_indexed % bulk_size == 0:
            elastic.add_docs_bulk(pending)
            pending = {}
            PLOGGER.info(str(num_indexed / 1000) + "K documents indexed")

    # indexing the last bulk of documents; nothing is sent when the total is
    # an exact multiple of bulk_size or no document was accepted at all
    if pending:
        elastic.add_docs_bulk(pending)
    PLOGGER.info(
        "Finished indexing (" + str(num_indexed) + " documents in total)")
def get_doc_content(self, doc):
    """Creates the index content for a given mongo document.

    Both the FSDM fields and the individual (top) fields are kept for each
    document. The result is also stored on the instance as
    ``self._doc_content``.

    :param doc: a Mongo document
    :return: a document ready for indexing, or None if the document is
        ignored (its ID does not start with "<dbpedia:" or a must-have
        field is missing)
    """
    entity_prefix = "<dbpedia:"

    # Non-DBpedia IDs are dropped right away (just to speed up).
    if not Mongo.unescape(doc[Mongo.ID_FIELD]).startswith(entity_prefix):
        return None

    # Every must-have field has to be present in the document.
    if any(required not in doc for required in self._config["must_have"]):
        return None

    self._doc_content = defaultdict(list)
    content = self._doc_content
    cfg = self._config

    for name in doc:
        value = doc[name]

        # Pick the FSDM field this mongo field contributes to, if any.
        target = None
        extra_args = ()
        if name.lower() in cfg["names"]:
            target = "names"
        elif name in cfg["categories"]:
            target = "categories"
        elif name in cfg["similar_entity_names"]:
            target = "similar_entity_names"
        elif name not in cfg["blacklist"]:
            # Only the first value decides between entity and attribute.
            # NOTE(review): assumes value is a non-empty sequence of
            # strings -- confirm against the collection schema.
            if value[0].startswith(entity_prefix):
                target = "related_entity_names"
            else:
                target = "attributes"
            # These two fields also pass the field name along.
            extra_args = (name,)

        if target is not None:
            content[target] += self.__get_field_value(value, *extra_args)

        # Top fields are additionally indexed under their own name.
        if name in self.__top_fields:
            content[name] += self.__get_field_value(value)

    # Keep only unique phrases per FSDM field, then append them to the
    # catch-all field.
    for field in self._fsdm_fields:
        unique_phrases = list(set(content[field]))
        content[field] = unique_phrases
        content[Elastic.FIELD_CATCHALL] += unique_phrases

    return content