コード例 #1
0
ファイル: indexer_mongo.py プロジェクト: zxlzr/nordlys
    def build(self, callback_get_doc_content, bulk_size=1000):
        """Builds the DBpedia index from the mongo collection.

        To speed up indexing, documents are added to the index in bulks.
        There is an optimum value for the bulk size; try to figure it out.

        :param callback_get_doc_content: a function that gets a document from
            mongo (with keys and _id field unescaped) and returns the content
            for indexing; documents for which it returns None are skipped
        :param bulk_size: Number of documents to be added to the index as a bulk
        """
        PLOGGER.info("Building " + self.__index_name + " ...")
        elastic = Elastic(self.__index_name)
        elastic.create_index(self.__mappings, model=self.__model, force=True)

        i = 0  # number of documents accepted for indexing so far
        docs = dict()  # current bulk: unescaped doc id -> content to index
        for mdoc in self.__mongo.find_all(no_timeout=True):
            docid = Mongo.unescape(mdoc[Mongo.ID_FIELD])

            # get back document from mongo with keys and _id field unescaped
            doc = callback_get_doc_content(Mongo.unescape_doc(mdoc))
            if doc is None:
                continue
            docs[docid] = doc

            i += 1
            if i % bulk_size == 0:
                elastic.add_docs_bulk(docs)
                docs = dict()
                # Report the exact running count: "i / 1000" is a float on
                # Python 3 and is misleading when bulk_size is not a
                # multiple of 1000.
                PLOGGER.info(str(i) + " documents indexed")

        # Index the last (partial) bulk; skip it when nothing is pending so
        # that no empty bulk request is issued.
        if docs:
            elastic.add_docs_bulk(docs)
        PLOGGER.info("Finished indexing (" + str(i) + " documents in total)")
コード例 #2
0
    def build_collection(self):
        """Fills the surface form collection with name variants from DBpedia.

        The target collection is dropped first. Every DBpedia entity that has
        a name then contributes surface forms depending on what it is:

        - a redirect page: its name, pointing to the first redirect target;
        - a disambiguation page: its name, pointing to each listed entity;
        - a proper entity: its name and all of its <foaf:name> values.

        Progress is printed after every 1000 named entities.
        """
        self.__mongo = Mongo(MONGO_HOST, MONGO_DB, self.__collection)
        self.__mongo.drop()

        foaf_name_predicate = "<foaf:name>"
        processed = 0

        # walk over every DBpedia entity
        for raw_doc in self.__mongo_dbpedia.find_all():
            entity = EntityUtils(Mongo.unescape_doc(raw_doc))

            # nameless entities contribute nothing and are not counted
            if not entity.has_name():
                continue

            name = entity.get_name()

            # redirect page: the name maps to the first redirect target
            if entity.is_redirect():
                redirect_targets = entity.get_predicate(
                    EntityUtils.PREDICATE_REDIRECT)
                self.__add_surface_form(
                    name, EntityUtils.PREDICATE_REDIRECT, redirect_targets[0])

            # disambiguation page: the name maps to every listed entity
            if entity.has_predicate(EntityUtils.PREDICATE_DISAMBIGUATE):
                for target_id in entity.get_predicate(
                        EntityUtils.PREDICATE_DISAMBIGUATE):
                    self.__add_surface_form(
                        name, EntityUtils.PREDICATE_DISAMBIGUATE, target_id)

            # proper entity (neither redirect nor disambiguation page)
            if entity.is_entity():
                own_id = entity.get_id()
                # the main entity name
                self.__add_surface_form(
                    name, EntityUtils.PREDICATE_NAME, own_id)
                # any alternative names
                if entity.has_predicate(foaf_name_predicate):
                    alt_names = entity.get_predicate(foaf_name_predicate)
                    for alt_name in alt_names:
                        self.__add_surface_form(
                            alt_name, foaf_name_predicate, own_id)

            processed += 1
            if not processed % 1000:
                print(str(processed // 1000) + "K entities processed")