Exemple #1
0
class IndexerMongo(object):
    """Builds an Elasticsearch index from the documents of a Mongo collection."""

    def __init__(self, index_name, mappings, collection, model=Elastic.BM25):
        """Stores the index settings and opens the source Mongo collection.

        :param index_name: name of the index, passed to ``Elastic``
        :param mappings: mappings passed to ``Elastic.create_index``
        :param collection: name of the Mongo collection to read documents from
        :param model: model passed to ``Elastic.create_index``
        """
        self.__index_name = index_name
        self.__mappings = mappings
        self.__mongo = Mongo(MONGO_HOST, MONGO_DB, collection)
        self.__model = model

    def build(self, callback_get_doc_content, bulk_size=1000):
        """Builds the index from the mongo collection.

        To speed up indexing, documents are sent to the index in bulks.
        There is an optimum value for the bulk size; try to figure it out.

        The index is created with ``force=True``
        (NOTE(review): presumably this replaces an existing index of the
        same name -- confirm against ``Elastic.create_index``).

        :param callback_get_doc_content: a function that gets a (unescaped)
            document from mongo and returns the content for indexing; when it
            returns None the document is skipped and not counted
        :param bulk_size: number of documents to be added to the index as a bulk
        """
        PLOGGER.info("Building " + self.__index_name + " ...")
        elastic = Elastic(self.__index_name)
        elastic.create_index(self.__mappings, model=self.__model, force=True)

        indexed = 0  # number of documents accepted for indexing so far
        docs = {}
        for mdoc in self.__mongo.find_all(no_timeout=True):
            docid = Mongo.unescape(mdoc[Mongo.ID_FIELD])

            # get back document from mongo with keys and _id field unescaped
            doc = callback_get_doc_content(Mongo.unescape_doc(mdoc))
            if doc is None:
                continue
            docs[docid] = doc

            indexed += 1
            if indexed % bulk_size == 0:
                elastic.add_docs_bulk(docs)
                # Rebind instead of clearing: the previous dict was handed
                # over to add_docs_bulk and must not be mutated afterwards.
                docs = {}
                # Integer division, so the message reads "1K" and not "1.0K".
                PLOGGER.info(str(indexed // 1000) + "K documents indexed")

        # Index the last (partial) bulk. Nothing is left over when the number
        # of documents is an exact multiple of bulk_size (or zero), and an
        # empty bulk request should not be submitted.
        if docs:
            elastic.add_docs_bulk(docs)
        PLOGGER.info("Finished indexing (" + str(indexed) + " documents in total)")
class DBpediaSurfaceforms2Mongo(object):
    """Collects DBpedia entity surface forms into a Mongo collection."""

    # Surface forms whose UTF-8 encoding is at least this many bytes are
    # skipped (Mongo key limit).
    # NOTE(review): older MongoDB servers may count some structural overhead
    # against this limit -- lower the threshold if near-limit keys are rejected.
    _MAX_KEY_BYTES = 1024

    def __init__(self, config):
        """Inserts DBpedia surface forms to Mongo.

        :param config: dict-like configuration; must contain KEY_COLLECTION
            (target collection name) and may contain KEY_LOWERCASE
            (defaults to True, written back into ``config``)
        """
        self.__check_config(config)
        self.__collection = config[KEY_COLLECTION]
        self.__lowercase = config[KEY_LOWERCASE]
        self.__mongo_dbpedia = Mongo(MONGO_HOST, MONGO_DB,
                                     MONGO_COLLECTION_DBPEDIA)
        # The target collection is opened lazily in build_collection().
        self.__mongo = None

    @staticmethod
    def __check_config(config):
        """Checks config parameters and sets default values.

        Any problem with the configuration is printed and terminates the
        process with exit status 1.
        """
        try:
            if KEY_COLLECTION not in config:
                raise Exception(KEY_COLLECTION + " is missing")
            if KEY_LOWERCASE not in config:
                config[KEY_LOWERCASE] = True
        except Exception as e:
            print("Error in config file: ", e)
            sys.exit(1)

    def __add_surface_form(self, surface_form, predicate, entity_id):
        """Adds a surface form (removes the disambiguation part from the surface form, if exists).

        Surface forms that are too long to be used as a Mongo key are
        silently skipped.

        :param surface_form: surface form for entity
        :param predicate: predicate that entity is extracted from e.g. <rdfs:label>
        :param entity_id: entity ID
        """
        # Measure the encoded byte length: sys.getsizeof() reports the
        # in-memory size of the Python object (header plus an internal
        # 1/2/4-bytes-per-character buffer), which is unrelated to the
        # number of bytes the key occupies once stored.
        if len(surface_form.encode("utf-8")) >= self._MAX_KEY_BYTES:
            return
        surface_form = surface_form.replace("(disambiguation)", "").strip()
        if self.__lowercase:
            surface_form = surface_form.lower()
        self.__mongo.inc_in_dict(surface_form, predicate, entity_id, 1)

    def build_collection(self):
        """Adds all name variants from DBpedia.

        Drops the target collection first, then walks every document of the
        DBpedia collection and records its name for redirects,
        disambiguations and regular entities (plus <foaf:name> variants).
        """
        self.__mongo = Mongo(MONGO_HOST, MONGO_DB, self.__collection)
        self.__mongo.drop()

        foaf_name_predicate = "<foaf:name>"

        # iterate through all DBpedia entities
        processed = 0  # counts only entities that have a name
        for mdoc in self.__mongo_dbpedia.find_all():
            entity = EntityUtils(Mongo.unescape_doc(mdoc))

            # skips entities without names
            if not entity.has_name():
                continue

            surface_form = entity.get_name()

            # the entity is redirect page: credit the name to the first target
            if entity.is_redirect():
                entity_id = entity.get_predicate(
                    EntityUtils.PREDICATE_REDIRECT)[0]
                self.__add_surface_form(surface_form,
                                        EntityUtils.PREDICATE_REDIRECT,
                                        entity_id)

            # the entity is disambiguation page: credit the name to every target
            if entity.has_predicate(EntityUtils.PREDICATE_DISAMBIGUATE):
                entity_ids = entity.get_predicate(
                    EntityUtils.PREDICATE_DISAMBIGUATE)
                for entity_id in entity_ids:
                    self.__add_surface_form(surface_form,
                                            EntityUtils.PREDICATE_DISAMBIGUATE,
                                            entity_id)

            # entity is not a redirect/disambiguation page and has name and abstract
            if entity.is_entity():
                entity_id = entity.get_id()
                # adds entity name
                self.__add_surface_form(surface_form,
                                        EntityUtils.PREDICATE_NAME, entity_id)
                # adds other entity names
                if entity.has_predicate(foaf_name_predicate):
                    for alt_name in entity.get_predicate(
                            foaf_name_predicate):
                        self.__add_surface_form(alt_name,
                                                foaf_name_predicate, entity_id)
            processed += 1
            if processed % 1000 == 0:
                print(str(processed // 1000) + "K entities processed")