Ejemplo n.º 1
0
def main():
    """Build the demo index "toy_index" from five hard-coded song documents.

    Creates the index through the ``Elastic`` wrapper (passing ``force=True``
    to ``create_index``), bulk-adds the documents keyed by the integer ids
    1..5, and prints a confirmation message.
    """
    # (title, content) pairs; the position in this tuple (1-based) becomes the doc id.
    # Note that the fourth entry carries a list of two strings as its content.
    songs = (
        ("Rap God",
         "gonna, gonna, Look, I was gonna go easy on you and not to hurt your feelings"),
        ("Lose Yourself",
         "Yo, if you could just, for one minute Or one split second in time, forget everything Everything that bothers you, or your problems Everything, and follow me"),
        ("Love The Way You Lie",
         "Just gonna stand there and watch me burn But that's alright, because I like the way it hurts"),
        ("The Monster",
         [
             "gonna gonna I'm friends with the monster",
             "That's under my bed Get along with the voices inside of my head"
         ]),
        ("Beautiful",
         "Lately I've been hard to reach I've been too long on my own Everybody has a private world Where they can be alone"),
    )
    documents = {
        doc_id: {"title": song_title, "content": song_text}
        for doc_id, (song_title, song_text) in enumerate(songs, start=1)
    }

    # Only "title" and "content" are mapped; no mapping is declared for an "id" field.
    # A separate analyzed_field() object is requested for each field.
    field_mappings = {field: Elastic.analyzed_field() for field in ("title", "content")}

    toy_index = Elastic("toy_index")
    toy_index.create_index(field_mappings, force=True)
    toy_index.add_docs_bulk(documents)
    print("index has been built")
Ejemplo n.º 2
0
class IndexerDBpediaTypes(object):
    """Indexes DBpedia Ontology types, one document per type.

    The document of a type is the concatenation (joined with
    ``ABSTRACTS_SEPARATOR``) of the abstracts of the entities listed for that
    type in the type2entity file. Abstracts are loaded into memory when the
    instance is constructed; the index itself is only written by
    :meth:`build_index`.
    """

    __DOC_TYPE = "doc"  # we don't make use of types
    __MAPPINGS = {
        # ID_KEY: Elastic.notanalyzed_field(),
        CONTENT_KEY: Elastic.analyzed_field(),
    }

    def __init__(self, config):
        """Stores the configuration and eagerly loads the entity abstracts.

        :param config: mapping with the required keys "index_name",
            "type2entity_file" and "entity_abstracts_file", and the optional
            key "model" (defaults to ``Elastic.BM25``).
        :raises KeyError: if a required key is missing from ``config``.
        """
        self.__elastic = None  # created in build_index()
        self.__config = config
        self.__model = config.get("model", Elastic.BM25)
        self.__index_name = config["index_name"]
        self.__type2entity_file = config["type2entity_file"]
        self.__entity_abstracts = {}  # prefixed entity id -> UTF-8 encoded abstract (bytes)
        self.__load_entity_abstracts(config["entity_abstracts_file"])

    @property
    def name(self):
        """Name of the index this indexer writes to."""
        return self.__index_name

    def __load_entity_abstracts(self, filename):
        """Fills the entity -> abstract map from a file of triples.

        Lines that fail to parse, or that do not yield a subject, are skipped.
        Abstracts are kept as UTF-8 encoded bytes.

        :param filename: path of the entity abstracts file.
        """
        prefix = URIPrefix()
        t = Triple()
        p = NTriplesParser(t)
        lines_counter = 0
        PLOGGER.info("Loading entity abstracts from {}".format(filename))
        for line in FileUtils.read_file_as_list(filename):
            # basic line parsing
            line = line.decode("utf-8") if isinstance(line, bytes) else line
            try:
                p.parsestring(line)
            except ParseError:  # skip lines that couldn't be parsed
                continue
            if t.subject() is None:  # only if parsed as a triple
                continue

            # Subject and object identification
            subj = prefix.get_prefixed(t.subject())
            obj = t.object()
            if isinstance(obj, URIRef):
                # A URI is not abstract text: record an empty abstract. It must be
                # bytes (not str) because every stored value is later decoded.
                abstract = b""
            else:
                abstract = obj.encode("utf-8")
                if not abstract:
                    continue  # skip empty objects
            self.__entity_abstracts[subj] = abstract

            lines_counter += 1
            if lines_counter % 10000 == 0:
                PLOGGER.info("\t{}K lines processed".format(lines_counter // 1000))

        PLOGGER.info("\n### Loading entity abstracts... Done.")

    def __abstract_text(self, entity):
        """Returns the abstract of ``entity`` as text ("" when none is known)."""
        return self.__entity_abstracts.get(entity, b"").decode("utf-8")

    def __make_type_doc(self, entities, last_type):
        """Gets the document representation of a type to be indexed, from its entity short abstracts.

        :param entities: list of (prefixed) entity ids belonging to the type.
        :param last_type: id of the type; only used in the log message.
        :return: dict with a single ``CONTENT_KEY`` entry holding the joined abstracts. If joining
            all abstracts would exceed ``MAX_BULKING_DOC_SIZE`` characters, a random sample of the
            abstracts that fits within that limit is used instead.
        """
        content = ABSTRACTS_SEPARATOR.join([self.__abstract_text(e) for e in entities])

        if len(content) > MAX_BULKING_DOC_SIZE:
            PLOGGER.info("Type {} has content larger than allowed: {}.".format(last_type, len(content)))

            # we randomly sample a subset of Y entity abstracts, s.t. Y * AVG_SHORT_ABSTRACT_LEN <= MAX_BULKING_DOC_SIZE
            amount_abstracts_to_sample = min(int(floor(MAX_BULKING_DOC_SIZE / AVG_SHORT_ABSTRACT_LEN)),
                                             len(entities))

            # Collect the pieces and join once at the end; `size` tracks the length the joined
            # string would have, so the limit check does not need to build candidate strings.
            parts = []
            size = 0
            for entity in sample(entities, amount_abstracts_to_sample):
                abstract = self.__abstract_text(entity)
                # a separator is only needed between abstracts, not before the first one
                extra = len(abstract) + (len(ABSTRACTS_SEPARATOR) if parts else 0)
                # we add an abstract only if by doing so it will not exceed MAX_BULKING_DOC_SIZE
                if size + extra > MAX_BULKING_DOC_SIZE:
                    break
                parts.append(abstract)
                size += extra
            content = ABSTRACTS_SEPARATOR.join(parts)

        return {CONTENT_KEY: content}

    def build_index(self, force=False):
        """Builds the index.

        Reads the type2entity file line by line, groups consecutive lines of the same type, and
        indexes one document per type in bulks of ``BULK_LEN`` documents. Only types whose prefixed
        id starts with ``DBO_PREFIX`` are indexed. If no such type is found, nothing is indexed.

        :param force: True iff it is required to overwrite the index (i.e. by creating it by force); False by default.
        :type force: bool
        :return:
        """
        self.__elastic = Elastic(self.__index_name)
        self.__elastic.create_index(mappings=self.__MAPPINGS, force=force)
        prefix = URIPrefix()

        # For indexing types in bulk
        types_bulk = {}  # dict from type id to type(=doc)

        # process type2entity file
        # NOTE(review): a type is closed as soon as a different type shows up, so this presumably
        # expects the file to be grouped by type -- confirm against the dataset.
        last_type = None
        entities = []
        types_counter = 0
        with FileUtils.open_file_by_type(self.__type2entity_file) as f:
            for line in f:
                # lines may come as bytes or as text, depending on how the file was opened
                line = line.decode("utf-8") if isinstance(line, bytes) else line
                if not line.startswith("<"):  # bad-formed lines in dataset
                    continue
                subj, obj = line.rstrip().split()

                type_id = prefix.get_prefixed(subj)  # subject prefixed
                entity = prefix.get_prefixed(obj)

                # use only DBpedia Ontology native types (no bibo, foaf, schema, etc.)
                if not type_id.startswith(DBO_PREFIX):
                    continue

                if last_type is not None and type_id != last_type:
                    # moving to new type, so:
                    # create a doc for this type, with all the abstracts for its entities, and store it in a bulk
                    types_counter += 1
                    types_bulk[last_type] = self.__make_type_doc(entities, last_type)
                    entities = []  # important to reset it

                    if types_counter % BULK_LEN == 0:  # index the bulk of BULK_LEN docs
                        self.__elastic.add_docs_bulk(types_bulk)
                        types_bulk.clear()  # NOTE: important to reset it
                        PLOGGER.info("\tIndexing a bulk of {} docs (types)... OK. "
                                     "{} types already indexed.".format(BULK_LEN, types_counter))

                last_type = type_id
                entities.append(entity)

        if last_type is None:
            # No usable line was read: there is no pending type, so do not index a doc keyed None.
            PLOGGER.info("\n### Indexing all {} found docs (types)... Done.".format(types_counter))
            return

        # index the last type
        types_counter += 1

        PLOGGER.info("\n\tFound {}-th (last) type: {}\t\t with # of entities: {}".format(types_counter, last_type,
                                                                                         len(entities)))

        types_bulk[last_type] = self.__make_type_doc(entities, last_type)
        self.__elastic.add_docs_bulk(types_bulk)  # a tiny bulk :)

        PLOGGER.info("\n### Indexing all {} found docs (types)... Done.".format(types_counter))