Exemple #1
0
def index(mapper, bulk_size=10000):
    """Indexing"""
    pres_prof_mapping = gen_mappings()
    file = open(WP_ST_F, "r")
    index_name = WP_ST_INDEX_ID
    mappings = {
        "content": Elastic.analyzed_field(),
        "professions": Elastic.notanalyzed_field()
    }
    elastic = Elastic(index_name)
    elastic.create_index(mappings, force=True)
    doc_id = 0
    docs = {}
    for line in file:
        doc_id += 1
        profs = []
        while ("[" in line):  # replace [A|B] with A
            matchObj = re.search('\[(.*?)\]', line)
            entity = matchObj.group(1).split("|")[0]
            name = entity.replace("_", " ")
            entity_id = mapper.get_id_from_person(name)
            prof_list = pres_prof_mapping[name]
            prof_list = [mapper.get_id_from_prof(prof) for prof in prof_list]
            profs += prof_list
            line = line.replace("[" + matchObj.group(1) + "]", entity_id)
        docs[doc_id] = {"content": line, "professions": list(set(profs))}
        if len(docs) == bulk_size:  # bulk add 10000 sentences into elastic
            elastic.add_docs_bulk(docs)
            docs = {}
            print(doc_id / 1000, "K documents indexed.")

    # if len(docs) < 10000: # index the last butch of sentences
    elastic.add_docs_bulk(docs)
Exemple #2
0
class IndexerDBpediaTypes(object):
    __DOC_TYPE = "doc"  # we don't make use of types
    __MAPPINGS = {
        "id": Elastic.notanalyzed_field(),
        "content": Elastic.analyzed_field(),
    }

    def __init__(self, config):
        self.__elastic = None
        self.__config = config
        self.__index_name = config["index_name"]
        self.__dbpedia_path = config["dbpedia_files_path"]
        # For triple parsing
        self.__prefix = URIPrefix()
        self.__triple = Triple()
        self.__ntparser = NTriplesParser(self.__triple)
        # Entity abstract and type assignments kept in memory
        self.__entity_abstracts = {}
        self.__load_entity_abstracts()
        self.__types_entities = defaultdict(list)
        self.__load_entity_types()

    @property
    def name(self):
        return self.__index_name

    def __parse_line(self, line):
        """Parses a line from a ttl file and returns subject and object pair.

        It is used for parsing DBpedia abstracts and entity types.
        The subject is always prefixed.
        For object URIs, it is returned prefixed if from DBpedia otherwise
        None (i.e., types); literal objects are always returned (i.e.,
        abstracts).
        """
        line = line.decode("utf-8") if isinstance(line, bytes) else line
        try:
            self.__ntparser.parsestring(line)
        except ParseError:  # skip lines that couldn't be parsed
            return None, None
        if self.__triple.subject() is None:  # only if parsed as a triple
            return None, None

        subj = self.__prefix.get_prefixed(self.__triple.subject())
        obj = None
        if type(self.__triple.object()) is URIRef:
            if self.__triple.object().startswith(
                    "http://dbpedia.org/ontology"):
                obj = self.__prefix.get_prefixed(self.__triple.object())
        else:
            obj = self.__triple.object().encode("utf-8")

        return subj, obj

    def __load_entity_abstracts(self):
        num_lines = 0
        filename = os.sep.join([self.__dbpedia_path, ENTITY_ABSTRACTS_FILE])
        PLOGGER.info("Loading entity abstracts from {}".format(filename))
        for line in FileUtils.read_file_as_list(filename):
            entity, abstract = self.__parse_line(line)
            if abstract and len(abstract) > 0:  # skip empty objects
                self.__entity_abstracts[entity] = abstract

            num_lines += 1
            if num_lines % 10000 == 0:
                PLOGGER.info("  {}K lines processed".format(num_lines // 1000))

        PLOGGER.info("  Done.")

    def __load_entity_types(self):
        num_lines = 0
        for types_file in ENTITY_TYPES_FILES:
            filename = os.sep.join([self.__dbpedia_path, types_file])
            PLOGGER.info("Loading entity types from {}".format(filename))
            for line in FileUtils.read_file_as_list(filename):
                entity, entity_type = self.__parse_line(line)
                if type(entity_type) != str:  # Likely result of parsing error
                    continue
                if not entity_type.startswith("<dbo:"):
                    PLOGGER.info("  Non-DBpedia type: {}".format(entity_type))
                    continue
                if not entity.startswith("<dbpedia:"):
                    PLOGGER.info("  Invalid entity: {}".format(entity))
                    continue
                self.__types_entities[entity_type].append(entity)

                num_lines += 1
                if num_lines % 10000 == 0:
                    PLOGGER.info("  {}K lines processed".format(num_lines //
                                                                1000))
            PLOGGER.info("  Done.")

    def __make_type_doc(self, type_name):
        """Gets the document representation of a type to be indexed, from its
        entity short abstracts."""
        content = "\n".join([
            self.__entity_abstracts.get(e, b"").decode("utf-8")
            for e in self.__types_entities[type_name]
        ])

        if len(content) > MAX_BULKING_DOC_SIZE:
            PLOGGER.info("Type {} has content larger than allowed: {}.".format(
                type_name, len(content)))

            # we randomly sample a subset of Y entity abstracts, s.t.
            # Y * AVG_SHORT_ABSTRACT_LEN <= MAX_BULKING_DOC_SIZE
            num_entities = len(self.__types_entities[type_name])
            amount_abstracts_to_sample = min(
                floor(MAX_BULKING_DOC_SIZE / AVG_SHORT_ABSTRACT_LEN),
                num_entities)
            entities_sample = [
                self.__types_entities[type_name][i] for i in sample(
                    range(num_entities), amount_abstracts_to_sample)
            ]
            content = ""  # reset content
            for entity in entities_sample:
                new_content_candidate = "\n".join([
                    content,
                    self.__entity_abstracts.get(entity, b"").decode("utf-8")
                ])
                # we add an abstract only if by doing so it will not exceed
                # MAX_BULKING_DOC_SIZE
                if len(new_content_candidate) > MAX_BULKING_DOC_SIZE:
                    break
                content = new_content_candidate

        return {"content": content}

    def build_index(self, force=False):
        """Builds the index.

        Note: since DBpedia only has a few hundred types, no bulk indexing is
        needed.

        :param force: True iff it is required to overwrite the index (i.e. by
        creating it by force); False by default.
        :type force: bool
        :return:
        """
        PLOGGER.info("Building type index {}".format(self.__index_name))
        self.__elastic = Elastic(self.__index_name)
        self.__elastic.create_index(mappings=self.__MAPPINGS, force=force)

        for type_name in self.__types_entities:
            PLOGGER.info("  Adding {} ...".format(type_name))
            contents = self.__make_type_doc(type_name)
            self.__elastic.add_doc(type_name, contents)

        PLOGGER.info("  Done.")