def main():
    """Builds a small toy index with a handful of song lyric snippets."""
    index_name = "toy_index"

    mappings = {
        # "id": Elastic.notanalyzed_field(),
        "title": Elastic.analyzed_field(),
        "content": Elastic.analyzed_field(),
    }

    docs = {
        1: {"title": "Rap God",
            "content": "gonna, gonna, Look, I was gonna go easy on you and not to hurt your feelings"},
        2: {"title": "Lose Yourself",
            "content": "Yo, if you could just, for one minute Or one split second in time, forget everything Everything that bothers you, or your problems Everything, and follow me"},
        3: {"title": "Love The Way You Lie",
            "content": "Just gonna stand there and watch me burn But that's alright, because I like the way it hurts"},
        4: {"title": "The Monster",
            "content": ["gonna gonna I'm friends with the monster",
                        "That's under my bed Get along with the voices inside of my head"]},
        5: {"title": "Beautiful",
            "content": "Lately I've been hard to reach I've been too long on my own Everybody has a private world Where they can be alone"}
    }

    elastic = Elastic(index_name)
    elastic.create_index(mappings, force=True)
    elastic.add_docs_bulk(docs)
    print("index has been built")
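
# A minimal entry-point sketch for the toy example above, assuming this snippet
# lives in its own script and that the Elastic wrapper is importable from the
# surrounding project. The guard itself is not shown in the original excerpt.
if __name__ == "__main__":
    main()
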
class IndexerDBpediaTypes(object):
    __DOC_TYPE = "doc"  # we don't make use of types
    __MAPPINGS = {
        # ID_KEY: Elastic.notanalyzed_field(),
        CONTENT_KEY: Elastic.analyzed_field(),
    }

    def __init__(self, config):
        self.__elastic = None
        self.__config = config
        self.__model = config.get("model", Elastic.BM25)
        self.__index_name = config["index_name"]
        self.__type2entity_file = config["type2entity_file"]
        self.__entity_abstracts = {}
        self.__load_entity_abstracts(config["entity_abstracts_file"])

    @property
    def name(self):
        return self.__index_name

    def __load_entity_abstracts(self, filename):
        prefix = URIPrefix()
        t = Triple()
        p = NTriplesParser(t)
        lines_counter = 0
        PLOGGER.info("Loading entity abstracts from {}".format(filename))
        for line in FileUtils.read_file_as_list(filename):
            # basic line parsing
            line = line.decode("utf-8") if isinstance(line, bytes) else line
            try:
                p.parsestring(line)
            except ParseError:  # skip lines that couldn't be parsed
                continue
            if t.subject() is None:  # only if parsed as a triple
                continue

            # Subject and object identification
            subj = prefix.get_prefixed(t.subject())
            obj = b""
            if type(t.object()) is URIRef:
                # PLOGGER.error("Error: it is URIRef the parsed obj")
                pass
            else:
                obj = t.object().encode("utf-8")
            if len(obj) == 0:  # skip empty objects (incl. URIRef objects)
                continue

            self.__entity_abstracts[subj] = obj

            lines_counter += 1
            if lines_counter % 10000 == 0:
                PLOGGER.info("\t{}K lines processed".format(lines_counter // 1000))

        PLOGGER.info("\n### Loading entity abstracts... Done.")

    def __make_type_doc(self, entities, last_type):
        """Gets the document representation of a type to be indexed, from its entity short abstracts."""
        content = ABSTRACTS_SEPARATOR.join([self.__entity_abstracts.get(e, b"").decode("utf-8")
                                            for e in entities])

        if len(content) > MAX_BULKING_DOC_SIZE:
            PLOGGER.info("Type {} has content larger than allowed: {}.".format(last_type, len(content)))

            # we randomly sample a subset of Y entity abstracts, s.t.
            # Y * AVG_SHORT_ABSTRACT_LEN <= MAX_BULKING_DOC_SIZE
            amount_abstracts_to_sample = min(floor(MAX_BULKING_DOC_SIZE / AVG_SHORT_ABSTRACT_LEN), len(entities))
            entities_sample = [entities[i] for i in sample(range(len(entities)), amount_abstracts_to_sample)]
            content = ""  # reset content
            for entity in entities_sample:
                new_content_candidate = (content + ABSTRACTS_SEPARATOR +
                                         self.__entity_abstracts.get(entity, b"").decode("utf-8"))
                # we add an abstract only if by doing so it will not exceed MAX_BULKING_DOC_SIZE
                if len(new_content_candidate) <= MAX_BULKING_DOC_SIZE:
                    content = new_content_candidate
                else:
                    break

        return {CONTENT_KEY: content}

    def build_index(self, force=False):
        """Builds the index.

        :param force: if True, overwrite the index if it already exists (i.e., force its re-creation); False by default.
        :type force: bool
        """
        self.__elastic = Elastic(self.__index_name)
        self.__elastic.create_index(mappings=self.__MAPPINGS, force=force)
        prefix = URIPrefix()

        # For indexing types in bulk
        types_bulk = {}  # dict from type id to type (=doc)

        # process type2entity file
        last_type = None
        entities = []
        lines_counter = 0
        types_counter = 0
        with FileUtils.open_file_by_type(self.__type2entity_file) as f:
            for line in f:
                line = line.decode()  # o.w. line is made of bytes
                if not line.startswith("<"):  # bad-formed lines in dataset
                    continue
                subj, obj = line.rstrip().split()

                type = prefix.get_prefixed(subj)  # subject prefixed
                entity = prefix.get_prefixed(obj)

                # use only DBpedia Ontology native types (no bibo, foaf, schema, etc.)
                if not type.startswith(DBO_PREFIX):
                    continue

                if last_type is not None and type != last_type:
                    # moving to a new type, so create a doc for the previous type,
                    # with all the abstracts for its entities, and store it in the bulk
                    types_counter += 1
                    # PLOGGER.info("\n\tFound {}-th type: {}\t\t with # of entities: {}".format(types_counter,
                    #                                                                           last_type,
                    #                                                                           len(entities)))
                    types_bulk[last_type] = self.__make_type_doc(entities, last_type)
                    entities = []  # important to reset it

                    if types_counter % BULK_LEN == 0:  # index the bulk of BULK_LEN docs
                        self.__elastic.add_docs_bulk(types_bulk)
                        types_bulk.clear()  # NOTE: important to reset it
                        PLOGGER.info("\tIndexing a bulk of {} docs (types)... OK. "
                                     "{} types already indexed.".format(BULK_LEN, types_counter))

                last_type = type
                entities.append(entity)

                lines_counter += 1
                if lines_counter % 10000 == 0:
                    # PLOGGER.info("\t{}K lines processed".format(lines_counter // 1000))
                    pass

        # index the last type
        types_counter += 1
        PLOGGER.info("\n\tFound {}-th (last) type: {}\t\t with # of entities: {}".format(types_counter,
                                                                                         last_type,
                                                                                         len(entities)))
        types_bulk[last_type] = self.__make_type_doc(entities, last_type)
        self.__elastic.add_docs_bulk(types_bulk)  # a tiny bulk :)
        # no need to reset either entities or types_bulk at this point
        # PLOGGER.info("Indexing a bulk of {} docs (types)... OK.".format(BULK_LEN))

        PLOGGER.info("\n### Indexing all {} found docs (types)... Done.".format(types_counter))
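
# A hedged usage sketch for IndexerDBpediaTypes. The config keys mirror the ones
# read in __init__ ("index_name", "type2entity_file", "entity_abstracts_file",
# and the optional "model"), but the concrete index name and file paths below are
# placeholders for illustration, not values taken from the original code.
if __name__ == "__main__":
    example_config = {
        "index_name": "dbpedia_types",                                # hypothetical index name
        "type2entity_file": "data/type2entity.tsv.bz2",               # hypothetical path
        "entity_abstracts_file": "data/short_abstracts_en.ttl.bz2",   # hypothetical path
        "model": Elastic.BM25,                                        # same default as in __init__
    }
    indexer = IndexerDBpediaTypes(example_config)
    indexer.build_index(force=True)
    PLOGGER.info("Index '{}' has been built.".format(indexer.name))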