def __sample_file(self, dir, file):
    """Creates a local sample from a specific file in a given directory.

    :param dir: directory (relative to path_to_dbpedia)
    :param file: name of the file to sample from
    """
    t = Triple()
    p = NTriplesParser(t)
    infile = os.path.join(self.path_to_dbpedia, dir, file)
    outfile = os.path.join(self.output_dir, dir, file)
    print("Processing file " + file + " ...")
    i = 0
    with FileUtils.open_file_by_type(infile) as fin:
        # output file will be of the same type as the input
        fout = FileUtils.open_file_by_type(outfile, mode="w")
        for line in fin:
            try:
                p.parsestring(line.decode("utf-8"))
            except ParseError:  # skip lines that couldn't be parsed
                continue
            if t.subject() is None:  # only if parsed as a triple
                continue
            subj = self.prefix.get_prefixed(t.subject())  # prefixing subject
            if subj in self.sample_entities:
                fout.write(line)
            i += 1
            if i % 100000 == 0:
                print(str(i // 1000) + "K lines processed")
        fout.close()
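# --- Illustrative sketch (not part of the original modules) ---
# These snippets all feed an NTriplesParser with a custom sink object (Triple,
# DumpSink, PropertyGraphSink): the parser calls sink.triple(s, p, o) for every
# statement it parses. Assuming the parser is rdflib's NTriplesParser (the
# import path below matches rdflib 4.x/5.x), a minimal self-contained sink
# could look like this; the class name PrintSink and the sample statement are
# made up for illustration.
from rdflib.plugins.parsers.ntriples import NTriplesParser


class PrintSink(object):
    """Toy sink that just prints each parsed triple."""

    def triple(self, s, p, o):
        print(s, p, o)


if __name__ == "__main__":
    parser = NTriplesParser(PrintSink())
    parser.parsestring(
        '<http://dbpedia.org/resource/Oslo> '
        '<http://www.w3.org/2000/01/rdf-schema#label> "Oslo"@en .\n')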
def __load_entity_abstracts(self, filename):
    prefix = URIPrefix()
    t = Triple()
    p = NTriplesParser(t)
    lines_counter = 0
    PLOGGER.info("Loading entity abstracts from {}".format(filename))
    for line in FileUtils.read_file_as_list(filename):
        # basic line parsing
        line = line.decode("utf-8") if isinstance(line, bytes) else line
        try:
            p.parsestring(line)
        except ParseError:  # skip lines that couldn't be parsed
            continue
        if t.subject() is None:  # only if parsed as a triple
            continue

        # subject and object identification
        subj = prefix.get_prefixed(t.subject())
        obj = ""
        if type(t.object()) is not URIRef:  # URIRef objects are skipped below
            obj = t.object().encode("utf-8")
        if len(obj) == 0:
            continue  # skip empty objects
        self.__entity_abstracts[subj] = obj

        lines_counter += 1
        if lines_counter % 10000 == 0:
            PLOGGER.info("\t{}K lines processed".format(lines_counter // 1000))

    PLOGGER.info("\n### Loading entity abstracts... Done.")
def add_file(self, filename, reverse_triple=False, predicate_prefix=None):
    """Adds contents from an NTriples file to MongoDB.

    :param filename: NTriples file.
    :param reverse_triple: if set to True, the subject and object values are swapped.
    :param predicate_prefix: prefix to be added to predicates.
    """
    print("Processing " + filename + "...")

    t = Triple()
    p = NTriplesParser(t)
    self.__m_id = None  # document id for MongoDB -- subj
    self.__m_contents = None  # document contents for MongoDB -- pred, obj
    i = 0

    with FileUtils.open_file_by_type(filename) as f:
        for line in f:
            try:
                p.parsestring(line.decode("utf-8"))
            except ParseError:  # skip lines that couldn't be parsed
                continue
            if t.subject() is None:  # only if parsed as a triple
                continue

            # subject prefixing
            subj = self.__prefix.get_prefixed(t.subject())

            # predicate prefixing
            pred = self.__prefix.get_prefixed(t.predicate())
            if predicate_prefix is not None:
                pred = predicate_prefix + pred

            # object prefixing
            if type(t.object()) is URIRef:
                obj = self.__prefix.get_prefixed(t.object())
            else:
                obj = t.object()
            if len(obj) == 0:
                continue  # skip empty objects

            # write or append
            if reverse_triple:  # reverse subj and obj
                self._next_triple(obj, pred, subj)
            else:  # normal mode
                self._next_triple(subj, pred, obj)

            i += 1
            if i % 100000 == 0:
                print(str(i // 1000) + "K lines processed from " + filename)

    # process the last triple
    self._write_to_mongo()
def triples(self, triple_pattern, context=None):
    """Generator over the triple store

    Returns triples that match the given triple pattern. If triple pattern
    does not provide a context, all contexts will be searched.
    """
    (s, p, o) = triple_pattern
    url = self._statement_encode((s, p, o), context)
    req = Request(url)
    # N-Triples is best for a generator (one line per triple)
    req.add_header('Accept', 'text/plain')
    log.debug("Request: %s" % req.get_full_url())
    dumper = DumpSink()
    parser = NTriplesParser(dumper)
    for l in urlopen(req):
        log.debug('line: %s' % l)
        parser.parsestring(l)
        yield dumper.get_triple()
def transform_part(
    input_path,
    global_id_marker,
    part_name,
    left,
    right,
    prefixer=None,
):
    print(f'starting {part_name}: {left} -- {right}')
    with open(input_path, 'rb') as in_file:
        in_file.seek(left)
        part_bytes = in_file.read(right - left)
        part_str = part_bytes.decode('utf8')  # wasteful

    with PropertyGraphSink(global_id_marker, part_name, prefixer) as sink:
        ntp = NTriplesParser(sink=sink)
        ntp.parsestring(part_str)

    triple_count = sum(sink.predicate_count.values())
    print(f'finished {part_name}: {triple_count} triples')
    return part_name, dict(sink.predicate_count)
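# --- Illustrative sketch (not part of the original module) ---
# transform_part expects (left, right) byte offsets that cover complete
# N-Triples lines. One way to obtain such ranges is to cut the file into
# roughly equal parts and push each cut forward to the next newline. The
# helper name and the example call below are assumptions, not the project's
# actual API.
import os


def newline_aligned_ranges(input_path, n_parts):
    """Splits [0, file_size) into n_parts ranges that end on line boundaries."""
    size = os.path.getsize(input_path)
    cuts = [0]
    with open(input_path, 'rb') as f:
        for i in range(1, n_parts):
            f.seek(i * size // n_parts)
            f.readline()  # advance to the end of the current line
            cuts.append(f.tell())
    cuts.append(size)
    return [(l, r) for l, r in zip(cuts[:-1], cuts[1:]) if r > l]


# e.g.: for i, (l, r) in enumerate(newline_aligned_ranges('dump.nt', 4)):
#           transform_part('dump.nt', 'http://example.org/', f'part-{i}', l, r)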
def parse_file(self, filename, triplehandler):
    """Parses a file and calls the callback function with each parsed triple."""
    PLOGGER.info("Processing " + filename + "...")

    prefix = URIPrefix()
    t = Triple(prefix)
    p = NTriplesParser(t)

    i = 0
    with open(filename) as f:
        for line in f:
            p.parsestring(line)
            if t.subject() is None:  # only if parsed as a triple
                continue

            # call the handler object with the parsed triple
            triplehandler.triple_parsed(t)

            i += 1
            if i % 10000 == 0:
                PLOGGER.info(str(i // 1000) + "K lines processed")
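# --- Illustrative sketch (not part of the original module) ---
# parse_file above calls triplehandler.triple_parsed(t) with the shared Triple
# instance for every parsed line, so any object with that method can serve as
# a handler. The class below is a hypothetical example that just counts
# triples.
class CountingTripleHandler(object):

    def __init__(self):
        self.num_triples = 0

    def triple_parsed(self, t):
        # t exposes subject(), predicate() and object(), as used in the
        # snippets above
        self.num_triples += 1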
def read_fb2dbp_file(self, is_39=False):
    """Reads the file and generates an initial mapping of Freebase to DBpedia IDs.

    Only proper DBpedia entities are considered; i.e., redirect and
    disambiguation pages are ignored.
    """
    fb2dbp_file = self.__fb2dbp_file_39 if is_39 else self.__fb2dbp_file
    print("Processing " + fb2dbp_file + "...")

    t = Triple()
    p = NTriplesParser(t)
    i = 0
    fb2dbp_mapping = defaultdict(set)
    with FileUtils.open_file_by_type(fb2dbp_file) as f:
        for line in f:
            try:
                p.parsestring(line.decode("utf-8"))
            except ParseError:  # skip lines that couldn't be parsed
                continue
            if t.subject() is None:  # only if parsed as a triple
                continue

            # prefixing
            dbp_id = self.__prefix.get_prefixed(t.subject())
            fb_id = self.__prefix.get_prefixed(t.object())

            # if reading the 3.9 file, converts the ID to the 2015-10 version
            if is_39:
                dbp_id = EntityUtils.convert_39_to_201510(dbp_id)
                fb2dbp_mapping[fb_id].add(dbp_id)
            # if reading the 2015-10 file, keeps only proper DBpedia entities
            else:
                entity_utils = EntityUtils(self.__mongo_dbpedia.find_by_id(dbp_id))
                if entity_utils.is_entity():
                    fb2dbp_mapping[fb_id].add(dbp_id)

            i += 1
            if i % 1000 == 0:
                print(str(i // 1000) + "K lines processed")

    return fb2dbp_mapping
def triples(self, (s, p, o), context=None):
    """Generator over the triple store

    Returns triples that match the given triple pattern. If triple pattern
    does not provide a context, all contexts will be searched.
    """
    url = self._statement_encode((s, p, o), context)
    req = Request(url)
    # N-Triples is best for a generator (one line per triple)
    req.add_header('Accept', 'text/plain')
    log.debug("Request: %s" % req.get_full_url())
    dumper = DumpSink()
    parser = NTriplesParser(dumper)
    for l in urlopen(req):
        log.debug('line: %s' % l)
        parser.parsestring(l)
        yield dumper.get_triple()

def __len__(self):
    """Returns the number of triples in the graph.

    Calls http://{self.url}/size (very fast).
    """
    return int(urlopen(self.url + "/size").read())

def set(self, (subject, predicate, object)):
    """Convenience method to update the value of object.

    Remove any existing triples for subject and predicate before adding
    (subject, predicate, object).
    """
    self.remove((subject, predicate, None))
class IndexerDBpediaTypes(object):
    __DOC_TYPE = "doc"  # we don't make use of types
    __MAPPINGS = {
        "id": Elastic.notanalyzed_field(),
        "content": Elastic.analyzed_field(),
    }

    def __init__(self, config):
        self.__elastic = None
        self.__config = config
        self.__index_name = config["index_name"]
        self.__dbpedia_path = config["dbpedia_files_path"]
        # For triple parsing
        self.__prefix = URIPrefix()
        self.__triple = Triple()
        self.__ntparser = NTriplesParser(self.__triple)
        # Entity abstract and type assignments kept in memory
        self.__entity_abstracts = {}
        self.__load_entity_abstracts()
        self.__types_entities = defaultdict(list)
        self.__load_entity_types()

    @property
    def name(self):
        return self.__index_name

    def __parse_line(self, line):
        """Parses a line from a ttl file and returns a subject and object pair.

        It is used for parsing DBpedia abstracts and entity types. The subject
        is always prefixed. Object URIs are returned prefixed if they are from
        DBpedia, otherwise None (i.e., types); literal objects are always
        returned (i.e., abstracts).
        """
        line = line.decode("utf-8") if isinstance(line, bytes) else line
        try:
            self.__ntparser.parsestring(line)
        except ParseError:  # skip lines that couldn't be parsed
            return None, None
        if self.__triple.subject() is None:  # only if parsed as a triple
            return None, None

        subj = self.__prefix.get_prefixed(self.__triple.subject())
        obj = None
        if type(self.__triple.object()) is URIRef:
            if self.__triple.object().startswith("http://dbpedia.org/ontology"):
                obj = self.__prefix.get_prefixed(self.__triple.object())
        else:
            obj = self.__triple.object().encode("utf-8")

        return subj, obj

    def __load_entity_abstracts(self):
        num_lines = 0
        filename = os.sep.join([self.__dbpedia_path, ENTITY_ABSTRACTS_FILE])
        PLOGGER.info("Loading entity abstracts from {}".format(filename))
        for line in FileUtils.read_file_as_list(filename):
            entity, abstract = self.__parse_line(line)
            if abstract and len(abstract) > 0:  # skip empty objects
                self.__entity_abstracts[entity] = abstract

            num_lines += 1
            if num_lines % 10000 == 0:
                PLOGGER.info("  {}K lines processed".format(num_lines // 1000))

        PLOGGER.info("  Done.")

    def __load_entity_types(self):
        num_lines = 0
        for types_file in ENTITY_TYPES_FILES:
            filename = os.sep.join([self.__dbpedia_path, types_file])
            PLOGGER.info("Loading entity types from {}".format(filename))
            for line in FileUtils.read_file_as_list(filename):
                entity, entity_type = self.__parse_line(line)
                if type(entity_type) != str:  # Likely result of parsing error
                    continue
                if not entity_type.startswith("<dbo:"):
                    PLOGGER.info("  Non-DBpedia type: {}".format(entity_type))
                    continue
                if not entity.startswith("<dbpedia:"):
                    PLOGGER.info("  Invalid entity: {}".format(entity))
                    continue
                self.__types_entities[entity_type].append(entity)

                num_lines += 1
                if num_lines % 10000 == 0:
                    PLOGGER.info("  {}K lines processed".format(num_lines // 1000))
        PLOGGER.info("  Done.")

    def __make_type_doc(self, type_name):
        """Gets the document representation of a type to be indexed, from its
        entity short abstracts."""
        content = "\n".join([
            self.__entity_abstracts.get(e, b"").decode("utf-8")
            for e in self.__types_entities[type_name]
        ])

        if len(content) > MAX_BULKING_DOC_SIZE:
            PLOGGER.info("Type {} has content larger than allowed: {}.".format(
                type_name, len(content)))

            # we randomly sample a subset of Y entity abstracts, s.t.
            # Y * AVG_SHORT_ABSTRACT_LEN <= MAX_BULKING_DOC_SIZE
            num_entities = len(self.__types_entities[type_name])
            amount_abstracts_to_sample = min(
                floor(MAX_BULKING_DOC_SIZE / AVG_SHORT_ABSTRACT_LEN), num_entities)
            entities_sample = [
                self.__types_entities[type_name][i]
                for i in sample(range(num_entities), amount_abstracts_to_sample)
            ]
            content = ""  # reset content
            for entity in entities_sample:
                new_content_candidate = "\n".join([
                    content,
                    self.__entity_abstracts.get(entity, b"").decode("utf-8")
                ])
                # we add an abstract only if by doing so it will not exceed
                # MAX_BULKING_DOC_SIZE
                if len(new_content_candidate) > MAX_BULKING_DOC_SIZE:
                    break
                content = new_content_candidate

        return {"content": content}

    def build_index(self, force=False):
        """Builds the index.

        Note: since DBpedia only has a few hundred types, no bulk indexing is
        needed.

        :param force: True iff it is required to overwrite the index (i.e., by
            creating it by force); False by default.
        :type force: bool
        :return:
        """
        PLOGGER.info("Building type index {}".format(self.__index_name))
        self.__elastic = Elastic(self.__index_name)
        self.__elastic.create_index(mappings=self.__MAPPINGS, force=force)

        for type_name in self.__types_entities:
            PLOGGER.info("  Adding {} ...".format(type_name))
            contents = self.__make_type_doc(type_name)
            self.__elastic.add_doc(type_name, contents)

        PLOGGER.info("  Done.")
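# --- Illustrative sketch (not part of the original module) ---
# Hypothetical usage of IndexerDBpediaTypes: the config keys match __init__
# above, but the index name and dump path are made-up placeholder values.
if __name__ == "__main__":
    config = {
        "index_name": "dbpedia_types",          # assumed index name
        "dbpedia_files_path": "/data/dbpedia",  # assumed local dump location
    }
    indexer = IndexerDBpediaTypes(config)
    indexer.build_index(force=True)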
def triples(self, (s, p, o), context=None):
    """Generator over the triple store

    Returns triples that match the given triple pattern. If triple pattern
    does not provide a context, all contexts will be searched.
    """
    ctx = context or self.context
    url = self._statement_encode((s, p, o), ctx)
    req = Request(url)
    # N-Triples is best for a generator (one line per triple)
    req.add_header('Accept', 'text/plain')
    log.debug("Request: %s" % req.get_full_url())
    dumper = DumpSink()
    parser = NTriplesParser(dumper)
    for l in urlopen(req):
        # log.debug('line: %s' % l)
        parser.parsestring(l)
        yield dumper.get_triple()

def __len__(self):
    """Returns the number of triples in the graph.

    Calls http://{self.url}/size (very fast).
    """
    return int(urlopen(self.url + "/size").read())

def set(self, (subject, predicate, object)):
    """Convenience method to update the value of object.

    Remove any existing triples for subject and predicate before adding
    (subject, predicate, object).
    """
    self.remove((subject, predicate, None))