def __load_entity_abstracts(self, filename):
    prefix = URIPrefix()
    t = Triple()
    p = NTriplesParser(t)
    lines_counter = 0
    PLOGGER.info("Loading entity abstracts from {}".format(filename))
    for line in FileUtils.read_file_as_list(filename):
        # basic line parsing
        line = line.decode("utf-8") if isinstance(line, bytes) else line
        try:
            p.parsestring(line)
        except ParseError:  # skip lines that couldn't be parsed
            continue
        if t.subject() is None:  # only if parsed as a triple
            continue

        # Subject and object identification
        subj = prefix.get_prefixed(t.subject())
        obj = ""
        if type(t.object()) is URIRef:
            # PLOGGER.error("Error: it is URIRef the parsed obj")
            pass
        else:
            obj = t.object().encode("utf-8")
        if len(obj) == 0:
            continue  # skip empty objects

        self.__entity_abstracts[subj] = obj

        lines_counter += 1
        if lines_counter % 10000 == 0:
            PLOGGER.info("\t{}K lines processed".format(lines_counter // 1000))

    PLOGGER.info("\n### Loading entity abstracts... Done.")
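
# Note: Triple, URIPrefix, FileUtils and PLOGGER above are project-specific helpers that
# are not shown in these snippets. Based on how they are used (rdflib's NTriplesParser
# calls sink.triple(s, p, o) for every parsed statement, and the surrounding code then
# reads t.subject() / t.predicate() / t.object()), a minimal sink could look like the
# illustrative sketch below; this is an assumption, not the original implementation.
class TripleSinkSketch:
    """Keeps only the most recently parsed triple."""

    def __init__(self):
        self._s = self._p = self._o = None

    def triple(self, s, p, o):
        # invoked by NTriplesParser for each successfully parsed statement
        self._s, self._p, self._o = s, p, o

    def subject(self):
        return self._s

    def predicate(self):
        return self._p

    def object(self):
        return self._o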
def __sample_file(self, dir, file):
    """Creates a local sample of a specific file in a given directory.

    :param dir: directory (relative to path_to_dbpedia)
    :param file: file name
    """
    t = Triple()
    p = NTriplesParser(t)
    infile = os.path.join(self.path_to_dbpedia, dir, file)
    outfile = os.path.join(self.output_dir, dir, file)
    print("Processing file " + file + " ...")
    i = 0
    with FileUtils.open_file_by_type(infile) as fin:
        fout = FileUtils.open_file_by_type(outfile, mode="w")  # output file will be of the same type as the input
        for line in fin:
            try:
                p.parsestring(line.decode("utf-8"))
            except ParseError:  # skip lines that couldn't be parsed
                continue
            if t.subject() is None:  # only if parsed as a triple
                continue
            subj = self.prefix.get_prefixed(t.subject())  # prefixing subject
            if subj in self.sample_entities:
                fout.write(line)
            i += 1
            if i % 100000 == 0:
                print(str(i // 1000) + "K lines processed")
        fout.close()
def parse(self, filename: str = None, input_format: str = None,
          provided_by: str = None, predicates: Set[URIRef] = None) -> None:
    """
    Parse an N-Triples file into a networkx.MultiDiGraph

    The file must be a *.nt formatted file.

    Parameters
    ----------
    filename : str
        File to read from.
    input_format : str
        The input file format. Must be one of ``['nt', 'nt.gz']``
    provided_by : str
        Define the source providing the input file.

    """
    p = NTriplesParser(self)
    self.start = current_time_in_millis()
    if input_format == INPUT_FORMATS[0]:
        p.parse(open(filename, 'rb'))
    elif input_format == INPUT_FORMATS[1]:
        p.parse(gzip.open(filename, 'rb'))
    else:
        raise NameError(f"input_format: {input_format} not supported. Must be one of {INPUT_FORMATS}")
    print("Done parsing NT file")
    self.dereify(self.assocs)
def add_file(self, filename, reverse_triple=False, predicate_prefix=None):
    """Adds contents from an NTriples file to MongoDB.

    :param filename: NTriples file.
    :param reverse_triple: if set to True, the subject and object values are swapped.
    :param predicate_prefix: prefix to be added to predicates.
    """
    print("Processing " + filename + "...")

    t = Triple()
    p = NTriplesParser(t)
    self.__m_id = None  # document id for MongoDB -- subj
    self.__m_contents = None  # document contents for MongoDB -- pred, obj
    i = 0

    with FileUtils.open_file_by_type(filename) as f:
        for line in f:
            try:
                p.parsestring(line.decode("utf-8"))
            except ParseError:  # skip lines that couldn't be parsed
                continue
            if t.subject() is None:  # only if parsed as a triple
                continue

            # subject prefixing
            subj = self.__prefix.get_prefixed(t.subject())

            # predicate prefixing
            pred = self.__prefix.get_prefixed(t.predicate())
            if predicate_prefix is not None:
                pred = predicate_prefix + pred

            # object prefixing
            if type(t.object()) is URIRef:
                obj = self.__prefix.get_prefixed(t.object())
            else:
                obj = t.object()
                if len(obj) == 0:
                    continue  # skip empty objects

            # write or append
            if reverse_triple:  # reverse subj and obj
                self._next_triple(obj, pred, subj)
            else:  # normal mode
                self._next_triple(subj, pred, obj)

            i += 1
            if i % 100000 == 0:
                print(str(i // 1000) + "K lines processed from " + filename)

    # process last triple
    self._write_to_mongo()
def axioms(filename):
    stream = Stream()
    parser = NTriplesParser(stream)
    with open(filename, "rb") as data:
        parser.parse(data)
    graph = stream.graph()

    properties = set()
    for triple in graph:
        properties.add(triple[1])
    print(properties)

    dic_fun = functionality(graph, properties)
    print('1')
    dic_ref = reflexivity(graph, properties)
    print('2')
    dic_sym = symmetry(graph, properties)
    print('3')
    dic_inv = inverse(graph, properties)
    print('4')
    dic_tra = transivity(graph, properties)
    print('5')

    csvname = filename + '.csv'
    out = open(csvname, 'a', newline='')
    csv_writer = csv.writer(out, dialect='excel')
    for pi in properties:
        l1 = [pi]
        if dic_fun[pi] > 0:
            l1.append('functionality')
            l1.append(dic_fun[pi])
        if dic_ref[pi] == 1:
            l1.append('reflexivity')
        if dic_sym[pi] == 1:
            l1.append('symmetry')
        if len(dic_inv[pi]) != 0:
            l1.append('inverse')
        if dic_tra[pi] == 1:
            l1.append('transivity')
        print(l1)
        csv_writer.writerow(l1)
    out.close()
    print('over')
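
# Stream above is another project-specific sink that is not included in these snippets.
# From the way it is used here and below (parser.parse(data) feeds it; stream.graph()
# then yields an iterable of (s, p, o) tuples), a minimal stand-in could look like this
# sketch; it is an assumption about the interface, not the original class.
class StreamSketch:
    """Accumulates every parsed triple into an in-memory set."""

    def __init__(self):
        self._triples = set()

    def triple(self, s, p, o):
        # called by rdflib's NTriplesParser once per statement
        self._triples.add((s, p, o))

    def graph(self):
        return self._triples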
def __init__(self, config):
    self.__elastic = None
    self.__config = config
    self.__index_name = config["index_name"]
    self.__dbpedia_path = config["dbpedia_files_path"]

    # For triple parsing
    self.__prefix = URIPrefix()
    self.__triple = Triple()
    self.__ntparser = NTriplesParser(self.__triple)

    # Entity abstract and type assignments kept in memory
    self.__entity_abstracts = {}
    self.__load_entity_abstracts()
    self.__types_entities = defaultdict(list)
    self.__load_entity_types()
def parse(self, source, sink, **kwargs):
    '''
    Parse the NT format

    :type source: `rdflib.parser.InputSource`
    :param source: the source of NT-formatted data
    :type sink: `rdflib.graph.Graph`
    :param sink: where to send parsed triples
    :param kwargs: Additional arguments to pass to `.NTriplesParser.parse`
    '''
    f = source.getByteStream()  # TODO getCharacterStream?
    parser = NTriplesParser(NTSink(sink))
    parser.parse(f, **kwargs)
    f.close()
def process_file(infile, sink):
    bad_lines = defaultdict(int)
    for line in infile:
        s = BytesIO()
        s.write(line)
        s.seek(0)
        parser = NTriplesParser(sink)
        try:
            parser.parse(s)
        except (ParseError, ElementStrError) as e:
            bad_lines[line] += 1
    print('read {} lines from {}'.format(sink.nlines, infile.name))
    print('bad lines and their frequencies:')
    for line, count in bad_lines.items():
        print('  {:>10} : {}'.format(count, line))
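
# process_file above accepts any sink object; the final print suggests it at least keeps
# an nlines counter. A minimal counting sink that would satisfy that usage is sketched
# below (illustrative only; the original sink class is not shown, and nlines here counts
# parsed statements, which for N-Triples is one per line).
class CountingSink:
    def __init__(self):
        self.nlines = 0

    def triple(self, s, p, o):
        # one callback per successfully parsed N-Triples statement
        self.nlines += 1

# Hypothetical usage; the file must be opened in binary mode because each line is fed
# into a BytesIO buffer:
#   with open('data.nt', 'rb') as infile:
#       process_file(infile, CountingSink())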
def load(cls, filepath):
    """Return array of FAST dict. Main method."""
    if zipfile.is_zipfile(filepath):
        with ZipFile(filepath) as zf:
            nt_filename = next((n for n in zf.namelist() if n.endswith('.nt')))
            # defaults to equivalent of 'rb'
            nt_file = zf.open(nt_filename)
    else:
        nt_file = open(filepath, 'rb')
    instance = cls()
    parser = NTriplesParser(instance)
    parser.parse(nt_file)
    nt_file.close()
    return instance.terms
def triples(self, triple_pattern, context=None):
    """Generator over the triple store

    Returns triples that match the given triple pattern. If the triple pattern
    does not provide a context, all contexts will be searched.
    """
    (s, p, o) = triple_pattern
    url = self._statement_encode((s, p, o), context)
    req = Request(url)
    req.add_header('Accept', 'text/plain')  # N-Triples is best for generator (one line per triple)
    log.debug("Request: %s" % req.get_full_url())
    dumper = DumpSink()
    parser = NTriplesParser(dumper)
    for l in urlopen(req):
        log.debug('line: %s' % l)
        parser.parsestring(l)
        yield dumper.get_triple()
def transform_part(
    input_path,
    global_id_marker,
    part_name,
    left,
    right,
    prefixer=None,
):
    print(f'starting {part_name}: {left} -- {right}')
    with open(input_path, 'rb') as in_file:
        in_file.seek(left)
        part_bytes = in_file.read(right - left)
        part_str = part_bytes.decode('utf8')  # wasteful

    with PropertyGraphSink(global_id_marker, part_name, prefixer) as sink:
        ntp = NTriplesParser(sink=sink)
        ntp.parsestring(part_str)

    triple_count = sum(sink.predicate_count.values())
    print(f'finished {part_name}: {triple_count} triples')
    return part_name, dict(sink.predicate_count)
def parse_file(self, filename, triplehandler):
    """Parses the file and calls the callback function with each parsed triple."""
    PLOGGER.info("Processing " + filename + "...")

    prefix = URIPrefix()
    t = Triple(prefix)
    p = NTriplesParser(t)
    i = 0

    with open(filename) as f:
        for line in f:
            p.parsestring(line)
            if t.subject() is None:  # only if parsed as a triple
                continue

            # call the handler object with the parsed triple
            triplehandler.triple_parsed(t)

            i += 1
            if i % 10000 == 0:
                PLOGGER.info(str(i // 1000) + "K lines processed")
def read_fb2dbp_file(self, is_39=False):
    """Reads the file and generates an initial mapping of Freebase to DBpedia IDs.

    Only proper DBpedia entities are considered; i.e. redirect and disambiguation
    pages are ignored.
    """
    fb2dbp_file = self.__fb2dbp_file_39 if is_39 else self.__fb2dbp_file
    print("Processing " + fb2dbp_file + "...")

    t = Triple()
    p = NTriplesParser(t)
    i = 0
    fb2dbp_mapping = defaultdict(set)

    with FileUtils.open_file_by_type(fb2dbp_file) as f:
        for line in f:
            try:
                p.parsestring(line.decode("utf-8"))
            except ParseError:  # skip lines that couldn't be parsed
                continue
            if t.subject() is None:  # only if parsed as a triple
                continue

            # prefixing
            dbp_id = self.__prefix.get_prefixed(t.subject())
            fb_id = self.__prefix.get_prefixed(t.object())

            # if reading 3.9 file, converts ID to 2015-10 version
            if is_39:
                dbp_id = EntityUtils.convert_39_to_201510(dbp_id)
                fb2dbp_mapping[fb_id].add(dbp_id)
            # if reading 2015-10 file, keeps only the proper DBpedia entities
            else:
                entity_utils = EntityUtils(self.__mongo_dbpedia.find_by_id(dbp_id))
                if entity_utils.is_entity():
                    fb2dbp_mapping[fb_id].add(dbp_id)

            i += 1
            if i % 1000 == 0:
                print(str(i // 1000) + "K lines are processed!")

    return fb2dbp_mapping
def classPart(filename):
    stream = Stream()
    parser = NTriplesParser(stream)
    with open(filename, "rb") as data:
        parser.parse(data)
    graph = stream.graph()  # graph (set) is the dataset <s1,p1,o1> <s2,p2,o2>...
    print('success load')

    classes = {}
    c = set()
    p = set()
    for triple in graph:
        c.add(triple[2])
        p.add(triple[0])
        if triple[2] not in classes:
            classes[triple[2]] = set()
        classes[triple[2]].add(triple[0])

    print('the number of classes: ', end='')
    print(len(c))
    print('the number of instances: ', end='')
    print(len(p))
    return classes
def _parse(s):
    n = NTriplesParser()
    n.line = s
    return n.object()
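
# _parse above leans on an implementation detail of rdflib's legacy ntriples module:
# NTriplesParser keeps the text it is currently consuming in self.line, and object()
# reads a single term from it. A hedged usage sketch (newer rdflib releases expose this
# class as W3CNTriplesParser, so the import may need adjusting there):
if __name__ == '__main__':
    print(_parse('<http://example.org/thing>'))  # expected: a URIRef
    print(_parse('"hello"@en'))                  # expected: a language-tagged Literal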
        if data:
            data = set(json.loads(data))
        for w in v:
            data.add(w)
        if data:
            self.db.put(k, json.dumps(list(data)))
        logging.info('categories: {0} => {1}'.format(unquote(k), list(data)[:5]))


if __name__ == '__main__':
    labels = NTriplesParser(sink=Label('./labels'))
    categories = NTriplesParser(sink=Category('./categories', labels=labels.sink.db))

    def process_labels(line):
        labels.parsestring(line)

    for filename in [
        './labels_en.nt',
        './labels_en_uris_id.nt',
        './category_labels_en.nt',
        './category_labels_en_uris_id.nt',
    ]:
        logging.info('labels: processing: {0}'.format(filename))
        Parallel(n_jobs=N_JOBS)(delayed(process_labels)(line) for line in open(filename))

    def process_categories(line):
    try:
        c.execute("INSERT INTO entities (entity) VALUES (?)", [(s)])
        s_id = c.lastrowid
    except sqlite3.IntegrityError:
        c.execute("SELECT id FROM entities WHERE entity = ?", [(s)])
        s_id = c.fetchone()[0]
    try:
        c.execute("INSERT INTO properties (property) VALUES (?)", [(p)])
        p_id = c.lastrowid
    except sqlite3.IntegrityError:
        c.execute("SELECT id FROM properties WHERE property = ?", [(p)])
        p_id = c.fetchone()[0]
    try:
        c.execute("INSERT INTO entities (entity) VALUES (?)", [(o)])
        o_id = c.lastrowid
    except sqlite3.IntegrityError:
        c.execute("SELECT id FROM entities WHERE entity = ?", [(o)])
        o_id = c.fetchone()[0]
    # print("{} {} {}".format(s_id, p_id, o_id))


if __name__ == "__main__":
    # Create a new parser and try to parse the NT file.
    sk = StreamSink()
    n = NTriplesParser(sk)
    with open(sys.argv[1], "r") as anons:
        n.parse(anons)
    conn.commit()
    conn.close()
    print("triples = {}".format(sk.length))
        dic_dis[triple[2]].add(triple[0])
    return dic_dis


# input the filepath
datasets = Filelist('E:/python/ttldata')
print(datasets)
dic1 = {}
for filename in datasets:
    stream = Stream()
    parser = NTriplesParser(stream)
    with open(filename, "rb") as data:
        parser.parse(data)
    graph = stream.graph()
    ChoiceType(graph, dic1)
print(dic1)

for i in dic1:
    for j in dic1:
        if i == j:
            continue
        if dic1[i] & dic1[j]:
            print(i, ' and ', j, ' aren\'t disjunction')
        else:
        if not self.labels:
            raise Exception('Labels not set.')

    def triple(self, s, p, o):
        k = o.encode('utf-8')
        s = s.encode('utf-8')
        for v in self.labels.get(s, default=set()):
            self.store[k].add(v)
        logging.info('categories: {0} => {1}'.format(unquote(k), list(self.store.get(k))[:5]))


if __name__ == '__main__':
    labels = NTriplesParser(sink=Label())

    def process_labels(line):
        labels.parsestring(line)

    for filename in [
        './labels_en.nt',
        './labels_en_uris_id.nt',
        './category_labels_en.nt',
        './category_labels_en_uris_id.nt',
    ]:
        logging.info('labels: processing: {0}'.format(filename))
        Parallel(n_jobs=N_JOBS)(delayed(process_labels)(line) for line in open(filename))

    pickle.dump(labels.sink.store, open('labels.p', 'wb'))
    categories = NTriplesParser(sink=Category(labels=labels.sink.store))
def load(
        dump_file: 'url of the Freebase RDF dump',
        mid_textid_file: 'url of the part of the Freebase RDF dump containing type.object.id relations'
):
    engine = create_engine(get_db_url(), pool_recycle=3600)
    Base.metadata.create_all(engine)

    def execute_select(statement, **args):
        db = engine.connect()
        try:
            for row in db.execute(statement, **args):
                yield row
        except OperationalError:
            db.close()
            db = engine.connect()
            for row in db.execute(statement, **args):
                yield row
        finally:
            db.close()

    def execute_edit(statement, **args):
        db = engine.connect()
        try:
            return db.execute(statement, **args)
        except OperationalError:
            db.close()
            db = engine.connect()
            return db.execute(statement, **args)
        finally:
            db.close()

    @lru_cache(maxsize=4096)
    def get_topic_id_from_url(url: str) -> Optional[int]:
        input_id = url.replace('http://rdf.freebase.com/ns', '').replace('.', '/')
        if input_id.startswith('/m/') or input_id.startswith('/g/'):
            for topic in execute_select(Topic.__table__.select(Topic.mid == input_id)):
                return topic[0]
            return execute_edit(insert_query(Topic), mid=input_id).inserted_primary_key[0]
        else:
            if len(input_id) > MAX_VARCHAR_SIZE:
                return None
            for topic in execute_select(Topic.__table__.select(Topic.textid == input_id)):
                return topic[0]
            try:
                return execute_edit(insert_query(Topic), textid=input_id).inserted_primary_key[0]
            except IntegrityError as e:
                logger.error(e)
                return None

    def add_to_language_column(table, s, label, max_size):
        s_topic_id = get_topic_id_from_url(s)
        if s_topic_id is None:
            logger.warning('Not able to get mid for label subject {}'.format(s))
            return
        if len(label) >= max_size:
            logger.error('Not able to add too long label: {}'.format(label))
            return
        try:
            execute_edit(insert_query(table), topic_id=s_topic_id,
                         language=label.language, value=label.value)
        except IntegrityError:
            pass  # We do not care about duplicates

    def add_type(s, o, notable):
        s_topic_id = get_topic_id_from_url(s)
        if s_topic_id is None:
            logger.warning('Not able to get mid for type subject {}'.format(s))
            return
        o_topic_id = get_topic_id_from_url(o)
        if o_topic_id is None:
            logger.warning('Not able to get mid for type object {}'.format(o))
            return
        try:
            execute_edit(insert_query(Type), topic_id=s_topic_id,
                         type_id=o_topic_id, notable=notable)
        except IntegrityError:
            if notable:  # We add notability
                execute_edit(Type.__table__.update()
                             .where(Type.topic_id == s_topic_id)
                             .where(Type.type_id == o_topic_id)
                             .values(notable=notable))

    def add_key(s, key):
        if not is_interesting_key(key):
            return False
        s_topic_id = get_topic_id_from_url(s)
        if s_topic_id is None:
            logger.warning('Not able to get mid for key {}'.format(s))
            return
        key = decode_key(key)
        if len(key) >= MAX_VARCHAR_SIZE:
            logger.error('Not able to add too long key: {}'.format(key))
            return
        try:
            execute_edit(insert_query(Key), topic_id=s_topic_id, key=decode_key(key))
        except IntegrityError:
            pass

    def add_property_topic_id_field(field_name, s, o):
        s_topic_id = get_topic_id_from_url(s)
        if s_topic_id is None:
            logger.warning('Not able to get mid for key {}'.format(s))
            return
        o_topic_id = get_topic_id_from_url(o)
        if o_topic_id is None:
            logger.warning('Not able to get mid for key {}'.format(s))
            return
        try:
            execute_edit(insert_query(Property), topic_id=s_topic_id,
                         **{field_name: o_topic_id})
        except IntegrityError:
            execute_edit(update_query(Property)
                         .values(**{field_name: o_topic_id})
                         .where(Property.topic_id == s_topic_id))

    def add_unique(s, o):
        s_topic_id = get_topic_id_from_url(s)
        if s_topic_id is None:
            logger.warning('Not able to get mid for key {}'.format(s))
            return
        try:
            execute_edit(insert_query(Property), unique=to_bool(o), topic_id=s_topic_id)
        except IntegrityError:
            execute_edit(update_query(Property)
                         .values(unique=to_bool(o))
                         .where(Property.topic_id == s_topic_id))

    def add_edge(s, p, o):
        s_topic_id = get_topic_id_from_url(s)
        if s_topic_id is None:
            logger.warning('Not able to get mid for key {}'.format(s))
            return
        p_topic_id = get_topic_id_from_url(p)
        if p_topic_id is None:
            logger.warning('Not able to get mid for key {}'.format(p))
            return
        o_topic_id = get_topic_id_from_url(o)
        if o_topic_id is None:
            logger.warning('Not able to get mid for key {}'.format(o))
            return
        try:
            execute_edit(insert_query(Edge), subject_id=s_topic_id,
                         predicate_id=p_topic_id, object_id=o_topic_id)
        except IntegrityError:
            pass

    def to_bool(s):
        s = str(s)
        if s == 'true':
            return True
        elif s == 'false':
            return False
        else:
            raise ValueError("Unexpected value: '{}'".format(s))

    class TextIdSink:
        def triple(self, s, p, o):
            if p == type_object_id:
                s = s.replace('http://rdf.freebase.com/ns', '').replace('.', '/')
                o = o.replace('http://rdf.freebase.com/ns', '').replace('.', '/')
                try:
                    execute_edit(insert_query(Topic), mid=s, textid=o)
                except IntegrityError:
                    pass
            else:
                logger.info('Unexpected triple: {} {} {}'.format(s, p, o))

    class TripleSink:
        def __init__(self, start_cursor=0):
            self.cursor = start_cursor

        def triple(self, s, p, o):
            self.cursor += 1
            if self.cursor % 1000000 == 0:
                print(self.cursor)
                with open('progress.txt', 'wt') as pfp:
                    pfp.write(str(self.cursor))
            try:
                """
                if p == type_object_name:
                    add_to_language_column(Label, s, o, MAX_VARCHAR_SIZE)
                elif p == common_topic_description:
                    add_to_language_column(Description, s, o, sys.maxsize)
                elif p == common_topic_alias:
                    add_to_language_column(Alias, s, o, MAX_VARCHAR_SIZE)
                elif p == type_object_type:
                    add_type(s, o, False)
                elif p == common_topic_notable_types:
                    add_type(s, o, True)
                elif p == type_object_key:
                    add_key(s, o.value)
                if p == type_property_schema:
                    add_property_topic_id_field('schema_id', s, o)
                elif p == type_property_expected_type:
                    add_property_topic_id_field('expected_type_id', s, o)
                elif p == type_property_unique:
                    add_unique(s, o)
                elif p == type_property_master_property:
                    add_property_topic_id_field('master_id', s, o)
                elif p == type_property_reverse_property:
                    add_property_topic_id_field('reverse_id', s, o)
                elif p == type_property_unit:
                    add_property_topic_id_field('unit_id', s, o)
                elif p == type_property_delegated:
                    add_property_topic_id_field('delegated_id', s, o)
                """
                if isinstance(o, URIRef) and o.startswith('http://rdf.freebase.com/') and \
                        p.startswith('http://rdf.freebase.com/ns/') and \
                        not any(b in p for b in edge_blacklist):
                    add_edge(s, p, o)
            except ValueError:
                pass

    with gzip.open(mid_textid_file) as fp:
        NTriplesParser(sink=TextIdSink()).parse(fp)

    with gzip.open(dump_file) as fp:
        cursor = 0
        progress = Path('progress.txt')
        if progress.is_file():
            with progress.open('rt') as fpc:
                cursor = int(fpc.read().strip())
            logger.info('Skipping the first {} lines'.format(cursor))
            for _ in range(cursor):
                fp.readline()
        NTriplesParser(sink=TripleSink(cursor)).parse(fp)
def triples(self, (s, p, o), context=None):
    """Generator over the triple store

    Returns triples that match the given triple pattern. If triple pattern
    does not provide a context, all contexts will be searched.
    """
    ctx = context or self.context
    url = self._statement_encode((s, p, o), ctx)
    req = Request(url)
    req.add_header('Accept', 'text/plain')  # N-Triples is best for generator (one line per triple)
    log.debug("Request: %s" % req.get_full_url())
    dumper = DumpSink()
    parser = NTriplesParser(dumper)
    for l in urlopen(req):
        #log.debug('line: %s'%l)
        parser.parsestring(l)
        yield dumper.get_triple()

def __len__(self):
    """Returns the number of triples in the graph

    calls http://{self.url}/size, very fast
    """
    return int(urlopen(self.url + "/size").read())

def set(self, (subject, predicate, object)):
    """Convenience method to update the value of object
    try:
        p_id = props[p][0]
    except KeyError:
        props[p] = (i, open("matrices/" + str(i), "w+"))
        props[p][1].write("%%MatrixMarket matrix coordinate integer general\n%\nnum_ents num_ents num_nonZeros\n")
        p_id = i
        i += 1
    c.execute("SELECT id FROM entities WHERE entity = ?", [(s)])
    s_id = c.fetchone()[0]
    c.execute("SELECT id FROM entities WHERE entity = ?", [(o)])
    o_id = c.fetchone()[0]
    props[p][1].write(("{} {} 1" + "\n").format(s_id, o_id))


TSink = TensorSink()
g = NTriplesParser(TSink)
f = open("test.ttl", 'rb')
g.parse(f)
f.close()
conn.commit()

c.execute("SELECT count(*) FROM entities")
num_ents = c.fetchone()[0]

# close all writers and the database connection
for key, value in props.items():
    value[1].close()
conn.close()

# create .mtx for all properties with proper head fields
for key, value in props.items():
    id_p = str(value[0])
def parse(self, source, sink, baseURI=None):
    f = source.getByteStream()  # TODO getCharacterStream?
    parser = NTriplesParser(NTSink(sink))
    parser.parse(f)
    f.close()
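
# The parser-plugin parse() methods above are normally not called directly; rdflib
# dispatches to them when an N-Triples format is requested. A minimal usage sketch,
# assuming a standard rdflib installation:
from rdflib import Graph

g = Graph()
g.parse(data='<http://example.org/s> <http://example.org/p> "o" .\n', format='nt')
print(len(g))  # expected: 1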