def get_rdf_reader(file_path, format='nt'):
    """Build a triple iterator, plus a triple count, for an RDF file.

    The shape of the returned tuple depends on ``format``:

    * ``'nt'``  -> ``(iterator, total, file_handle)``. The file is opened
      here in text mode and handed back still open (presumably so the
      caller can close it once the iterator is exhausted -- confirm
      against callers).
    * ``'ttl'`` -> ``(iterator, nb_triples)``; the file is parsed into a
      ``Graph`` and every triple is passed through ``__n3_to_str``.
    * ``'hdt'`` -> ``(iterator, nb_triples)`` as returned by
      ``HDTDocument.search_triples_bytes("", "", "")``.
    * anything else -> ``(None, 0)``.

    :param file_path: path of the RDF file to read.
    :param format: one of ``'nt'`` (default), ``'ttl'`` or ``'hdt'``.
    :return: a 2- or 3-tuple as described above.
    """
    if format == 'nt':
        print('Counting triples using the wc command...')
        total = wccount(file_path)
        print('The file contains {} triples.'.format(total))
        # Deliberately not a `with` block: the open handle is part of the
        # return value for this format.
        handle = open(file_path, 'r')
        triple_stream = yield_triples(handle)
        return triple_stream, total, handle

    if format == 'ttl':
        # load using rdflib
        graph = Graph()
        graph.parse(file_path, format=format)
        nb_triples = len(graph)
        every_triple = graph.triples((None, None, None))
        return map(__n3_to_str, every_triple), nb_triples

    if format == 'hdt':
        # load HDTDocument without additional indexes
        # (not needed since we do a ?s ?p ?o)
        document = HDTDocument(file_path, True, True)
        iterator, nb_triples = document.search_triples_bytes("", "", "")
        return iterator, nb_triples

    # Unrecognised format: no iterator and a zero count.
    return None, 0
import random
from tarjan import tarjan
from collections import Counter

# Location of the HDT file opened below.
PATH_LOD = "/scratch/wbeek/data/LOD-a-lot/data.hdt"
hdt_file = HDTDocument(PATH_LOD)

# Predicate / class IRIs. Only `type` is used in the code below.
subclass = "http://www.w3.org/2000/01/rdf-schema#subClassOf"
rdfsClass = "http://www.w3.org/2000/01/rdf-schema#Class"
owlClass = "http://www.w3.org/2002/07/owl#Class"
eqClass = "http://www.w3.org/2002/07/owl#equivalentClass"
type = "http://www.w3.org/1999/02/22-rdf-syntax-ns#type"

# Every triple whose predicate is rdf:type, together with how many there are.
triples, cardi1 = hdt_file.search_triples_bytes("", type, "")
print('there are in total ', cardi1, ' triples')

# Tally how often each object of an rdf:type triple occurs.
count = 0
ct = Counter()
for count, (_, _, t) in enumerate(triples, start=1):
    # Progress report once per million triples.
    if count % 1000000 == 0:
        print(count, ', processed. That makes ', count / cardi1)
    # Undecodable bytes are dropped rather than aborting the scan.
    t = t.decode('UTF-8', errors='ignore')
    ct[t] += 1

print(ct.most_common(100))