コード例 #1
0
def get_rdf_reader(file_path, format='nt'):
    """Get an iterator over the RDF triples stored in a file.

    Args:
        file_path: Path of the RDF file to read.
        format: Serialization of the file: 'ttl', 'nt' (default) or 'hdt'.

    Returns:
        The shape of the result depends on ``format`` (kept as-is for
        backward compatibility with existing callers):

        * 'ttl' and 'hdt': a 2-tuple ``(iterator, nb_triples)``.
        * 'nt': a 3-tuple ``(iterator, nb_triples, file_handle)``. The file
          is left open because the iterator reads from it; the caller is
          responsible for closing ``file_handle`` when done.
        * any other value: ``(None, 0)``.
    """
    if format == 'ttl':
        # load using rdflib
        g = Graph()
        g.parse(file_path, format=format)
        nb_triples = len(g)
        # Each triple is converted with __n3_to_str lazily, as it is consumed.
        return map(__n3_to_str, g.triples((None, None, None))), nb_triples

    if format == 'nt':
        print('Counting triples using the wc command...')
        total = wccount(file_path)
        print('The file contains {} triples.'.format(total))
        # N-Triples documents are UTF-8 encoded by specification, so do not
        # depend on the platform's locale default encoding.
        f = open(file_path, 'r', encoding='utf-8')
        # The handle is returned alongside the iterator so the caller can
        # close it once iteration is finished.
        return yield_triples(f), total, f

    if format == 'hdt':
        # load HDTDocument without additional indexes
        # (not needed since we do a ?s ?p ?o)
        doc = HDTDocument(file_path, True, True)
        iterator, nb_triples = doc.search_triples_bytes("", "", "")
        return iterator, nb_triples

    # Unrecognized format: same "nothing to read" result as before.
    return None, 0
コード例 #2
0
import random
from tarjan import tarjan
from collections import Counter



# Hard-coded location of the HDT file that is opened at import time.
PATH_LOD = "/scratch/wbeek/data/LOD-a-lot/data.hdt"
# NOTE(review): HDTDocument is not imported in the visible part of this file;
# confirm the import exists elsewhere.
hdt_file = HDTDocument(PATH_LOD)

# Full IRIs of the RDFS / OWL / RDF terms used as query patterns.
subclass = "http://www.w3.org/2000/01/rdf-schema#subClassOf"
rdfsClass = "http://www.w3.org/2000/01/rdf-schema#Class"
owlClass = "http://www.w3.org/2002/07/owl#Class"
eqClass = "http://www.w3.org/2002/07/owl#equivalentClass"
# NOTE(review): this module-level name shadows the builtin `type()` for the
# rest of the module.
type = "http://www.w3.org/1999/02/22-rdf-syntax-ns#type"

# Query every triple whose predicate is the IRI held in `type`; the second
# element of the result is reported below as the total number of triples.
triples, cardi1 = hdt_file.search_triples_bytes("", type, "")
print('there are in total ', cardi1, ' triples')

# Tally how many times each object term occurs.
count = 0
ct = Counter()
for count, (_, _, t) in enumerate(triples, start=1):
    # Progress report every million triples, with the fraction done so far.
    if count % 1000000 == 0:
        print(count, ', processed. That makes ', count / cardi1)
    # Decode as UTF-8 and drop any undecodable bytes; for valid input this is
    # the same as a strict decode.
    t = t.decode('UTF-8', errors='ignore')
    ct[t] += 1

# Show the 100 most frequent object terms with their counts.
print(ct.most_common(100))