def test_parse_shared_bnode_context_same_graph(self):
    """Parsing the same nquads input twice with a shared bnode_context
    must yield the same set of objects (blank nodes are reused, not
    re-minted)."""
    shared_context = dict()
    graph = ConjunctiveGraph()
    graph.parse(self.data_obnodes, format="nquads", bnode_context=shared_context)
    objects_first_pass = set(graph.objects())
    # rewind the source and parse again with the same bnode context
    self.data_obnodes.seek(0)
    graph.parse(self.data_obnodes, format="nquads", bnode_context=shared_context)
    objects_second_pass = set(graph.objects())
    self.assertEqual(objects_first_pass, objects_second_pass)
class ConstraintParser(object):
    """Builds a class-hierarchy tree (Node graph rooted at owl:Thing) from
    one or more Turtle ontology files loaded into a ConjunctiveGraph.

    ``journal`` maps each class URI to its Node so nodes are created once.
    """

    def __init__(self, input_graphs):
        """Load every path in ``input_graphs`` (Turtle) into one graph."""
        self.root = Node(OWL.Thing, [])
        self.g = ConjunctiveGraph()
        self.journal = {OWL.Thing: self.root}
        for path in input_graphs:
            print('Loading ', path)
            self.g.load(path, format='turtle')

    def get_node(self, uri):
        """Return the Node for ``uri``, creating and journaling it on first use."""
        if uri in self.journal:
            return self.journal[uri]
        else:
            logging.debug('Creating node for : ' + unicode(uri))
            new_node = Node(uri, [])
            self.journal[uri] = new_node
            return new_node

    def get_all_children(self, uri):
        """Return the set of all (transitive) descendant Nodes of ``uri``."""
        children = self.get_node(uri).children
        nodes_to_visit = children.copy()
        result = children.copy()
        childs_visited = 0
        while len(nodes_to_visit) > 0:
            current_node = nodes_to_visit.pop()
            result.add(current_node)
            # FIX: only enqueue children not already collected; previously
            # visited nodes were re-added, which loops forever on cycles.
            nodes_to_visit = nodes_to_visit.union(current_node.children - result)
            childs_visited += 1
            logging.debug("Child : " + str(childs_visited) + ' ' + unicode(current_node))
        return result

    def parse_hierarchy(self):
        """Wire parent/child Node links from rdfs:subClassOf assertions."""
        types = self.g.objects(None, RDF.type)
        all_super_classes = set()
        # first find upper classes under root
        for t in types:
            super_classes = list(self.g.objects(t, RDFS.subClassOf))
            all_super_classes = all_super_classes.union(set(super_classes))
            logging.debug(
                unicode(t) + ' has super classes: ' + ''.join(super_classes))
            if len(super_classes) == 0 or (len(super_classes) == 1
                                           and super_classes[0] == OWL.Thing):
                # super_class is root
                tmp_node = self.get_node(t)  # Node(t, [self.root])
                tmp_node.parents.add(self.root)
                self.root.children.add(tmp_node)
            else:
                tmp_node = self.get_node(t)
                # FIX: set.union() returns a new set and does not mutate;
                # the original call discarded its result, so parent links
                # were never recorded. update() mutates in place.
                tmp_node.parents.update(
                    set([self.get_node(p) for p in super_classes]))
                for p in super_classes:
                    self.get_node(p).children.add(tmp_node)
        logging.debug(all_super_classes)
def test_null_values_with_single_string():
    """Rows whose value matches the single configured null string must not
    produce subjects or objects in the generated RDF."""
    csvw = CSVW(csv_path="tests/null1.csv",
                metadata_path="tests/null1.single.csv-metadata.json")
    graph = ConjunctiveGraph()
    graph.parse(data=csvw.to_rdf(), format="turtle")

    # There should be no subject NA
    subjects = set(graph.subjects())
    assert subj_ns['null_key'] not in subjects
    assert subj_ns['1'] in subjects
    assert len(subjects) == 4

    # Null valued objects should not be created
    objects = set(graph.objects())
    for absent in (Literal('null_key', datatype=XSD.token),
                   Literal('null_sector'),
                   Literal('null_id', datatype=XSD.token)):
        assert absent not in objects
    assert Literal('PUBLIC') in objects
    assert Literal('12', datatype=XSD.token) in objects

    # Spot check some triples do not exist but other do from the same row
    null_key_lit = Literal('null_id', datatype=XSD.token)
    assert len(list(graph.triples((subj_ns['2'], id_uri, null_key_lit)))) == 0
    priv_lit = Literal('PRIVATE')
    assert len(list(graph.triples((subj_ns['2'], sect_uri, priv_lit)))) == 1
    null_sector_lit = Literal('null_sector')
    assert len(list(graph.triples((subj_ns['3'], sect_uri, null_sector_lit)))) == 0
    twelve_lit = Literal('12', datatype=XSD.token)
    assert len(list(graph.triples((subj_ns['3'], id_uri, twelve_lit)))) == 1
def verify_rdf(rdf_output):
    """Parse serialized turtle and sanity-check triple and term counts."""
    graph = ConjunctiveGraph()
    graph.parse(data=rdf_output, format="turtle")
    assert len(graph) == 6
    assert len({s for s in graph.subjects()}) == 2
    assert len({p for p in graph.predicates()}) == 3
    assert len({o for o in graph.objects()}) == 6
def test_remove_period(self):
    """Merge a patch removing a period; verify the entity is tombstoned,
    old versions behave correctly, and history provenance is emitted."""
    with open(filepath('test-patch-remove-period.json')) as f:
        patch_body = f.read()
    with self.client as client:
        # Submit the patch, then merge it.
        response = client.patch(
            '/d/',
            data=patch_body,
            content_type='application/json',
            headers={'Authorization': 'Bearer '
                     + 'NTAwNWViMTgtYmU2Yi00YWMwLWIwODQtMDQ0MzI4OWIzMzc4'})
        patch_url = urlparse(response.headers['Location']).path
        response = client.post(
            patch_url + 'merge',
            buffered=True,
            headers={'Authorization': 'Bearer '
                     + 'ZjdjNjQ1ODQtMDc1MC00Y2I2LThjODEtMjkzMmY1ZGFhYmI4'})
        self.assertEqual(response.status_code, http.client.NO_CONTENT)
        self.assertEqual(database.get_removed_entity_keys(),
                         set(['p0trgkvwbjd']))
        # Current representation is GONE; v0 never existed; v1 still OK.
        expectations = [
            ('/trgkvwbjd', http.client.GONE),
            ('/trgkvwbjd.json', http.client.GONE),
            ('/trgkvwbjd?version=0', http.client.NOT_FOUND),
            ('/trgkvwbjd.json?version=0', http.client.NOT_FOUND),
            ('/trgkvwbjd?version=1', http.client.OK),
            ('/trgkvwbjd.json?version=1', http.client.OK),
        ]
        for path, status in expectations:
            response = client.get(path,
                                  headers={'Accept': 'application/json'},
                                  follow_redirects=True)
            self.assertEqual(response.status_code, status)
        # History document: cache headers plus PROV generation links.
        response = client.get('/history.jsonld?inline-context')
        self.assertEqual(response.headers['Cache-Control'],
                         'public, max-age=0')
        self.assertEqual(response.headers['X-Accel-Expires'],
                         '{}'.format(cache.MEDIUM_TIME))
        graph = ConjunctiveGraph()
        graph.parse(format='json-ld', data=response.get_data(as_text=True))
        generated = list(graph.objects(subject=HOST['h#change-2'],
                                       predicate=PROV.generated))
        self.assertEqual(len(generated), 1)
        self.assertIn(HOST['d?version=2'], generated)
def get_mediator_details(userid):
    """Get mediator details - firstname, lastname, department, email.

    ``userid`` may be a uuid ('uuid...'), which is first resolved to an
    account id. Returns a dict with None/empty placeholders when the
    mediator's RDF file is missing.
    """
    details = {
        'userid': userid,
        'uri': None,
        'name': None,
        'fname': None,
        'lname': None,
        'title': None,
        'email': None,
        'dept': [],
    }
    if userid.startswith('uuid'):
        userid = get_mediator_account(userid)
        details['userid'] = userid
        if not userid:
            return details
    mediator_rdf = os.path.join(ag.mediatorsdir, '%s.rdf' % userid)
    if not os.path.isfile(mediator_rdf):
        return details
    graph = Graph()
    graph.parse(mediator_rdf)
    t = ''
    f = ''
    l = ''
    # Keep the last non-blank value of each name component.
    for title in graph.objects(None, namespaces['foaf']['title']):
        if title.strip():
            t = title
            details['title'] = t
    for fname in graph.objects(None, namespaces['foaf']['firstName']):
        if fname.strip():
            f = fname
            details['fname'] = fname
    for lname in graph.objects(None, namespaces['foaf']['lastName']):
        if lname.strip():
            l = lname
            details['lname'] = lname
    # Fall back to the account id when no name parts were found.
    full_name = ("%s %s %s" % (t, f, l)).strip()
    details['name'] = full_name if full_name else userid
    for email in graph.objects(None, namespaces['foaf']['mbox']):
        details['email'] = email
    for dept in graph.objects(None, namespaces['dcterms']['isPartOf']):
        details['dept'].append(dept)
    for uri in graph.subjects(namespaces['foaf']['account'], None):
        details['uri'] = uri
    return details
def get_uri_types(uri, lang):
    """Return the rdf:type URIs of ``uri`` (as strings) from the
    language-specific DBpedia SPARQL endpoint."""
    graph = ConjunctiveGraph('SPARQLStore')
    graph.open(get_dbpedia_endpoint(lang))
    types = []
    for rdf_type in graph.objects(URIRef(uri), RDF.type):
        types.append(str(rdf_type))
    return types
def get_mediator_account(user_uuid):
    """Resolve an owner uuid to its foaf:account value.

    Returns the first truthy account found, or False when none exists.
    """
    owner = URIRef("http://vocab.ox.ac.uk/owner/%s" % user_uuid)
    graph = Graph()
    graph.parse(ag.mediatorslist)
    accounts = graph.objects(owner, namespaces['foaf']['account'])
    return next((account for account in accounts if account), False)
def test_remove_definition(self):
    """Merge a patch removing a definition; verify tombstoning, versioned
    access, and the PROV invalidation/generation records in history."""
    with open(filepath('test-patch-remove-definition.json')) as f:
        patch_body = f.read()
    with self.client as client:
        # Submit the patch, then merge it.
        response = client.patch(
            '/d/',
            data=patch_body,
            content_type='application/json',
            headers={'Authorization': 'Bearer '
                     + 'NTAwNWViMTgtYmU2Yi00YWMwLWIwODQtMDQ0MzI4OWIzMzc4'})
        patch_url = urlparse(response.headers['Location']).path
        response = client.post(
            patch_url + 'merge',
            headers={'Authorization': 'Bearer '
                     + 'ZjdjNjQ1ODQtMDc1MC00Y2I2LThjODEtMjkzMmY1ZGFhYmI4'})
        self.assertEqual(response.status_code, http.client.NO_CONTENT)
        self.assertEqual(database.get_removed_entity_keys(),
                         set(['p0trgkvwbjd']))
        # Current representation is GONE; v0 never existed; v1 still OK.
        expectations = [
            ('/trgkvwbjd', http.client.GONE),
            ('/trgkvwbjd.json', http.client.GONE),
            ('/trgkvwbjd?version=0', http.client.NOT_FOUND),
            ('/trgkvwbjd.json?version=0', http.client.NOT_FOUND),
            ('/trgkvwbjd?version=1', http.client.OK),
            ('/trgkvwbjd.json?version=1', http.client.OK),
        ]
        for path, status in expectations:
            response = client.get(path,
                                  headers={'Accept': 'application/json'},
                                  follow_redirects=True)
            self.assertEqual(response.status_code, status)
        # History must record both the invalidation and the generations.
        response = client.get('/h')
        graph = ConjunctiveGraph()
        graph.parse(format='json-ld', data=response.get_data(as_text=True))
        invalidated = graph.value(subject=PERIODO['p0h#change-2'],
                                  predicate=PROV.invalidated, any=False)
        self.assertEqual(invalidated, PERIODO['p0trgkvwbjd'])
        generated = list(graph.objects(subject=PERIODO['p0h#change-2'],
                                       predicate=PROV.generated))
        self.assertEqual(len(generated), 2)
        self.assertIn(PERIODO['p0d?version=2'], generated)
        self.assertIn(PERIODO['p0trgkv?version=2'], generated)
def get_vocab_properties(vocabprefix):
    """Return uri/path and vann namespace properties for a vocabulary.

    Only the 'uri' key is present when the status file is missing.
    """
    vocab_uri = URIRef("http://vocab.ox.ac.uk/%s" % vocabprefix)
    vocabdir = os.path.join(ag.vocabulariesdir, vocabprefix)
    vocabstatusfile = os.path.join(vocabdir, "status.rdf")
    properties = {'uri': vocab_uri}
    if not os.path.isfile(vocabstatusfile):
        return properties
    properties.update({
        'path': vocabdir,
        'preferredNamespaceUri': None,
        'preferredNamespacePrefix': None,
    })
    graph = Graph()
    graph.parse(vocabstatusfile)
    # Last value wins when multiple statements exist.
    for value in graph.objects(None, namespaces['vann']['preferredNamespaceUri']):
        properties['preferredNamespaceUri'] = value
    for value in graph.objects(None, namespaces['vann']['preferredNamespacePrefix']):
        properties['preferredNamespacePrefix'] = value
    return properties
class Store:
    """Thin persistence wrapper around an n3-serialized movie/review graph."""

    def __init__(self):
        self.graph = ConjunctiveGraph()
        if os.path.exists(storefn):
            self.graph.load(storeuri, format='n3')
        # Register the prefixes used in the serialized output.
        for prefix, ns in (
                ('dc', 'http://purl.org/dc/elements/1.1/'),
                ('foaf', 'http://xmlns.com/foaf/0.1/'),
                ('imdb', 'http://www.csd.abdn.ac.uk/~ggrimnes/dev/imdb/IMDB#'),
                ('rev', 'http://purl.org/stuff/rev#')):
            self.graph.bind(prefix, ns)

    def save(self):
        """Serialize the current graph back to the store."""
        self.graph.serialize(storeuri, format='n3')

    def who(self, who=None):
        """Record the store author's name/email, or return stored name(s)."""
        if who is None:
            return self.graph.objects(URIRef(storeuri + '#author'), FOAF['name'])
        match = r_who.match(who)
        name, email = match.group(1), match.group(2)
        author = URIRef(storeuri + '#author')
        self.graph.add((URIRef(storeuri), DC['title'], Literal(title % name)))
        self.graph.add((author, RDF.type, FOAF['Person']))
        self.graph.add((author, FOAF['name'], Literal(name)))
        self.graph.add((author, FOAF['mbox'], Literal(email)))
        self.save()

    def new_movie(self, movie):
        """Add a movie (typed, titled, dated) and persist."""
        movieuri = URIRef('http://www.imdb.com/title/tt%s/' % movie.movieID)
        for triple in ((movieuri, RDF.type, IMDB['Movie']),
                       (movieuri, DC['title'], Literal(movie['title'])),
                       (movieuri, IMDB['year'], Literal(int(movie['year'])))):
            self.graph.add(triple)
        self.save()

    def new_review(self, movie, date, rating, comment=None):
        """Attach a review (0-5 rating, optional text) to a movie and persist."""
        review = BNode()  # @@ humanize the identifier (something like #rev-$date)
        movieuri = URIRef('http://www.imdb.com/title/tt%s/' % movie.movieID)
        triples = [
            (movieuri, REV['hasReview'], URIRef('%s#%s' % (storeuri, review))),
            (review, RDF.type, REV['Review']),
            (review, DC['date'], Literal(date)),
            (review, REV['maxRating'], Literal(5)),
            (review, REV['minRating'], Literal(0)),
            (review, REV['reviewer'], URIRef(storeuri + '#author')),
            (review, REV['rating'], Literal(rating)),
        ]
        if comment is not None:
            triples.append((review, REV['text'], Literal(comment)))
        for triple in triples:
            self.graph.add(triple)
        self.save()

    def movie_is_in(self, uri):
        """True when a movie with this uri is already stored."""
        return (URIRef(uri), RDF.type, IMDB['Movie']) in self.graph
class Store:
    """Graph-backed store for movies and reviews, persisted as n3."""

    def __init__(self):
        self.graph = ConjunctiveGraph()
        if os.path.exists(storefn):
            self.graph.load(storeuri, format="n3")
        # Bind output prefixes for readable serialization.
        self.graph.bind("dc", DC)
        self.graph.bind("foaf", FOAF)
        self.graph.bind("imdb", IMDB)
        self.graph.bind("rev", "http://purl.org/stuff/rev#")

    def _author(self):
        # Single well-known author node for this store.
        return URIRef(storeuri + "#author")

    def save(self):
        """Write the graph back to disk."""
        self.graph.serialize(storeuri, format="n3")

    def who(self, who=None):
        """With an argument, record author name/email; without, return names."""
        if who is None:
            return self.graph.objects(self._author(), FOAF["name"])
        match = r_who.match(who)
        name = match.group(1)
        email = match.group(2)
        author = self._author()
        self.graph.add((URIRef(storeuri), DC["title"], Literal(title % name)))
        self.graph.add((author, RDF.type, FOAF["Person"]))
        self.graph.add((author, FOAF["name"], Literal(name)))
        self.graph.add((author, FOAF["mbox"], Literal(email)))
        self.save()

    def new_movie(self, movie):
        """Record a movie's type, title, and year, then persist."""
        movieuri = URIRef("http://www.imdb.com/title/tt%s/" % movie.movieID)
        self.graph.add((movieuri, RDF.type, IMDB["Movie"]))
        self.graph.add((movieuri, DC["title"], Literal(movie["title"])))
        self.graph.add((movieuri, IMDB["year"], Literal(int(movie["year"]))))
        self.save()

    def new_review(self, movie, date, rating, comment=None):
        """Record a review of ``movie`` with optional comment, then persist."""
        review = BNode()  # @@ humanize the identifier (something like #rev-$date)
        movieuri = URIRef("http://www.imdb.com/title/tt%s/" % movie.movieID)
        additions = [
            (movieuri, REV["hasReview"], URIRef("%s#%s" % (storeuri, review))),
            (review, RDF.type, REV["Review"]),
            (review, DC["date"], Literal(date)),
            (review, REV["maxRating"], Literal(5)),
            (review, REV["minRating"], Literal(0)),
            (review, REV["reviewer"], self._author()),
            (review, REV["rating"], Literal(rating)),
        ]
        if comment is not None:
            additions.append((review, REV["text"], Literal(comment)))
        for triple in additions:
            self.graph.add(triple)
        self.save()

    def movie_is_in(self, uri):
        """True when this movie uri is already typed imdb:Movie in the graph."""
        return (URIRef(uri), RDF.type, IMDB["Movie"]) in self.graph
def get_vocab_mediator(vocabprefix):
    """Map each foaf:account in the vocabulary's status file to the
    mediator's full details. Empty dict when the status file is absent."""
    vocab_uri = URIRef("http://vocab.ox.ac.uk/%s" % vocabprefix)
    vocabdir = os.path.join(ag.vocabulariesdir, vocabprefix)
    vocabstatusfile = os.path.join(vocabdir, "status.rdf")
    if not os.path.isfile(vocabstatusfile):
        return {}
    graph = Graph()
    graph.parse(vocabstatusfile)
    return {str(account): get_mediator_details(str(account))
            for account in graph.objects(None, namespaces['foaf']['account'])}
def get_vocab_files(vocabprefix):
    """Get list of files for vocabulary.

    Returns {file_uri: {'name', 'format', 'path'}} from the status file;
    empty dict when the status file is absent.
    """
    vocab_uri = URIRef("http://vocab.ox.ac.uk/%s" % vocabprefix)
    vocabdir = os.path.join(ag.vocabulariesdir, vocabprefix)
    vocabstatusfile = os.path.join(vocabdir, "status.rdf")
    vocab_files = {}
    if not os.path.isfile(vocabstatusfile):
        return vocab_files
    graph = Graph()
    graph.parse(vocabstatusfile)
    for file_ref in graph.objects(None, namespaces['dcterms']['hasFormat']):
        entry = {'name': '', 'format': '', 'path': ''}
        for fmt in graph.objects(URIRef(file_ref), namespaces['dcterms']['format']):
            entry['format'] = str(fmt)
        for name in graph.objects(URIRef(file_ref), namespaces['nfo']['fileName']):
            entry['name'] = str(name)
        for url in graph.objects(URIRef(file_ref), namespaces['nfo']['fileUrl']):
            # fileUrl is stored as a file:// URL; keep only the path part.
            entry['path'] = str(url).replace('file://', '')
        vocab_files[str(file_ref)] = entry
    return vocab_files
class Store:
    """Movie/review triple store persisted to an n3 file."""

    def __init__(self):
        self.graph = ConjunctiveGraph()
        if os.path.exists(storefn):
            self.graph.load(storeuri, format='n3')
        # Prefixes for the serialized output.
        self.graph.bind('dc', DC)
        self.graph.bind('foaf', FOAF)
        self.graph.bind('imdb', IMDB)
        self.graph.bind('rev', 'http://purl.org/stuff/rev#')

    def save(self):
        """Persist the graph to the store file."""
        self.graph.serialize(storeuri, format='n3')

    def who(self, who=None):
        """Set the author (parsed as 'name <email>') or return stored names."""
        if who is None:
            return self.graph.objects(URIRef(storeuri + '#author'), FOAF['name'])
        parsed = r_who.match(who)
        name, email = parsed.group(1), parsed.group(2)
        author_node = URIRef(storeuri + '#author')
        for triple in (
                (URIRef(storeuri), DC['title'], Literal(title % name)),
                (author_node, RDF.type, FOAF['Person']),
                (author_node, FOAF['name'], Literal(name)),
                (author_node, FOAF['mbox'], Literal(email))):
            self.graph.add(triple)
        self.save()

    def new_movie(self, movie):
        """Store a movie's type, title, and year, then persist."""
        movieuri = URIRef('http://www.imdb.com/title/tt%s/' % movie.movieID)
        self.graph.add((movieuri, RDF.type, IMDB['Movie']))
        self.graph.add((movieuri, DC['title'], Literal(movie['title'])))
        self.graph.add((movieuri, IMDB['year'], Literal(int(movie['year']))))
        self.save()

    def new_review(self, movie, date, rating, comment=None):
        """Store a review for ``movie``; comment is optional."""
        review = BNode()  # @@ humanize the identifier (something like #rev-$date)
        movieuri = URIRef('http://www.imdb.com/title/tt%s/' % movie.movieID)
        self.graph.add((movieuri, REV['hasReview'],
                        URIRef('%s#%s' % (storeuri, review))))
        self.graph.add((review, RDF.type, REV['Review']))
        self.graph.add((review, DC['date'], Literal(date)))
        self.graph.add((review, REV['maxRating'], Literal(5)))
        self.graph.add((review, REV['minRating'], Literal(0)))
        self.graph.add((review, REV['reviewer'], URIRef(storeuri + '#author')))
        self.graph.add((review, REV['rating'], Literal(rating)))
        if comment is not None:
            self.graph.add((review, REV['text'], Literal(comment)))
        self.save()

    def movie_is_in(self, uri):
        """True when this uri is already stored as an imdb:Movie."""
        return (URIRef(uri), RDF.type, IMDB['Movie']) in self.graph
def get_vocab_editorial_note(vocabprefix):
    """Collect (note_text, file_name) pairs for every skos:editorialNote
    in the vocabulary's status file; empty list when the file is absent."""
    vocab_uri = URIRef("http://vocab.ox.ac.uk/%s" % vocabprefix)
    vocabdir = os.path.join(ag.vocabulariesdir, vocabprefix)
    vocabstatusfile = os.path.join(vocabdir, "status.rdf")
    msgs = []
    if not os.path.isfile(vocabstatusfile):
        return msgs
    graph = Graph()
    graph.parse(vocabstatusfile)
    for subj, _pred, note in graph.triples(
            (None, namespaces['skos']['editorialNote'], None)):
        filename = None
        # Last fileName statement wins; None when the subject has none.
        for name in graph.objects(URIRef(subj), namespaces['nfo']['fileName']):
            filename = str(name)
        msgs.append((str(note), filename))
    return msgs
def _get_thing_graph(td):
    """Build a graph for td's resource with fountain prefixes bound,
    ensuring a core:describedBy link back to the TD node exists."""
    source_graph = td.resource.to_graph()
    thing_graph = ConjunctiveGraph(identifier=td.resource.node)
    for prefix, uri in R.agora.fountain.prefixes.items():
        thing_graph.bind(prefix, uri)
    for triple in source_graph:
        thing_graph.add(triple)
    td_node = td.node
    # Only add the describedBy link when the resource doesn't already have one.
    if not list(thing_graph.objects(td.resource.node, CORE.describedBy)):
        thing_graph.add((td.resource.node, CORE.describedBy, td_node))
    return thing_graph
def get_vocab_description(vocabfile, vocabprefix):
    """Extract descriptive metadata for a vocabulary file.

    Parses ``vocabfile`` as RDF/XML first, falling back to N3. Description
    values are gathered per key: first from triples whose subject is the
    vocabulary's base URI, then (for keys still empty) from any subject.

    Returns a dict mapping description keys to lists of values, or {} when
    the file is missing or unparseable in both formats.
    """
    if not os.path.isfile(vocabfile):
        return {}
    graph = Graph()
    try:
        graph.parse(vocabfile)
    except Exception:  # was a bare except; narrowed. Dead `graph = None` store removed.
        # RDF/XML parse failed -- retry with a fresh graph as N3.
        graph = Graph()
        try:
            graph.parse(vocabfile, format="n3")
        except Exception:
            return {}
    descriptions = defaultdict(list)
    base = None
    properties = get_vocab_properties(vocabprefix)
    if 'preferredNamespaceUri' in properties and properties['preferredNamespaceUri']:
        base = properties['preferredNamespaceUri']
    else:
        # renamed from `id` to avoid shadowing the builtin
        (vocab_id, base, prefix) = get_vocab_base(vocabfile)
    # First pass: description predicates anchored on the base URI.
    if base:
        for k, predicates in vocab_description_uri.iteritems():
            for p in predicates:
                for val in graph.objects(URIRef(base), p):
                    if not val in descriptions[k]:
                        descriptions[k].append(val)
    # Second pass: any-subject matches, only for keys still empty.
    for k, predicates in vocab_description.iteritems():
        if not k in descriptions or not descriptions[k]:
            for p in predicates:
                for val in graph.objects(None, p):
                    if not val in descriptions[k]:
                        descriptions[k].append(val)
    return dict(descriptions)
def get_influence_links():
    """For every Wikipedia subject in G, fetch its DBpedia resource and add
    influencedBy edges back into G for influencers already present in G."""
    for wp_url in set(list(G.subjects())):
        wiki_match = re.match("https://en.wikipedia.org/wiki/(.+)", wp_url)
        if wiki_match is None:
            continue
        dbpedia_url = URIRef('http://dbpedia.org/resource/%s'
                             % wiki_match.group(1))
        dbpedia_graph = ConjunctiveGraph()
        dbpedia_graph.parse(dbpedia_url)
        for influencer in dbpedia_graph.objects(dbpedia_url,
                                                dbpedia.influencedBy):
            res_match = re.match("http://dbpedia.org/resource/(.+)$", influencer)
            if not res_match:
                continue
            wp_url2 = URIRef("https://en.wikipedia.org/wiki/"
                             + res_match.group(1))
            # Only link influencers that already have statements in G.
            if list(G.predicate_objects(wp_url2)):
                G.add((wp_url, dbpedia.influencedBy, wp_url2))
def test_encoding_rdf():
    """A CSV read with an explicit ISO-8859-1 encoding must produce the
    percent-encoded unit URI in the generated RDF."""
    # With encoding specified
    encoding = "ISO-8859-1"
    csvw = CSVW(csv_path="./tests/iso_encoding.csv",
                metadata_path="./tests/iso_encoding.csv-metadata.json",
                csv_encoding=encoding)
    graph = ConjunctiveGraph()
    graph.parse(data=csvw.to_rdf(), format="turtle")

    units = Namespace('http://example.org/units/')
    cars = Namespace('http://example.org/cars/')
    meta = Namespace("http://example.org/properties/")

    # micro sign is utf-8 encoded then percent-quoted into the URI
    expected_unit = units[quote(u"\xb5100".encode('utf-8'))]
    assert (cars['1'], meta['UnitOfMeasurement'], expected_unit) in graph
    assert expected_unit in set(graph.objects())
def test_included_schemas(self):
    """add_default_schemas should load the core RDF/RDFS/OWL vocabularies."""
    model = ConjunctiveGraph()
    add_default_schemas(model)
    # rdf test: the RDF namespace document should carry a dc:title.
    # (The original asserted `objects(...) is not None`, which is vacuous --
    # objects() always returns a generator. Materialize and check non-empty.)
    titles = list(model.objects(RDF, DC["title"]))
    self.assertTrue(len(titles) > 0)
    self.assertIn((RDF["Property"], RDF["type"], RDFS["Class"]), model)
    # rdfs test
    self.assertIn((RDFS["Class"], RDF["type"], RDFS["Class"]), model)
    self.assertIn((OWL["inverseOf"], RDF["type"], RDF["Property"]), model)
def test_null_values_with_multiple_strings():
    """When several null strings are configured, none of the matching
    values (or their predicate) may appear in the generated RDF."""
    csvw = CSVW(csv_path="tests/null1.csv",
                metadata_path="tests/null1.multiple.csv-metadata.json")
    graph = ConjunctiveGraph()
    graph.parse(data=csvw.to_rdf(), format="turtle")

    objects = set(graph.objects())
    for absent in (Literal('null_key', datatype=XSD.token),
                   Literal('null_sector'),
                   Literal('null_id', datatype=XSD.token)):
        assert absent not in objects
    for id in ['10', '11', '12', '13']:
        assert Literal(id, datatype=XSD.token) not in objects

    # The id column was entirely null, so its predicate vanishes too.
    predicates = set(graph.predicates())
    assert id_uri not in predicates
    assert Literal('1', datatype=XSD.token) not in objects
def handle(self, **options):
    """Link Place records lacking a dbpedia URI to their DBpedia resource.

    For each place with both a city and a state, fetch the DBpedia
    resource graph and, when it holds at least 3 assertions, save the
    dbpedia URI, coordinates, and any geonames owl:sameAs link.
    """
    _logger.debug("linking places")
    for place in models.Place.objects.filter(dbpedia__isnull=True):
        if not place.city or not place.state:
            continue
        # formulate a dbpedia place uri
        path = urllib2.quote('%s,_%s' % (_clean(place.city), _clean(place.state)))
        url = URIRef('http://dbpedia.org/resource/%s' % path)
        # attempt to get a graph from it
        graph = ConjunctiveGraph()
        try:
            _logger.debug("looking up %s" % url)
            graph.load(url)
        except urllib2.HTTPError as e:  # fixed py2-only "except X, e" syntax
            _logger.error(e)
        # if we've got more than 3 assertions extract some stuff from
        # the graph and save back some info to the db, would be nice
        # to have a triple store underneath where we could persist
        # all the facts eh?
        if len(graph) >= 3:
            place.dbpedia = url
            place.latitude = graph.value(url, geo['lat'])
            place.longitude = graph.value(url, geo['long'])
            # renamed loop var from `object` to avoid shadowing the builtin
            for obj in graph.objects(URIRef(url), owl['sameAs']):
                if obj.startswith('http://sws.geonames.org'):
                    place.geonames = obj
            place.save()
            _logger.info("found dbpedia resource %s" % url)
        else:
            _logger.warn("couldn't find dbpedia resource for %s" % url)
    reset_queries()
class Inspector(object):
    """ Class that includes methods for querying an RDFS/OWL ontology """

    def __init__(self, uri, language=""):
        """Parse the ontology at ``uri`` (RDF/XML first, then N3 fallback)
        and eagerly compute the full class list."""
        super(Inspector, self).__init__()
        self.rdfGraph = ConjunctiveGraph()
        try:
            self.rdfGraph.parse(uri, format="application/rdf+xml")
        except:
            try:
                self.rdfGraph.parse(uri, format="n3")
            except:
                raise exceptions.Error("Could not parse the file! Is it a valid RDF/OWL ontology?")
        finally:
            # NOTE(review): this finally block also runs while the Error above
            # is propagating, operating on a possibly-empty graph -- confirm
            # that is intended.
            self.baseURI = self.get_OntologyURI() or uri
            self.allclasses = self.__getAllClasses(includeDomainRange=True, includeImplicit=True, removeBlankNodes=False, excludeRDF_OWL=False)

    def get_OntologyURI(self, return_as_string=True):
        """Return the first owl:Ontology subject (str or URIRef), or None."""
        test = [x for x, y, z in self.rdfGraph.triples((None, RDF.type, Ontology))]
        if test:
            if return_as_string:
                return str(test[0])
            else:
                return test[0]
        else:
            return None

    def __getAllClasses(self, classPredicate="", includeDomainRange=False, includeImplicit=False, removeBlankNodes=True, addOWLThing=True, excludeRDF_OWL=True):
        """Collect class URIs: declared rdfs/owl classes, optionally
        domain/range targets and classes implied by subClassOf/rdf:type.
        Uses a dict keyed by URI as an insertion-ordered set."""
        rdfGraph = self.rdfGraph
        exit = {}

        def addIfYouCan(x, mydict):
            # Skip core RDF/RDFS/OWL terms when excludeRDF_OWL is set.
            if excludeRDF_OWL:
                if x.startswith('http://www.w3.org/2002/07/owl#') or \
                   x.startswith("http://www.w3.org/1999/02/22-rdf-syntax-ns#") or \
                   x.startswith("http://www.w3.org/2000/01/rdf-schema#"):
                    return mydict
            if x not in mydict:
                mydict[x] = None
            return mydict

        if addOWLThing:
            exit = addIfYouCan(Thing, exit)

        if classPredicate == "rdfs" or classPredicate == "":
            for s in rdfGraph.subjects(RDF.type, RDFS.Class):
                exit = addIfYouCan(s, exit)

        if classPredicate == "owl" or classPredicate == "":
            for s in rdfGraph.subjects(RDF.type, Class):
                exit = addIfYouCan(s, exit)

        if includeDomainRange:
            # Anything used as a domain or range is treated as a class.
            for o in rdfGraph.objects(None, RDFS.domain):
                exit = addIfYouCan(o, exit)
            for o in rdfGraph.objects(None, RDFS.range):
                exit = addIfYouCan(o, exit)

        if includeImplicit:
            # Both ends of subClassOf, and every rdf:type target, imply classes.
            for s, v, o in rdfGraph.triples((None, RDFS.subClassOf, None)):
                exit = addIfYouCan(s, exit)
                exit = addIfYouCan(o, exit)
            for o in rdfGraph.objects(None, RDF.type):
                exit = addIfYouCan(o, exit)

        # get a list
        exit = exit.keys()
        if removeBlankNodes:
            exit = [x for x in exit if not isBlankNode(x)]
        return sort_uri_list_by_name(exit)

    def __getTopclasses(self, classPredicate=''):
        """Return classes that have no direct superclass (hierarchy roots)."""
        returnlist = []
        for eachclass in self.__getAllClasses(classPredicate):
            x = self.get_classDirectSupers(eachclass)
            if not x:
                returnlist.append(eachclass)
        return sort_uri_list_by_name(returnlist)

    def __getTree(self, father=None, out=None):
        """Recursively build {class: [direct subclasses]}; key 0 holds the
        top layer. Only the root call returns the dict."""
        if not father:
            out = {}
            topclasses = self.toplayer
            out[0] = topclasses
            for top in topclasses:
                children = self.get_classDirectSubs(top)
                out[top] = children
                for potentialfather in children:
                    self.__getTree(potentialfather, out)
            return out
        else:
            children = self.get_classDirectSubs(father)
            out[father] = children
            for ch in children:
                self.__getTree(ch, out)

    def __buildClassTree(self, father=None, out=None):
        """Like __getTree but rooted at owl:Thing with name-sorted children."""
        if not father:
            out = {}
            topclasses = self.toplayer
            out[0] = [Thing]
            out[Thing] = sort_uri_list_by_name(topclasses)
            for top in topclasses:
                children = self.get_classDirectSubs(top)
                out[top] = sort_uri_list_by_name(children)
                for potentialfather in children:
                    self.__buildClassTree(potentialfather, out)
            return out
        else:
            children = self.get_classDirectSubs(father)
            out[father] = sort_uri_list_by_name(children)
            for ch in children:
                self.__buildClassTree(ch, out)

    # methods for getting ancestors and descendants of classes: by default, we do not include blank nodes
    def get_classDirectSupers(self, aClass, excludeBnodes=True, sortUriName=False):
        """Direct superclasses of ``aClass`` (owl:Thing excluded), deduplicated."""
        returnlist = []
        for o in self.rdfGraph.objects(aClass, RDFS.subClassOf):
            if not (o == Thing):
                if excludeBnodes:
                    if not isBlankNode(o):
                        returnlist.append(o)
                else:
                    returnlist.append(o)
        if sortUriName:
            return sort_uri_list_by_name(remove_duplicates(returnlist))
        else:
            return remove_duplicates(returnlist)

    def get_classDirectSubs(self, aClass, excludeBnodes=True):
        """Direct subclasses of ``aClass``, deduplicated and name-sorted."""
        returnlist = []
        for s, v, o in self.rdfGraph.triples((None, RDFS.subClassOf, aClass)):
            if excludeBnodes:
                if not isBlankNode(s):
                    returnlist.append(s)
            else:
                returnlist.append(s)
        return sort_uri_list_by_name(remove_duplicates(returnlist))

    def get_classSiblings(self, aClass, excludeBnodes=True):
        """Classes sharing a direct superclass with ``aClass`` (excluded itself)."""
        returnlist = []
        for father in self.get_classDirectSupers(aClass, excludeBnodes):
            for child in self.get_classDirectSubs(father, excludeBnodes):
                if child != aClass:
                    returnlist.append(child)
        return sort_uri_list_by_name(remove_duplicates(returnlist))

    def entitySynonyms(self, anEntity, language=DEFAULT_LANGUAGE, getall=True):
        """Synonym literals for ``anEntity``. With getall, concatenates
        Uberon/EFO/OBO synonym properties; otherwise returns the first
        Synonym matching ``language`` (or "")."""
        if getall:
            temp = []
            # Uberon synonyms
            for o in self.rdfGraph.objects(anEntity, Synonym):
                temp += [o]
            # EFO synonyms
            for o in self.rdfGraph.objects(anEntity, EFO_Synonym):
                temp += [o]
            # OBI synonyms
            for o in self.rdfGraph.objects(anEntity, OBO_Synonym):
                temp += [o]
            return temp
        else:
            for o in self.rdfGraph.objects(anEntity, Synonym):
                if getattr(o, 'language') and getattr(o, 'language') == language:
                    return o
            return ""

    def classFind(self, name, exact=False):
        """Case-insensitive search over allclasses: exact match returns a
        single-element list; otherwise all substring matches."""
        temp = []
        if name:
            for x in self.allclasses:
                if exact:
                    if x.__str__().lower() == str(name).lower():
                        return [x]
                else:
                    if x.__str__().lower().find(str(name).lower()) >= 0:
                        temp.append(x)
        return temp
# step 1: find all the classes.
# Consistency check: every property's domain must be a declared class, and
# every range / superclass must be declared, an XSD datatype, rdfs:Resource,
# or one of the known external classes.
rdftype = URIRef("http://www.w3.org/1999/02/22-rdf-syntax-ns#type")
rdfsdomain = URIRef("http://www.w3.org/2000/01/rdf-schema#domain")
rdfsrange = URIRef("http://www.w3.org/2000/01/rdf-schema#range")
# FIX: rdfs:Resource lives in the rdf-schema namespace; the original used
# .../22-rdf-syntax-ns#Resource, a URI that no vocabulary emits, making the
# exemption below dead code.
rdfsresource = URIRef("http://www.w3.org/2000/01/rdf-schema#Resource")
rdfssco = URIRef("http://www.w3.org/2000/01/rdf-schema#subClassOf")
asColl = URIRef("http://www.w3.org/ns/activitystreams#OrderedCollection")
skosConcept = URIRef("http://www.w3.org/2004/02/skos/core#Concept")
otherClasses = [asColl, skosConcept]

classes = list(g.subjects(rdftype, URIRef("http://www.w3.org/2000/01/rdf-schema#Class")))
props = list(g.subjects(rdftype, URIRef("http://www.w3.org/1999/02/22-rdf-syntax-ns#Property")))

# Every declared domain must be a known class.
for p in props:
    domains = list(g.objects(p, rdfsdomain))
    for d in domains:
        assert(d in classes)

# Ranges may additionally be XSD datatypes or rdfs:Resource.
for p in props:
    ranges = list(g.objects(p, rdfsrange))
    for r in ranges:
        if not r in classes and not str(r).startswith("http://www.w3.org/2001/XMLSchema#") and \
           not r == rdfsresource:
            # parenthesized for py2/py3 compatibility (same output on both)
            print("Found inconsistent property: %s has unknown range" % p)

# Superclasses may additionally be one of the known external classes.
for c in classes:
    parents = list(g.objects(c, rdfssco))
    for p in parents:
        if not p in classes and not p in otherClasses:
            print("Found inconsistent class: %s has unknown superClass" % c)
def __load_citations_from_rdf_file(data_f_path, prov_f_path, service_name, id_type, id_shape, citation_type):
    """Yield Citation objects reconstructed from a data file (nt11) and its
    provenance file (nquads).

    For each citation entity, the latest provenance snapshot (highest
    ``/se/N`` suffix) is selected and its metadata merged into the
    Citation constructor call.
    """
    citation_data = Graph()
    citation_data.load(data_f_path, format="nt11")
    citation_prov = ConjunctiveGraph()
    citation_prov.load(prov_f_path, format="nquads")
    for cit_ent in citation_data.subjects(RDF.type, Citation.citation):
        # Pick the provenance entity with the highest snapshot number.
        prov_entity = None
        snapshot = 0
        for entity in citation_prov.subjects(Citation.specialization_of, cit_ent):
            entity_snapshot = int(sub("^.+/se/(.+)$", "\\1", entity))
            if prov_entity is None or snapshot < entity_snapshot:
                prov_entity = entity
                snapshot = entity_snapshot
        # Optional fields: last matching value wins, None when absent.
        invalidated = None
        update = None
        creation_date = None
        timespan = None
        for en in citation_prov.objects(prov_entity, Citation.invalidated_at_time):
            invalidated = str(en)
        for en in citation_prov.objects(prov_entity, Citation.has_update_query):
            update = str(en)
        for en in citation_data.objects(
                cit_ent, Citation.has_citation_creation_date):
            creation_date = str(en)
        for en in citation_data.objects(cit_ent, Citation.has_citation_time_span):
            timespan = str(en)
        # NOTE(review): the constructor receives `entity_snapshot` (the last
        # value seen in the loop above) rather than `snapshot` (the tracked
        # maximum) -- confirm this is intended; it also raises NameError when
        # no provenance entity exists for cit_ent.
        c = Citation(
            sub("^.+/ci/(.+)$", "\\1", str(cit_ent)),
            str(
                list(
                    citation_data.objects(cit_ent, Citation.has_citing_entity))[0]),
            None,
            str(
                list(
                    citation_data.objects(cit_ent, Citation.has_cited_entity))[0]),
            None,
            creation_date,
            timespan,
            entity_snapshot,
            str(
                list(
                    citation_prov.objects(prov_entity, Citation.was_attributed_to))[0]),
            str(
                list(
                    citation_prov.objects(
                        prov_entity, Citation.had_primary_source))[0]),
            str(
                list(
                    citation_prov.objects(prov_entity, Citation.generated_at_time))[0]),
            service_name,
            id_type,
            id_shape,
            citation_type,
            Citation.journal_self_citation in citation_data.objects(
                cit_ent, RDF.type),
            Citation.author_self_citation in citation_data.objects(cit_ent, RDF.type),
            invalidated,
            str(
                list(
                    citation_prov.objects(prov_entity, Citation.description))[0]),
            update)
        yield c
class Owler(object):
    """
    Class that includes methods for building an RDF graph from an OWL
    ontology and retrieving information from it.
    """

    def __init__(self, uri, language=""):
        """Parse `uri` as RDF/XML first, then fall back to N3.

        Raises exceptions.Error when neither format can be parsed.
        Fix: the two original bare `except:` clauses were narrowed to
        `except Exception:` so KeyboardInterrupt/SystemExit propagate.
        """
        super(Owler, self).__init__()
        self.rdfGraph = ConjunctiveGraph()
        try:
            self.rdfGraph.parse(uri, format="application/rdf+xml")
        except Exception:
            try:
                self.rdfGraph.parse(uri, format="n3")
            except Exception:
                raise exceptions.Error("Could not parse the file! Is it a valid RDF/OWL ontology?")
        finally:
            # Always record a base URI, preferring a declared owl:Ontology URI.
            self.baseURI = self.__get_OntologyURI() or uri
        self.allclasses = self.__getAllClasses(includeDomainRange=True, includeImplicit=True,
                                               removeBlankNodes=False, excludeRDF_OWL=False)

    def __get_OntologyURI(self, return_as_string=True):
        """Return the subject typed as owl:Ontology, or None if absent."""
        test = [x for x, y, z in self.rdfGraph.triples((None, RDF.type, Ontology))]
        if test:
            if return_as_string:
                return str(test[0])
            else:
                return test[0]
        else:
            return None

    def __getAllClasses(self, classPredicate="", includeDomainRange=False, includeImplicit=False,
                        removeBlankNodes=True, addOWLThing=True, excludeRDF_OWL=True):
        """Collect every class URI in the graph, sorted by name.

        classPredicate   -- "rdfs", "owl", or "" for both
        includeDomainRange -- also count rdfs:domain / rdfs:range targets
        includeImplicit  -- also count subClassOf endpoints and rdf:type objects
        removeBlankNodes -- drop blank-node classes from the result
        addOWLThing      -- seed the result with owl:Thing
        excludeRDF_OWL   -- skip the RDF/RDFS/OWL vocabulary terms themselves
        """
        rdfGraph = self.rdfGraph
        # dict used as an ordered set keyed by URI
        # (renamed from `exit`, which shadowed the builtin)
        found = {}

        def addIfYouCan(x, mydict):
            if excludeRDF_OWL:
                if x.startswith('http://www.w3.org/2002/07/owl#') or \
                   x.startswith("http://www.w3.org/1999/02/22-rdf-syntax-ns#") or \
                   x.startswith("http://www.w3.org/2000/01/rdf-schema#"):
                    return mydict
            if x not in mydict:
                mydict[x] = None
            return mydict

        if addOWLThing:
            found = addIfYouCan(Thing, found)

        if classPredicate == "rdfs" or classPredicate == "":
            for s in rdfGraph.subjects(RDF.type, RDFS.Class):
                found = addIfYouCan(s, found)
        if classPredicate == "owl" or classPredicate == "":
            for s in rdfGraph.subjects(RDF.type, Class):
                found = addIfYouCan(s, found)

        if includeDomainRange:
            # domains and ranges are implicitly classes
            for o in rdfGraph.objects(None, RDFS.domain):
                found = addIfYouCan(o, found)
            for o in rdfGraph.objects(None, RDFS.range):
                found = addIfYouCan(o, found)

        if includeImplicit:
            # both ends of a subClassOf arc, and every rdf:type object
            for s, v, o in rdfGraph.triples((None, RDFS.subClassOf, None)):
                found = addIfYouCan(s, found)
                found = addIfYouCan(o, found)
            for o in rdfGraph.objects(None, RDF.type):
                found = addIfYouCan(o, found)

        # get a list
        classes = found.keys()
        if removeBlankNodes:
            classes = [x for x in classes if not isBlankNode(x)]
        return sortUriListByName(classes)

    # methods for getting ancestors and descendants of classes: by default,
    # we do not include blank nodes
    def get_classDirectSupers(self, aClass, excludeBnodes=True, sortUriName=False):
        """Return the direct superclasses of `aClass`.

        owl:Thing is always excluded; blank nodes are excluded unless
        excludeBnodes is False.  Duplicates are removed; optionally sorted.
        """
        returnlist = []
        for o in self.rdfGraph.objects(aClass, RDFS.subClassOf):
            if o == Thing:
                continue  # implicit super of everything — not informative
            if excludeBnodes and isBlankNode(o):
                continue
            returnlist.append(o)
        if sortUriName:
            return sortUriListByName(removeDuplicates(returnlist))
        else:
            return removeDuplicates(returnlist)
from pprint import pprint

# Demo walk-through of the `primer` graph (built earlier in this script):
# dump the raw triples, then exercise each of rdflib's convenience accessors.
print("All the things in the Graph:")
pprint(list(primer))  # just think .whatever((s, p, o))

# here we report on what we know
print("==================")
print("Subjects:")
pprint(list(primer.subjects()))
print("Predicates:")
pprint(list(primer.predicates()))
print("Objects:")
pprint(list(primer.objects()))
print("==================")

# and other things that make sense
print("What we know about pat:")
pprint(list(primer.predicate_objects(myNS.pat)))
print("Who is what age?")
pprint(list(primer.subject_objects(myNS.age)))
print("==================")
print("==================")

# Okay, so lets now work with a bigger
# dataset from the example, and start
def handle(self, **options):
    """Link Place models to their dbpedia resources (filling dbpedia,
    geonames, latitude and longitude) and dump the harvested facts as the
    core/fixtures/place_links.json fixture.

    Fixes: `file()` (removed in Python 3) replaced by `open()`; deprecated
    `LOGGER.warn` replaced by `LOGGER.warning`; log messages now use lazy
    %-style arguments instead of eager string interpolation.
    """
    LOGGER.debug("linking places")
    for place in models.Place.objects.filter(dbpedia__isnull=True):
        # dbpedia URIs are built from "City,_State"; skip incomplete places
        if not place.city or not place.state:
            continue

        # formulate a dbpedia place uri
        path = urllib2.quote('%s,_%s' % (_clean(place.city), _clean(place.state)))
        url = URIRef('http://dbpedia.org/resource/%s' % path)

        # attempt to get a graph from it
        graph = ConjunctiveGraph()
        try:
            LOGGER.debug("looking up %s", url)
            graph.load(url)
        except urllib2.HTTPError as e:
            LOGGER.error(e)

        # if we've got more than 3 assertions extract some stuff from
        # the graph and save back some info to the db, would be nice
        # to have a triple store underneath where we could persist
        # all the facts eh?
        if len(graph) >= 3:
            place.dbpedia = url
            place.latitude = graph.value(url, geo['lat'])
            place.longitude = graph.value(url, geo['long'])
            # keep only the geonames sameAs link
            for object in graph.objects(URIRef(url), owl['sameAs']):
                if object.startswith('http://sws.geonames.org'):
                    place.geonames = object
            place.save()
            LOGGER.info("found dbpedia resource %s", url)
        else:
            LOGGER.warning("couldn't find dbpedia resource for %s", url)

    reset_queries()
    LOGGER.info("finished looking up places in dbpedia")
    LOGGER.info("dumping place_links.json fixture")

    # so it would be nice to use django.core.serializer here
    # but it serializes everything about the model, including
    # titles that are linked to ... and this could theoretically
    # change over time, so we only preserve the facts that have
    # been harvested from dbpedia, so they can overlay over
    # the places that have been extracted during title load
    json_src = []
    places_qs = models.Place.objects.filter(dbpedia__isnull=False)
    for p in places_qs.order_by('name'):
        json_src.append({
            'name': p.name,
            'dbpedia': p.dbpedia,
            'geonames': p.geonames,
            'longitude': p.longitude,
            'latitude': p.latitude,
        })
    reset_queries()
    json.dump(json_src, open('core/fixtures/place_links.json', 'w'), indent=2)
    LOGGER.info("finished dumping place_links.json fixture")
        # Interior of the LCC outline parsing loop — the enclosing `for line`
        # loop and the `if class_match:` header are outside this chunk.
        # This branch handles a line that matched the class-heading regex.
        range = class_match.group(1)
        label = class_match.group(2)
        # Normalize label casing: "SOME LABEL" -> "Some Label"
        parts = re.split(r' +', label)
        label = ' '.join(l.lower().capitalize() for l in parts).strip()
        position = 0
    else:
        # Tab-delimited line: leading tabs give the depth in the hierarchy.
        parts = line.split("\t")
        label = parts.pop().strip()
        range = parts.pop(0).strip()
        position = len(parts) + 1

    # if there's no range then we've got a chunk of text that needs
    # to be added to the last concept we added to the graph
    if not range:
        uri = range_uri(lc_class[-1][0])
        old_label = list(g.objects(uri, SKOS.prefLabel))[0]
        new_label = "%s %s" % (old_label, label)
        g.remove((uri, SKOS.prefLabel, old_label))
        g.add((uri, SKOS.prefLabel, Literal(new_label, 'en')))
        continue

    # Truncate the breadcrumb stack to this depth, push the new entry, and
    # emit the SKOS concept with its full "a--b--c" path label.
    lc_class = lc_class[0:position]
    lc_class.insert(position, (range, label))
    label = '--'.join([c[1] for c in lc_class])
    uri = range_uri(range)
    g.add((uri, RDF.type, SKOS.Concept))
    g.add((uri, SKOS.prefLabel, Literal(label, 'en')))
    g.add((uri, SKOS.notation, Literal(range, datatype=LCC)))
class Store:
    """Persistence layer (Python 2): one rdflib graph assembled from a
    structure file, a store file, and snapshots of the configured sensors."""

    def __init__(self, storefile = config['STORE_FILE'], storeuri = config['STORE_URI'],
                 structfile = config['STRUCT_FILE'], structuri = config['STRUCT_URI'],
                 namespaces = config['NAMESPACES'], sensors = config['SENSORS'],
                 format = config['FORMAT']):
        self.storeuri = storeuri
        self.storefile = storefile
        self.structuri = structuri
        self.structfile = structfile
        self.namespaces = namespaces
        self.sensors = sensors
        self.format = format
        # NOTE(review): self.namespaces IS the `namespaces` argument, so this
        # loop copies the dict into itself — a no-op.  Confirm whether a real
        # copy ({} first) was intended.
        for namespace, uri in namespaces.iteritems():
            self.namespaces[namespace] = uri
        self.reset()

    def reset(self, new_graph=None):
        """Rebuild self.graph (or adopt `new_graph`), bind the namespaces,
        then load the structure and store files if they exist on disk."""
        if new_graph is not None:
            self.graph = new_graph
        else:
            self.graph = ConjunctiveGraph()
        for namespace, uri in self.namespaces.iteritems():
            self.graph.bind(namespace, uri)
        if os.path.exists(self.structfile):
            self.graph.load(self.structuri, format=self.format)
        if os.path.exists(self.storefile):
            self.graph.load(self.storeuri, format=self.format)

    def save(self, format=None):
        """Serialize the graph back to the store URI (defaults to self.format)."""
        if not format:
            format = self.format
        self.graph.serialize(self.storeuri, format=format)

    def get(self, something):
        # stub — not implemented
        pass

    def snapshot(self):
        """Reset, then merge in a freshly-snapshotted graph from each sensor.
        Sensor classes are looked up by name in this module's globals."""
        self.reset()
        for sensor in self.sensors:
            constructor = globals()[sensor]
            instance = constructor()
            instance.snapshot()
            # graph + graph produces a new merged graph
            self.graph = self.graph + instance.graph

    @property
    def queries(self):
        """Base queries merged with each sensor class's query dictionary."""
        q = self.basequeries
        for sensor in self.sensors:
            constructor = globals()[sensor]
            q = dict(q, **constructor.queries)
        return q

    @property
    def local(self):
        """ Idee: local config only """
        hostname = getHostname()
        # NOTE(review): `query` is built but never executed or returned —
        # only the hostname is returned.  Confirm whether the query was
        # meant to be run against the graph.
        query = "?m a :Machine . ?m location %s" % hostname
        return hostname

    def on(self, ip=None):
        """ Idee: store.on('192.168.1.43').query('SELECT..') """

    def query(self, sparql):
        """Run a SPARQL query against the assembled graph."""
        return self.graph.query(sparql)

    def register(self):
        # stub — gathers identity but does nothing with it yet
        name = getHostname()
        location = ""

    def who(self, who=None):
        """With `who` = "Name <email>": record title/author triples and save.
        Without an argument: return the recorded author name(s).
        NOTE(review): uses module-level `storeuri` and `title`, not
        self.storeuri — confirm these globals exist at call time."""
        if who is not None:
            name, email = (r_who.match(who).group(1), r_who.match(who).group(2))
            self.graph.add((URIRef(storeuri), DC['title'], Literal(title % name)))
            self.graph.add((URIRef(storeuri+'#author'), RDF.type, FOAF['Person']))
            self.graph.add((URIRef(storeuri+'#author'), FOAF['name'], Literal(name)))
            self.graph.add((URIRef(storeuri+'#author'), FOAF['mbox'], Literal(email)))
            self.save()
        else:
            return self.graph.objects(URIRef(storeuri+'#author'), FOAF['name'])

    def new_movie(self, movie):
        """Add an IMDB movie (title, year) to the graph and persist."""
        movieuri = URIRef('http://www.imdb.com/title/tt%s/' % movie.movieID)
        self.graph.add((movieuri, RDF.type, IMDB['Movie']))
        self.graph.add((movieuri, DC['title'], Literal(movie['title'])))
        self.graph.add((movieuri, IMDB['year'], Literal(int(movie['year']))))
        self.save()

    def new_review(self, movie, date, rating, comment=None):
        """Attach a rev:Review (0-5 rating scale) to `movie` and persist."""
        review = BNode()  # @@ humanize the identifier (something like #rev-$date)
        movieuri = URIRef('http://www.imdb.com/title/tt%s/' % movie.movieID)
        self.graph.add((movieuri, REV['hasReview'], URIRef('%s#%s' % (storeuri, review))))
        self.graph.add((review, RDF.type, REV['Review']))
        self.graph.add((review, DC['date'], Literal(date)))
        self.graph.add((review, REV['maxRating'], Literal(5)))
        self.graph.add((review, REV['minRating'], Literal(0)))
        self.graph.add((review, REV['reviewer'], URIRef(storeuri+'#author')))
        self.graph.add((review, REV['rating'], Literal(rating)))
        print comment
        if comment is not None:
            self.graph.add((review, REV['text'], Literal(comment)))
        self.save()

    def movie_is_in(self, uri):
        """True iff `uri` is already recorded as an IMDB movie."""
        return (URIRef(uri), RDF.type, IMDB['Movie']) in self.graph

    # Queries available on every Store regardless of sensors.
    basequeries = {
        "Trifle Entities": """ SELECT ?Subject ?Object WHERE { ?Subject rdfs:subClassOf ?Object } """
    }
def handle(self, *args, **options):
    """Index every SKOS concept (excluding unbist:PlaceName instances) from
    the Sleepycat-backed thesaurus graph into the 'thesaurus' Elasticsearch
    index, one document per concept with labels, alt-labels and scope notes."""
    es = settings.ELASTIC_SEARCH_URL
    db = os.path.join(settings.BASE_DIR, "db")
    print(es)
    graph = ConjunctiveGraph('Sleepycat')
    graph.open(db, create=False)
    graph.bind('skos', SKOS)
    # NOTE(review): EU and UNBIST are assigned but never used below; the
    # `unbist:` prefix in the query presumably relies on a binding already
    # present in the persistent store — confirm.
    EU = Namespace('http://eurovoc.europa.eu/schema#')
    UNBIST = Namespace('http://unontologies.s3-website-us-east-1.amazonaws.com/unbist#')
    querystring = "select ?uri where { ?uri rdf:type skos:Concept filter not exists { ?uri rdf:type unbist:PlaceName } . }"
    index = 1  # sequential Elasticsearch document ids

    # make the index:
    thes_index = {
        "mappings": {
            "terms": {
                "properties": {
                    "scope_notes": {"type": "string"},
                    "uri": {"type": "string"},
                    "alt_labels": {"type": "string"},
                    "alt_labels_orig": {"type": "string", "index": "not_analyzed"},
                    "labels": {"type": "string"},
                    "labels_orig": {"type": "string", "index": "not_analyzed"}
                }
            }
        }
    }
    r = requests.put(es + 'thesaurus/', data=json.dumps(thes_index))

    for uri in graph.query(querystring):
        this_uri = uri[0]
        doc = {"uri": this_uri}

        pref_labels = []
        labels_orig_lc = []
        print("Getting preferred labels")
        # preferredLabel yields (property, label) pairs; label[1] is the literal
        for label in graph.preferredLabel(URIRef(this_uri)):
            pref_labels.append(label[1])
            # lower-cased copies kept only for en/fr/es labels
            if label[1].language in ['en','fr','es']:
                labels_orig_lc.append(label[1].lower())
        doc.update({"labels": pref_labels})
        doc.update({"labels_orig": pref_labels + labels_orig_lc})

        alt_labels = []
        alt_labels_orig_lc = []
        print("Getting alternate labels")
        for label in graph.objects(URIRef(this_uri), SKOS.altLabel):
            alt_labels.append(label)
            if label.language in ['en','fr','es']:
                alt_labels_orig_lc.append(label.lower())
        doc.update({"alt_labels": alt_labels})
        doc.update({"alt_labels_orig": alt_labels + alt_labels_orig_lc})

        scope_notes = []
        print("Getting scope notes")
        for sn in graph.objects(URIRef(this_uri), SKOS.scopeNote):
            scope_notes.append(sn)
        doc.update({"scope_notes": scope_notes})

        payload = json.dumps(doc)
        r = requests.put(es + 'thesaurus/terms/' + str(index), data=payload)
        index += 1
def test_multiple_value_urls_in_virtual():
    """End-to-end check of rdf:List-valued virtual columns: each *-RANGE
    subject carries an OWL datatype restriction list, every list is properly
    nil-terminated, and empty valueUrls produce no triples.

    Fixes: the constants loop queried `r_amount` on every iteration instead
    of the loop variable `s` (so r_id/r_desc were never actually checked),
    and the empty-valueUrl checks tested predicate URIs against g.objects(),
    which passed vacuously — they now test g.predicates().
    """
    csvw = CSVW(csv_path="tests/value_urls.csv",
                metadata_path="tests/value_urls.csv-metadata.json")
    rdf_contents = csvw.to_rdf(fmt="nt")
    g = ConjunctiveGraph()
    g.parse(data=rdf_contents, format="nt")

    # Test subjects
    all_subjects = list(g.subjects())
    s_amount = NS['amount']
    s_desc = NS['description']
    s_id = NS['id']
    assert s_amount in all_subjects
    assert s_desc in all_subjects
    assert s_id in all_subjects

    # Test descriptions
    p_def = NS['definition']
    assert len(list(g.triples(
        (s_amount, p_def, Literal("the amount paid"))))) == 1
    assert len(list(g.triples(
        (s_desc, p_def, Literal("description of the expense"))))) == 1
    assert len(list(g.triples((s_id, p_def, Literal("transaction id"))))) == 1

    # Test each is a element type
    o_element = NS['element']
    assert len(list(g.triples((s_amount, RDF.type, o_element)))) == 1
    assert len(list(g.triples((s_desc, RDF.type, o_element)))) == 1
    assert len(list(g.triples((s_id, RDF.type, o_element)))) == 1

    # Test that range is specified
    r_amount = NS['element/amount-RANGE']
    r_desc = NS['element/description-RANGE']
    r_id = NS['element/id-RANGE']
    assert len(list(g.triples((s_amount, RDFS.range, r_amount)))) == 1
    assert len(list(g.triples((s_desc, RDFS.range, r_desc)))) == 1
    assert len(list(g.triples((s_id, RDFS.range, r_id)))) == 1

    # Range is another subject
    assert r_amount in all_subjects
    assert r_desc in all_subjects
    assert r_id in all_subjects

    # Range is a OWL datatype of specified type
    assert len(list(g.triples((r_amount, OWL.onDatatype, XSD.decimal)))) == 1
    assert len(list(g.triples((r_desc, OWL.onDatatype, XSD.string)))) == 1
    assert len(list(g.triples((r_id, OWL.onDatatype, XSD.integer)))) == 1

    # Check the restrictions for amount: walk the rdf:List
    # [decimal, MaxLength, 10, MinLength, 1]
    rest_amount_node = list(g.triples((r_amount, OWL.withRestrictions, None)))
    rest_amount_node = rest_amount_node[0][2]
    assert isinstance(rest_amount_node, BNode)
    assert len(list(g.triples(
        (rest_amount_node, RDF.first, XSD.decimal)))) == 1
    rest_amount_node = next(
        g.objects(subject=rest_amount_node, predicate=RDF.rest))
    assert len(list(g.triples(
        (rest_amount_node, RDF.first, XSD.MaxLength)))) == 1
    rest_amount_node = next(
        g.objects(subject=rest_amount_node, predicate=RDF.rest))
    assert len(list(g.triples(
        (rest_amount_node, RDF.first,
         Literal(10, datatype=XSD.nonNegativeInteger))))) == 1
    rest_amount_node = next(
        g.objects(subject=rest_amount_node, predicate=RDF.rest))
    assert len(list(g.triples(
        (rest_amount_node, RDF.first, XSD.MinLength)))) == 1
    rest_amount_node = next(
        g.objects(subject=rest_amount_node, predicate=RDF.rest))
    assert len(list(g.triples(
        (rest_amount_node, RDF.first,
         Literal(1, datatype=XSD.nonNegativeInteger))))) == 1
    rest_amount_node = next(
        g.objects(subject=rest_amount_node, predicate=RDF.rest))
    # end of list: no further first/rest links
    assert len(list(g.triples((rest_amount_node, RDF.first, None)))) == 0
    assert len(list(g.triples((rest_amount_node, RDF.rest, None)))) == 0

    # Check the restrictions for description: [string, MaxLength, 100]
    rest_desc_node = list(g.triples((r_desc, OWL.withRestrictions, None)))
    rest_desc_node = rest_desc_node[0][2]
    assert isinstance(rest_desc_node, BNode)
    assert len(list(g.triples((rest_desc_node, RDF.first, XSD.string)))) == 1
    rest_desc_node = next(g.objects(subject=rest_desc_node, predicate=RDF.rest))
    assert len(list(g.triples(
        (rest_desc_node, RDF.first, XSD.MaxLength)))) == 1
    rest_desc_node = next(g.objects(subject=rest_desc_node, predicate=RDF.rest))
    assert len(list(g.triples(
        (rest_desc_node, RDF.first,
         Literal(100, datatype=XSD.nonNegativeInteger))))) == 1
    rest_desc_node = next(g.objects(subject=rest_desc_node, predicate=RDF.rest))
    assert len(list(g.triples((rest_desc_node, RDF.first, None)))) == 0
    assert len(list(g.triples((rest_desc_node, RDF.rest, None)))) == 0

    # Check the restrictions for id: [integer, MinLength, 0]
    rest_id_node = list(g.triples((r_id, OWL.withRestrictions, None)))
    rest_id_node = rest_id_node[0][2]
    assert isinstance(rest_id_node, BNode)
    assert len(list(g.triples((rest_id_node, RDF.first, XSD.integer)))) == 1
    rest_id_node = next(g.objects(subject=rest_id_node, predicate=RDF.rest))
    assert len(list(g.triples((rest_id_node, RDF.first, XSD.MinLength)))) == 1
    rest_id_node = next(g.objects(subject=rest_id_node, predicate=RDF.rest))
    assert len(list(g.triples(
        (rest_id_node, RDF.first,
         Literal(0, datatype=XSD.nonNegativeInteger))))) == 1
    rest_id_node = next(g.objects(subject=rest_id_node, predicate=RDF.rest))
    assert len(list(g.triples((rest_id_node, RDF.first, None)))) == 0
    assert len(list(g.triples((rest_id_node, RDF.rest, None)))) == 0

    # Check constant value for each range subject
    # (fixed: query `s`, not `r_amount`, inside the loop)
    const_prop = NS['another-list-value-with-constants']
    for s in [r_amount, r_id, r_desc]:
        constant_node = list(g.triples((s, const_prop, None)))
        constant_node = constant_node[0][2]
        assert isinstance(constant_node, BNode)
        assert len(list(g.triples(
            (constant_node, RDF.first, XSD.Length)))) == 1
        constant_node = next(
            g.objects(subject=constant_node, predicate=RDF.rest))
        assert len(list(g.triples(
            (constant_node, RDF.first,
             Literal(1, datatype=XSD.nonNegativeInteger))))) == 1
        constant_node = next(
            g.objects(subject=constant_node, predicate=RDF.rest))
        assert len(list(g.triples((constant_node, RDF.first, None)))) == 0
        assert len(list(g.triples((constant_node, RDF.rest, None)))) == 0

    # Verify that empty valueUrl does not end up in graph or rdf contents
    # (fixed: these names are predicates, so check g.predicates())
    assert NS['empty-list-predicate1'] not in list(g.predicates())
    assert "empty-list-predicate1" not in rdf_contents

    # Verify that empty valueUrl does not end up in graph
    assert NS['empty-list-predicate2'] not in list(g.predicates())
    assert "empty-list-predicate2" not in rdf_contents

    # Test total number of lists through rdf:nils in order to verify each list
    # ends up with a nil
    test_num_lists = 3 * 3  # 3 rows and 3 virtual list valued columns
    nil_text = "<http://www.w3.org/1999/02/22-rdf-syntax-ns#nil> ."
    assert rdf_contents.count(nil_text) == test_num_lists
def handle(self, **options):
    """Link Place models to their dbpedia resources (filling dbpedia,
    geonames, latitude and longitude) and dump the harvested facts as the
    core/fixtures/place_links.json fixture."""
    LOGGER.debug("linking places")
    for place in models.Place.objects.filter(dbpedia__isnull=True):
        # dbpedia URIs are built from "City,_State"; skip incomplete places
        if not place.city or not place.state:
            continue

        # formulate a dbpedia place uri
        path = urllib2.quote('%s,_%s' % (_clean(place.city), _clean(place.state)))
        url = URIRef('http://dbpedia.org/resource/%s' % path)

        # attempt to get a graph from it
        graph = ConjunctiveGraph()
        try:
            LOGGER.debug("looking up %s", url)
            graph.load(url)
        except urllib2.HTTPError:
            LOGGER.exception("Error fetching %s", url)

        # if we've got more than 3 assertions extract some stuff from
        # the graph and save back some info to the db, would be nice
        # to have a triple store underneath where we could persist
        # all the facts eh?
        if len(graph) >= 3:
            place.dbpedia = url
            place.latitude = graph.value(url, geo['lat'])
            place.longitude = graph.value(url, geo['long'])
            # keep only the geonames sameAs link
            for object in graph.objects(URIRef(url), owl['sameAs']):
                if object.startswith('http://sws.geonames.org'):
                    place.geonames = object
            place.save()
            LOGGER.info("found dbpedia resource %s", url)
        else:
            LOGGER.warning("couldn't find dbpedia resource for %s", url)

    # clear django's debug query log between phases
    reset_queries()
    LOGGER.info("finished looking up places in dbpedia")
    LOGGER.info("dumping place_links.json fixture")

    # so it would be nice to use django.core.serializer here
    # but it serializes everything about the model, including
    # titles that are linked to ... and this could theoretically
    # change over time, so we only preserve the facts that have
    # been harvested from dbpedia, so they can overlay over
    # the places that have been extracted during title load
    json_src = []
    places_qs = models.Place.objects.filter(dbpedia__isnull=False)
    for p in places_qs.order_by('name'):
        json_src.append(
            {
                'name': p.name,
                'dbpedia': p.dbpedia,
                'geonames': p.geonames,
                'longitude': p.longitude,
                'latitude': p.latitude,
            }
        )
    reset_queries()
    json.dump(json_src, open('core/fixtures/place_links.json', 'w'), indent=2)
    LOGGER.info("finished dumping place_links.json fixture")
class FairMetricData():
    """Accessors for a single FAIR-metric nanopublication, identified by its
    purl and parsed as TriG into a local graph."""

    def __init__(self, id):
        self.base = 'https://purl.org/fair-metrics/'
        self.id = URIRef(id)
        # the metric's assertion graph holds authors/title/comment
        self.assertion = URIRef(id+'#assertion')
        # id = id.replace(self.base, '')  # HACK -- remove this line before merging commit
        self.g = ConjunctiveGraph()
        self.g.parse(id, format='trig')

    def getID(self):
        return self.id

    def getShortID(self):
        """Identifier with the purl prefix stripped."""
        return self.id.replace(self.base, '')

    def getAuthors(self):
        """Sorted author list joined with LaTeX line breaks ("\\\\")."""
        authors = [o.toPython() for o in self.g.objects(subject=self.assertion, predicate=DCTERMS.author)]
        authors.sort()
        return ' \\\\ '.join(authors)

    def getTitle(self):
        # title here comes from rdfs:comment on the assertion
        return ', '.join([o.toPython() for o in self.g.objects(subject=self.assertion, predicate=RDFS.comment)])

    def getShortTitle(self):
        return ', '.join([o.toPython() for o in self.g.objects(subject=self.assertion, predicate=DCTERMS.title)])

    def getTopicDescription(self):
        """dcterms:description of the metric's foaf:primaryTopic.
        NOTE(review): reads the module-level `fairGraph`, not self.g —
        confirm that global is populated before use."""
        descs = []
        for o in self.g.objects(subject=self.id, predicate=FOAF.primaryTopic):
            # o should be fair:A1.1
            for o2 in fairGraph.objects(subject=o, predicate=DCTERMS.description):
                descs.append(o2.toPython())
        return ' '.join(descs)

    def getTopicTitle(self):
        """dcterms:title of the metric's foaf:primaryTopic.
        NOTE(review): reads the module-level `fairGraph`, not self.g."""
        descs = []
        for o in self.g.objects(subject=self.id, predicate=FOAF.primaryTopic):
            # o should be fair:A1.1
            for o2 in fairGraph.objects(subject=o, predicate=DCTERMS.title):
                descs.append(o2.toPython())
        return ' '.join(descs)

    def getMeasuring(self):
        # return fm:measuring
        return self.getFMPropertyValue(FM.measuring)

    def getRationale(self):
        # return fm:rationale
        return self.getFMPropertyValue(FM.rationale)

    def getRequirements(self):
        # return fm:requirements
        return self.getFMPropertyValue(FM.requirements)

    def getProcedure(self):
        # return fm:procedure
        return self.getFMPropertyValue(FM.procedure)

    def getValidation(self):
        # return fm:validation
        return self.getFMPropertyValue(FM.validation)

    def getRelevance(self):
        # return fm:relevance
        return self.getFMPropertyValue(FM.relevance)

    def getExamples(self):
        # return fm:examples
        return self.getFMPropertyValue(FM.examples)

    def getComments(self):
        # return fm:comments
        return self.getFMPropertyValue(FM.comments)

    def getFMPropertyLabel(self, property):
        """rdfs:label of FM[property] from the module-level fairTermGraph.
        NOTE(review): expects `property` to be a local-name string, unlike
        getFMPropertyValue which receives full URIRefs — confirm callers."""
        return ', '.join([o.toPython() for o in fairTermGraph.objects(subject=FM[property], predicate=RDFS['label'])])

    def getFMPropertyValue(self, property):
        """Comma-join all values of `property` on the assertion resource."""
        return ', '.join([o.toPython() for o in self.g.objects(subject=self.assertion, predicate=property)])
primer.add((myNS['pat'], myNS['age'], Literal(24))) # Now, with just that, lets see how the system # recorded *way* too many details about what # you just asserted as fact. # from pprint import pprint pprint(list(primer)) # just think .whatever((s, p, o)) # here we report on what we know pprint(list(primer.subjects())) pprint(list(primer.predicates())) pprint(list(primer.objects())) # and other things that make sense # what do we know about pat? pprint(list(primer.predicate_objects(myNS.pat))) # who is what age? pprint(list(primer.subject_objects(myNS.age))) # Okay, so lets now work with a bigger # dataset from the example, and start # with a fresh new graph. primer = ConjunctiveGraph()
class ManifestHelper(object):
    """Wrapper around an rdflib (Conjunctive)Graph that normalizes arbitrary
    string/URI/literal inputs through a URIHelper before adding, removing or
    querying triples.  '*' is accepted as a wildcard in query/delete methods.
    (Python 2 code: uses basestring/unicode/iteritems.)"""

    def __init__(self, uri=None):
        """uri -- optional identifier for the underlying graph."""
        self.uri = None
        if uri:
            self.uri = uri
        self.reset()

    def reset(self):
        """Discard any current graph and rebuild it with default namespaces."""
        self.g = None
        if self.uri:
            self.g = ConjunctiveGraph(identifier=self.uri)
        else:
            self.g = ConjunctiveGraph()
        self.namespaces = {}
        self.urihelper = URIHelper(self.namespaces)
        # add defaults
        for prefix, ns in NAMESPACES.iteritems():
            self.add_namespace(prefix, ns)

    def from_string(self, textfile, format="xml", encoding="utf-8"):
        """Reset the graph and load it from `textfile`.

        Fix: `format` is now passed by keyword — rdflib's Graph.parse is
        parse(source, publicID=None, format=...), so the original positional
        call assigned the format string to publicID and parsed with the
        default format instead.
        """
        self.reset()
        self.g.parse(textfile, format=format)
        return

    def _coerce_terms(self, s, p, o):
        """Shared helper: map '*' wildcards to None and coerce plain values
        into URIRef/Literal terms via the URIHelper."""
        if s == '*':
            s = None
        if p == '*':
            p = None
        if o == '*':
            o = None
        if not isinstance(s, (URIRef, BNode)) and s is not None:
            s = self.urihelper.get_uriref(s)
        if not isinstance(p, URIRef) and p is not None:
            p = self.urihelper.parse_uri(p)
        if not isinstance(o, (URIRef, Literal, BNode)) and o is not None:
            if not isinstance(o, basestring):
                o = unicode(o)
            o = self.urihelper.parse_uri(o, return_Literal_not_Exception=True)
        return s, p, o

    def triple_exists(self, s, p, o):
        """Return True iff at least one triple matches the (possibly
        wildcarded) pattern."""
        if type(self.g).__name__ not in ['ConjunctiveGraph', 'Graph']:
            return False
        s, p, o = self._coerce_terms(s, p, o)
        # any() short-circuits after the first match instead of counting all
        return any(True for _ in self.g.triples((s, p, o)))

    def list_objects(self, s, p):
        """Return all objects matching subject/predicate (wildcards allowed)."""
        if type(self.g).__name__ not in ['ConjunctiveGraph', 'Graph']:
            return []
        s, p, _ = self._coerce_terms(s, p, None)
        return list(self.g.objects(s, p))

    def add_triple(self, s, p, o):
        """Coerce raw terms (no wildcard handling here) and add the triple."""
        if not isinstance(s, (URIRef, BNode)):
            s = self.urihelper.get_uriref(s)
        if not isinstance(p, URIRef):
            p = self.urihelper.parse_uri(p)
        if not isinstance(o, (URIRef, Literal, BNode)):
            if not isinstance(o, basestring):
                o = unicode(o)
            o = self.urihelper.parse_uri(o, return_Literal_not_Exception=True)
        self.g.add((s, p, o))
        self.g.commit()
        return

    def add_namespace(self, prefix, uri):
        """Register and bind a namespace prefix (unicode-normalized).

        Raises TypeError when prefix or uri is not string-like.
        """
        if not isinstance(prefix, basestring):
            raise TypeError('Add namespace: prefix is not of type string or unicode')
        if not isinstance(uri, (URIRef, Namespace)):
            if not isinstance(uri, basestring):
                raise TypeError('Add namespace: namespace is not of type string or unicode')
        if not isinstance(prefix, unicode):
            prefix = unicode(prefix)
        if isinstance(uri, basestring) and not isinstance(uri, unicode):
            uri = unicode(uri)
        self.namespaces[prefix] = self.urihelper.get_namespace(uri)
        if prefix not in self.urihelper.namespaces:
            self.urihelper.namespaces[prefix] = self.urihelper.get_namespace(uri)
        self.g.bind(prefix, self.namespaces[prefix])
        return

    def del_namespace(self, prefix, ns):
        """Forget a locally-registered prefix.
        NOTE(review): `ns` is unused and the rdflib binding is not removed."""
        if prefix in self.namespaces:
            del self.namespaces[prefix]
        return

    def del_triple(self, s, p, o=None):
        """Remove all triples matching the (possibly wildcarded) pattern."""
        if type(self.g).__name__ not in ['ConjunctiveGraph', 'Graph']:
            return
        s, p, o = self._coerce_terms(s, p, o)
        self.g.remove((s, p, o))
        return

    def get_graph(self):
        return self.g

    def to_string(self, format="xml"):
        """Serialize the graph, or return an empty XML document when empty.
        NOTE(review): under Python 3, serialize(..., encoding=...) returns
        bytes so the "+ newline" concatenation would fail; this module
        targets Python 2."""
        if type(self.g).__name__ in ['ConjunctiveGraph', 'Graph'] and len(self.g) > 0:
            self.g.commit()
            ans_str = self.g.serialize(format=format, encoding="utf-8")+"\n"
            return ans_str
        else:
            return u'<?xml version="1.0" encoding="UTF-8"?>\n'
class RDFCrawler:
    """Recursive crawler that aggregates RDF from a root URI into a
    persistent Sleepycat-backed ConjunctiveGraph, restricted to a set of
    permitted domains."""

    logger = logging.getLogger(__name__)

    def __init__(self, uri, domains=None):
        """
        :param uri: root URI to start crawling.
        :param domains: optional iterable of permitted domains to crawl.

        Fix: the default argument used to be a shared mutable `set()` that
        was then mutated with .add(uri) below, leaking state across
        instances (and mutating caller-supplied sets); a fresh copy is built
        instead.
        """
        self.root = uri
        self.graph_route = 'graph_store_%s' % hash(self.root)
        self.graph = ConjunctiveGraph('Sleepycat')
        self.graph.open(self.graph_route, create=True)
        self._filter_domains = set(domains) if domains else set()
        self._filter_domains.add(uri)
        self.last_process_time = 0.0
        self.lock = RLock()

    def filter_uris(self, uri_list):
        """
        :param uri_list: list of URIs to be filtered.
        :return: URIs whose string form contains one of the permitted domains.
        """
        return [uri for uri in uri_list
                for match in self._filter_domains if match in str(uri)]

    def _has_context(self, graph, subject):
        """
        :param subject: the URIRef or URI to check if it has current context.
        :return: True if subject's context already holds parsed data.
        """
        return len(graph.get_context(self._get_context_id(subject))) > 1

    @staticmethod
    def _get_context_id(subject):
        """
        :param subject: URIRef or URI from which the get context id.
        :return: context id of the resource (any #fragment stripped).
            Example: http://www.example.org/#fragment -> http://www.example.org/
        """
        return str(subject).split('#')[0]

    def start(self):
        """Crawl from the root URI, replacing previously stored data."""
        self.lock.acquire(True)

        # Erase old graph.
        # Fix: the original iterated self.graph.quads() while removing from
        # the same store (mutation during iteration); a single wildcard
        # remove clears every context safely.
        self.graph.remove((None, None, None))

        # Crawl for data
        logging.info('Start crawling: %s' % self.root)
        start_time = time.time()
        self._crawl([self.root])
        end_time = time.time()

        self.last_process_time = end_time - start_time
        logging.info('Crawling complete after: %s seconds with %s predicates.'
                     % (self.last_process_time, len(self.graph)))
        self.lock.release()

    def _crawl(self, uri_list):
        """
        Recursive method that crawl RDF objects
        :param uri_list: list of URIs to crawl
        """
        if len(uri_list) > 0:
            for uri in uri_list:
                try:
                    # A few considerations about parsing params:
                    # publicID = uri due to redirection issues
                    # format = None due to default params use 'XML'
                    self.graph.parse(uri, publicID=uri, format=None)
                    logging.info('[OK]: %s' % uri)
                except Exception as e:
                    logging.info('[Error]: %s: %s' % (uri, e))

            # Check that there are context that remains without parsing
            objects = set([self._get_context_id(o)
                           for o in set(self.graph.objects(None, None))
                           if isinstance(o, URIRef)
                           and not self._has_context(self.graph, o)])
            self._crawl(self.filter_uris(objects))
class TabLinker(object):
    """Convert an annotated Excel workbook into RDF Data Cube style triples.

    Cells are classified by their named style ('TL Data', 'TL ColHeader',
    'TL HRowHeader', ...) and each type is translated into triples in
    `self.graph`; cell annotations (notes) go into `self.annotationGraph`.
    NOTE(review): this class uses Python 2 constructs (`unicode`,
    `urllib.quote`, indexable `dict.keys()`, `print` statement) — it is not
    Python 3 compatible as written.
    """
    # Base prefixes for minted resource / annotation URIs
    defaultNamespacePrefix = 'http://example.org/resource/'
    annotationsNamespacePrefix = 'http://example.org/annotation/'

    # Namespaces bound to the data graph
    namespaces = {
        'dcterms':Namespace('http://purl.org/dc/terms/'),
        'skos':Namespace('http://www.w3.org/2004/02/skos/core#'),
        'tablink':Namespace('http://example.org/ns#'),
        'qb':Namespace('http://purl.org/linked-data/cube#'),
        'owl':Namespace('http://www.w3.org/2002/07/owl#')
    }
    # Namespaces bound to the annotation graph
    annotationNamespaces = {
        'np':Namespace('http://www.nanopub.org/nschema#'),
        'oa':Namespace('http://www.w3.org/ns/openannotation/core/'),
        'xsd':Namespace('http://www.w3.org/2001/XMLSchema#'),
        'dct':Namespace('http://purl.org/dc/terms/')
    }

    def __init__(self, filename, config, level = logging.DEBUG):
        """TabLinker constructor

        Keyword arguments:
        filename -- String containing the name of the current Excel file being examined
        config -- Configuration object, loaded from .ini file
        level -- A logging level as defined in the logging module
        """
        self.config = config
        self.filename = filename
        self.log = logging.getLogger("TabLinker")
        self.log.setLevel(level)

        self.log.debug('Initializing Graphs')
        self.initGraphs()

        self.log.debug('Setting Scope')
        basename = os.path.basename(filename)
        # Scope is the workbook filename without its .xls extension
        basename = re.search('(.*)\.xls',basename).group(1)
        self.setScope(basename)

        self.log.debug('Loading Excel file {0}.'.format(filename))
        # formatting_info=True is required so cell styles (the TL markers)
        # are available via xlrd
        self.rb = open_workbook(filename, formatting_info=True)

        self.log.debug('Reading styles')
        self.styles = Styles(self.rb)

        self.log.debug('Copied Workbook to writable copy')
        self.wb = copy(self.rb)

    def initGraphs(self):
        """
        Initialize the graphs, set default namespaces, and add schema information
        """
        self.graph = ConjunctiveGraph()
        # Create a separate graph for annotations
        self.annotationGraph = ConjunctiveGraph()

        self.log.debug('Adding namespaces to graphs')
        # Bind namespaces to graphs
        for namespace in self.namespaces:
            self.graph.namespace_manager.bind(namespace, self.namespaces[namespace])

        # Same for annotation graph
        for namespace in self.annotationNamespaces:
            self.annotationGraph.namespace_manager.bind(namespace, self.annotationNamespaces[namespace])

        # Add schema information
        self.log.debug('Adding some schema information (dimension and measure properties) ')
        self.addDataCellProperty()

        # Add dimensions
        self.graph.add((self.namespaces['tablink']['dimension'], RDF.type, self.namespaces['qb']['DimensionProperty']))

        #self.graph.add((self.namespaces['tablink']['label'], RDF.type, RDF['Property']))

    def addDataCellProperty(self):
        """
        Add definition of data cell resource to graph
        """
        # Property name is configurable; falls back to 'hasValue'
        if len(self.config.get('dataCell', 'propertyName')) > 0 :
            self.dataCellPropertyName = self.config.get('dataCell', 'propertyName')
        else :
            self.dataCellPropertyName = 'hasValue'

        self.graph.add((self.namespaces['tablink'][self.dataCellPropertyName], RDF.type, self.namespaces['qb']['MeasureProperty']))

        #Take labels from config
        # Config format: "lang-->label:::lang-->label" (':::' separates labels,
        # '-->' separates the language tag from the label text)
        if len(self.config.get('dataCell', 'labels')) > 0 :
            labels = self.config.get('dataCell', 'labels').split(':::')
            for label in labels :
                labelProperties = label.split('-->')
                if len(labelProperties[0]) > 0 and len(labelProperties[1]) > 0 :
                    self.graph.add((self.namespaces['tablink'][self.dataCellPropertyName], RDFS.label, Literal(labelProperties[1],labelProperties[0])))

        if len(self.config.get('dataCell', 'literalType')) > 0 :
            self.graph.add((self.namespaces['tablink'][self.dataCellPropertyName], RDFS.range, URIRef(self.config.get('dataCell', 'literalType'))))

    def setScope(self, fileBasename):
        """Set the default namespace and base for all URIs of the current workbook"""
        self.fileBasename = fileBasename
        scopeNamespace = self.defaultNamespacePrefix + fileBasename + '/'

        # Annotations go to a different namespace
        annotationScopeNamespace = self.annotationsNamespacePrefix + fileBasename + '/'

        self.log.debug('Adding namespace for {0}: {1}'.format(fileBasename, scopeNamespace))

        self.namespaces['scope'] = Namespace(scopeNamespace)
        self.annotationNamespaces['scope'] = Namespace(annotationScopeNamespace)
        # Bind as the default ('') prefix of each graph
        self.graph.namespace_manager.bind('', self.namespaces['scope'])
        self.annotationGraph.namespace_manager.bind('', self.annotationNamespaces['scope'])

    def doLink(self):
        """Start tablinker for all sheets in workbook"""
        self.log.info('Starting TabLinker for all sheets in workbook')

        for n in range(self.rb.nsheets) :
            self.log.info('Starting with sheet {0}'.format(n))
            self.r_sheet = self.rb.sheet_by_index(n)
            self.w_sheet = self.wb.get_sheet(n)

            self.rowns, self.colns = self.getValidRowsCols()

            # Sheet name (whitespace replaced by '_') is the base of every
            # QName minted while parsing this sheet
            self.sheet_qname = urllib.quote(re.sub('\s','_',self.r_sheet.name))
            self.log.info('Base for QName generator set to: {0}'.format(self.sheet_qname))

            self.log.debug('Starting parser')
            self.parseSheet()

    ###
    #    Utility Functions
    ###

    def insideMergeBox(self, i, j):
        """
        Check if the specified cell is inside a merge box

        Arguments:
        i -- row
        j -- column

        Returns:
        True/False -- depending on whether the cell is inside a merge box
        """
        self.merged_cells = self.r_sheet.merged_cells
        for crange in self.merged_cells:
            rlo, rhi, clo, chi = crange
            # xlrd merge ranges are half-open on the hi side, hence rhi - 1
            if i <= rhi - 1 and i >= rlo and j <= chi - 1 and j >= clo:
                return True
        return False

    def getMergeBoxCoord(self, i, j):
        """
        Get the top-left corner cell of the merge box containing the specified cell

        Arguments:
        i -- row
        j -- column

        Returns:
        (k, l) -- Coordinates of the top-left corner of the merge box
        """
        if not self.insideMergeBox(i,j):
            # Sentinel for "not merged"
            return (-1, -1)

        self.merged_cells = self.r_sheet.merged_cells
        for crange in self.merged_cells:
            rlo, rhi, clo, chi = crange
            if i <= rhi - 1 and i >= rlo and j <= chi - 1 and j >= clo:
                return (rlo, clo)

    def getType(self, style):
        """Get type for a given excel style. Style name must be prefixed by 'TL '

        Arguments:
        style -- Style (string) to check type for

        Returns:
        String -- The type of this field. In case none is found, 'unknown'
        """
        typematch = re.search('TL\s(.*)',style)
        if typematch :
            cellType = typematch.group(1)
        else :
            cellType = 'Unknown'
        return cellType

    def isEmpty(self, i,j):
        """Check whether cell is empty.

        Arguments:
        i -- row
        j -- column

        Returns:
        True/False -- depending on whether the cell is empty
        """
        if (self.r_sheet.cell(i,j).ctype == XL_CELL_EMPTY or self.r_sheet.cell(i,j).ctype == XL_CELL_BLANK) or self.r_sheet.cell(i,j).value == '' :
            return True
        else :
            return False

    def isEmptyRow(self, i, colns):
        """
        Determine whether the row 'i' is empty by iterating over all its cells

        Arguments:
        i     -- The index of the row to be checked.
        colns -- The number of columns to be checked

        Returns:
        true  -- if the row is empty
        false -- if the row is not empty
        """
        for j in range(0,colns) :
            if not self.isEmpty(i,j):
                return False
        return True

    def isEmptyColumn(self, j, rowns ):
        """
        Determine whether the column 'j' is empty by iterating over all its cells

        Arguments:
        j     -- The index of the column to be checked.
        rowns -- The number of rows to be checked

        Returns:
        true  -- if the column is empty
        false -- if the column is not empty
        """
        for i in range(0,rowns) :
            if not self.isEmpty(i,j):
                return False
        return True

    def getValidRowsCols(self) :
        """
        Determine the number of non-empty rows and columns in the Excel sheet

        Returns:
        rowns -- number of rows
        colns -- number of columns
        """
        colns = number_of_good_cols(self.r_sheet)
        rowns = number_of_good_rows(self.r_sheet)

        # Check whether the number of good columns and rows are correct
        # (trim trailing empty rows/columns that xlrd still reports)
        while self.isEmptyRow(rowns-1, colns) :
            rowns = rowns - 1
        while self.isEmptyColumn(colns-1, rowns) :
            colns = colns - 1

        self.log.debug('Number of rows with content: {0}'.format(rowns))
        self.log.debug('Number of columns with content: {0}'.format(colns))
        return rowns, colns

    def getQName(self, names):
        """
        Create a valid QName from a string or dictionary of names

        Arguments:
        names -- Either dictionary of names or string of a name.

        Returns:
        qname -- a valid QName for the dictionary or string
        """
        if type(names) == dict :
            # NOTE(review): relies on dict iteration order for the minted
            # QName — deterministic per-dict in Py2 but not sorted.
            qname = self.sheet_qname
            for k in names :
                qname = qname + '_' + self.processString(names[k])
        else :
            qname = self.sheet_qname + '_' + self.processString(names)

        self.log.debug('Minted new QName: {}'.format(qname))
        return qname

    def getColHeaderLabel(self, colheaders):
        # Join the column-header hierarchy into a single label
        label = '_'.join(colheaders)
        return label

    def getColHeaderValueURI(self, colheaders):
        # URI in the workbook scope for a (possibly hierarchical) column header
        label = self.getColHeaderLabel(colheaders)
        uri = self.namespaces['scope'][self.processString(label)]
        return uri

    def getColHeaderPropertyURI(self, index):
        # Positional property URI for the index-th column header level
        uri = self.namespaces['scope']['HColHeader' + str(index)]
        return uri

    def processString(self, string):
        """
        Remove illegal characters (comma, brackets, etc) from string, and replace it with underscore. Useful for URIs

        Arguments:
        string -- The string representing the value of the source cell

        Returns:
        processedString -- The processed string
        """
        # TODO accents too
        return urllib.quote(re.sub('\s|\(|\)|,|\.','_',unicode(string).strip().replace('/', '-')).encode('utf-8', 'ignore'))

    def addValue(self, source_cell_value, altLabel=None):
        """
        Add a "value" + optional label to the graph for a cell in the source Excel sheet. The value is typically the value stored in the source cell itself, but may also be a copy of another cell (e.g. in the case of 'idem.').

        Arguments:
        source_cell_value -- The string representing the value of the source cell

        Returns:
        source_cell_value_qname -- a valid QName for the value of the source cell
        """
        source_cell_value_qname = self.getQName(source_cell_value)
        #self.graph.add((self.namespaces['scope'][source_cell_value_qname],self.namespaces['qb']['dataSet'],self.namespaces['scope'][self.sheet_qname]))
        #self.graph.add((self.namespaces['scope'][self.source_cell_qname],self.namespaces['tablink']['value'],self.namespaces['scope'][source_cell_value_qname]))

        # If the source_cell_value is actually a dictionary (e.g. in the case of HierarchicalRowHeaders), then use the last element of the row hierarchy as prefLabel
        # Otherwise just use the source_cell_value as prefLabel
        # NOTE(review): dict.values()[-1] is Python 2 only; labels are tagged 'nl' (Dutch)
        if type(source_cell_value) == dict :
            self.graph.add((self.namespaces['scope'][source_cell_value_qname],self.namespaces['skos'].prefLabel,Literal(source_cell_value.values()[-1],'nl')))

            if altLabel and altLabel != source_cell_value.values()[-1]:
                # If altLabel has a value (typically for HierarchicalRowHeaders) different from the last element in the row hierarchy, we add it as alternative label.
                self.graph.add((self.namespaces['scope'][source_cell_value_qname],self.namespaces['skos'].altLabel,Literal(altLabel,'nl')))
        else :
            self.graph.add((self.namespaces['scope'][source_cell_value_qname],self.namespaces['skos'].prefLabel,Literal(source_cell_value,'nl')))

            if altLabel and altLabel != source_cell_value:
                # If altLabel has a value (typically for HierarchicalRowHeaders) different from the source_cell_value, we add it as alternative label.
                self.graph.add((self.namespaces['scope'][source_cell_value_qname],self.namespaces['skos'].altLabel,Literal(altLabel,'nl')))

        return source_cell_value_qname

    def parseSheet(self):
        """
        Parses the currently selected sheet in the workbook, takes no arguments. Iterates over all cells in the Excel sheet and produces relevant RDF Triples.
        """
        self.log.info("Parsing {0} rows and {1} columns.".format(self.rowns,self.colns))

        # Per-sheet state shared by the parse* helpers
        self.column_dimensions = {}
        self.property_dimensions = {}
        self.row_dimensions = {}
        self.rowhierarchy = {}

        # Get dictionary of annotations
        self.annotations = self.r_sheet.cell_note_map

        for i in range(0,self.rowns):
            self.rowhierarchy[i] = {}

            for j in range(0, self.colns):
                # Parse cell data; these attributes are read by the
                # parse* helpers called below
                self.source_cell = self.r_sheet.cell(i,j)
                self.source_cell_name = cellname(i,j)
                self.style = self.styles[self.source_cell].name
                self.cellType = self.getType(self.style)
                self.source_cell_qname = self.getQName(self.source_cell_name)

                self.log.debug("({},{}) {}/{}: \"{}\"".format(i,j,self.cellType, self.source_cell_name, self.source_cell.value))

                # Try to parse ints to avoid ugly _0 URIs
                try:
                    if int(self.source_cell.value) == self.source_cell.value:
                        self.source_cell.value = int(self.source_cell.value)
                except ValueError:
                    self.log.debug("(%s.%s) No parseable int" % (i,j))

                # Parse annotation (if any)
                if self.config.get('annotations', 'enabled') == "1":
                    if (i,j) in self.annotations:
                        self.parseAnnotation(i, j)

                # Parse cell even if empty
                if self.cellType == 'Data':
                    self.parseData(i, j)
                elif (self.cellType == 'HRowHeader') :
                    self.updateRowHierarchy(i, j)
                elif self.cellType == 'ColHeader' :
                    self.parseColHeader(i, j)
                elif self.cellType == 'RowProperty' :
                    self.parseRowProperty(i, j)

                # If cell not empty, check for more types
                if not self.isEmpty(i,j) :
                    #self.graph.add((self.namespaces['scope'][self.source_cell_qname],RDF.type,self.namespaces['tablink'][self.cellType]))
                    #self.graph.add((self.namespaces['scope'][self.source_cell_qname],self.namespaces['tablink']['cell'],Literal(self.source_cell_name)))
                    #self.graph.add((self.namespaces['scope'][self.source_cell_qname],self.namespaces['tablink']['col'],Literal(colname(j))))
                    #self.graph.add((self.namespaces['scope'][self.source_cell_qname],self.namespaces['tablink']['row'],Literal(i+1)))
                    #self.graph.add((self.namespaces['scope'][self.source_cell_qname] isrow row
                    if self.cellType == 'Title' :
                        self.parseTitle(i, j)
                    elif self.cellType == 'RowHeader' :
                        self.parseRowHeader(i, j)
                    elif self.cellType == 'HRowHeader' :
                        self.parseHierarchicalRowHeader(i, j)
                    elif self.cellType == 'RowLabel' :
                        self.parseRowLabel(i, j)

        # Add additional information about the hierarchy of column headers
        for value in self.column_dimensions.values():
            for index in range(1, len(value)):
                uri_sub = self.getColHeaderValueURI(value[:index+1])
                uri_top = self.getColHeaderValueURI(value[:index])
                self.graph.add((uri_sub, self.namespaces['tablink']['subColHeaderOf'], uri_top))
                self.graph.add((uri_sub, self.namespaces['tablink']['depth'], Literal(index)))
                self.graph.add((uri_top, self.namespaces['tablink']['depth'], Literal(index-1)))

        self.log.info("Done parsing...")

    def updateRowHierarchy(self, i, j) :
        """
        Build up lists for hierarchical row headers. Cells marked as hierarchical row header are often empty meaning that their intended value is stored somewhere else in the Excel sheet.

        Keyword arguments:
        int i -- row number
        int j -- col number

        Returns:
        New row hierarchy dictionary
        """
        if (self.isEmpty(i,j) or str(self.source_cell.value).lower().strip() == 'id.') :
            # If the cell is empty, and a HierarchicalRowHeader, add the value of the row header above it.
            # If the cell above is not in the rowhierarchy, don't do anything.
            # If the cell is exactly 'id.', add the value of the row header above it.
            try :
                self.rowhierarchy[i][j] = self.rowhierarchy[i-1][j]
                self.log.debug("({},{}) Copied from above\nRow hierarchy: {}".format(i,j,self.rowhierarchy[i]))
            except :
                # REMOVED because of double slashes in uris
                # self.rowhierarchy[i][j] = self.source_cell.value
                self.log.debug("({},{}) Top row, added nothing\nRow hierarchy: {}".format(i,j,self.rowhierarchy[i]))
        elif str(self.source_cell.value).lower().startswith('id.') or str(self.source_cell.value).lower().startswith('id '):
            # If the cell starts with 'id.', add the value of the row above it, and append the rest of the cell's value.
            suffix = self.source_cell.value[3:]
            try :
                self.rowhierarchy[i][j] = self.rowhierarchy[i-1][j]+suffix
                self.log.debug("({},{}) Copied from above+suffix\nRow hierarchy {}".format(i,j,self.rowhierarchy[i]))
            except :
                self.rowhierarchy[i][j] = self.source_cell.value
                self.log.debug("({},{}) Top row, added value\nRow hierarchy {}".format(i,j,self.rowhierarchy[i]))
        elif not self.isEmpty(i,j) :
            self.rowhierarchy[i][j] = self.source_cell.value
            self.log.debug("({},{}) Added value\nRow hierarchy {}".format(i,j,self.rowhierarchy[i]))
        return self.rowhierarchy

    def parseHierarchicalRowHeader(self, i, j) :
        """
        Create relevant triples for the cell marked as HierarchicalRowHeader (i, j are row and column)
        """
        # Use the rowhierarchy to create a unique qname for the cell's contents,
        # give the source_cell's original value as extra argument
        self.log.debug("Parsing HierarchicalRowHeader")

        # Add all the values
        for (index, value) in self.rowhierarchy[i].items():
            prop = self.property_dimensions[index]
            self.row_dimensions.setdefault(i,{})
            self.row_dimensions[i][self.namespaces['scope'][prop]]= Literal(value)

        # Relate the hierarchical headers
        # NOTE(review): the loop variable shadows the row parameter `i`, and
        # indexing dict.keys() is Python 2 only — confirm before porting.
        keys = self.rowhierarchy[i].keys()
        for i in range(len(keys)-1):
            prop_top = self.namespaces['scope'][self.property_dimensions[keys[i]]]
            prop_sub = self.namespaces['scope'][self.property_dimensions[keys[i+1]]]
            self.graph.add((prop_sub, self.namespaces['tablink']['subPropertyOf'], prop_top))

    def parseRowLabel(self, i, j):
        """
        Create relevant triples for the cell marked as Label (i, j are row and column)
        """
        self.log.debug("Parsing Row Label")

        # Get the QName of the HierarchicalRowHeader cell that this label belongs to, based on the rowhierarchy for this row (i)
        hierarchicalRowHeader_value_qname = self.getQName(self.rowhierarchy[i])

        prefLabels = self.graph.objects(self.namespaces['scope'][hierarchicalRowHeader_value_qname], self.namespaces['skos'].prefLabel)
        for label in prefLabels :
            # If the hierarchicalRowHeader QName already has a preferred label, turn it into a skos:altLabel
            self.graph.remove((self.namespaces['scope'][hierarchicalRowHeader_value_qname],self.namespaces['skos'].prefLabel,label))
            self.graph.add((self.namespaces['scope'][hierarchicalRowHeader_value_qname],self.namespaces['skos'].altLabel,label))
            self.log.debug("Turned skos:prefLabel {} for {} into a skos:altLabel".format(label, hierarchicalRowHeader_value_qname))

        # Add the value of the label cell as skos:prefLabel to the header cell
        # self.graph.add((self.namespaces['scope'][hierarchicalRowHeader_value_qname], self.namespaces['skos'].prefLabel, Literal(self.source_cell.value, 'nl')))

        # Record that this source_cell_qname is the label for the HierarchicalRowHeader cell
        # self.graph.add((self.namespaces['scope'][self.source_cell_qname], self.namespaces['tablink']['isLabel'], self.namespaces['scope'][hierarchicalRowHeader_value_qname]))

    def parseRowHeader(self, i, j) :
        """
        Create relevant triples for the cell marked as RowHeader (i, j are row and column)
        """
        rowHeaderValue = ""

        # Don't attach the cell value to the namespace if it's already a URI
        isURI = urlparse(str(self.source_cell.value))
        if isURI.scheme and isURI.netloc:
            rowHeaderValue = URIRef(self.source_cell.value)
        else:
            self.source_cell_value_qname = self.source_cell.value
            rowHeaderValue = Literal(self.source_cell_value_qname)

        # Get the properties to use for the row headers
        prop = self.property_dimensions[j]
        self.row_dimensions.setdefault(i,{})
        self.row_dimensions[i][self.namespaces['scope'][prop]]= rowHeaderValue

        return

    def parseColHeader(self, i, j) :
        """
        Create relevant triples for the cell marked as Header (i, j are row and column)
        """
        cell_content = self.processString(self.source_cell.value)
        if self.isEmpty(i,j):
            if self.insideMergeBox(i,j):
                k, l = self.getMergeBoxCoord(i,j)

                # If we are in a vertical merge box, skip adding the dimension
                if l == j:
                    return

                # Update cell content
                cell_content = self.processString(self.r_sheet.cell(k,l).value)
            else:
                return

        # Add the value qname to the column_dimensions list for that column
        self.column_dimensions.setdefault(j,[self.sheet_qname]).append(cell_content)

        # Add the data to the graph
        resource = self.getColHeaderValueURI(self.column_dimensions[j])
        self.graph.add((resource, RDF.type, self.namespaces['tablink']['ColumnHeader']))
        self.graph.add((resource, self.namespaces['skos']['prefLabel'], Literal(cell_content)))
        self.graph.add((resource, self.namespaces['tablink']['cell'], Literal(self.source_cell_name)))
        return

    def parseRowProperty(self, i, j) :
        """
        Create relevant triples for the cell marked as Property (i, j are row and column)
        """
        if self.isEmpty(i,j):
            if self.insideMergeBox(i,j):
                k, l = self.getMergeBoxCoord(i,j)
                self.source_cell_value_qname = self.addValue(self.r_sheet.cell(k,l).value)
            else:
                return
        else:
            self.source_cell_value_qname = self.addValue(self.source_cell.value)

        #self.graph.add((self.namespaces['scope'][self.source_cell_qname],self.namespaces['tablink']['isDimensionProperty'],self.namespaces['scope'][self.source_cell_value_qname]))
        #self.graph.add((self.namespaces['scope'][self.source_cell_value_qname],RDF.type,self.namespaces['qb']['DimensionProperty']))
        #self.graph.add((self.namespaces['scope'][self.source_cell_value_qname],RDF.type,RDF['Property']))

        #self.property_dimensions.setdefault(j,[]).append(self.source_cell_value_qname)
        self.property_dimensions[j] = self.source_cell_value_qname

        # Add to graph
        resource = self.namespaces['scope'][self.property_dimensions[j]]
        self.graph.add((resource, RDF.type, self.namespaces['tablink']['RowProperty']))

        return

    def parseTitle(self, i, j) :
        """
        Create relevant triples for the cell marked as Title (i, j are row and column)
        """
        self.graph.add((self.namespaces['scope'][self.sheet_qname],
                        self.namespaces['tablink']['title'],
                        Literal(self.source_cell.value)))

        return

    def parseData(self, i,j) :
        """
        Create relevant triples for the cell marked as Data (i, j are row and column)
        """
        # Skip empty data cells unless implicit zeros are enabled in config
        if self.isEmpty(i,j) and self.config.get('dataCell', 'implicitZeros') == '0':
            return

        # Use the fully qualified name of the cell for the resource name
        observation = self.namespaces['scope'][self.source_cell_qname]

        # It's an observation
        self.graph.add((observation,
                        RDF.type,
                        self.namespaces['qb']['Observation']))

        # It's in the data set defined by the current sheet
        self.graph.add((observation,
                        self.namespaces['qb']['dataSet'],
                        self.namespaces['scope'][self.sheet_qname]))

        # Add it's value
        # TODO type the value
        if self.isEmpty(i,j) and self.config.get('dataCell', 'implicitZeros') == '1':
            self.graph.add((observation,
                            self.namespaces['scope'][self.dataCellPropertyName],
                            Literal(0)))
        else:
            self.graph.add((observation,
                            self.namespaces['scope'][self.dataCellPropertyName],
                            Literal(self.source_cell.value)))

        # Use the row dimensions dictionary to find the properties that link
        # data values to row headers
        try :
            for (prop, value) in self.row_dimensions[i].iteritems() :
                self.graph.add((observation, prop, value))
        except KeyError :
            self.log.debug("({}.{}) No row dimension for cell".format(i,j))

        # Use the column dimensions dictionary to find the objects of the
        # d2s:dimension property
        self.graph.add((observation,
                        self.namespaces['tablink']['dimension'],
                        self.getColHeaderValueURI(self.column_dimensions[j])))

    def parseAnnotation(self, i, j) :
        """
        Create relevant triples for the annotation attached to cell (i, j)
        """
        if self.config.get('annotations', 'model') == 'oa':
            # Create triples according to Open Annotation model
            body = BNode()

            self.annotationGraph.add((self.annotationNamespaces['scope'][self.source_cell_qname],
                                      RDF.type,
                                      self.annotationNamespaces['oa']['Annotation']
                                      ))
            self.annotationGraph.add((self.annotationNamespaces['scope'][self.source_cell_qname],
                                      self.annotationNamespaces['oa']['hasBody'],
                                      body
                                      ))
            # Normalize line endings inside the note text before storing it
            self.annotationGraph.add((body,
                                      RDF.value,
                                      Literal(self.annotations[(i,j)].text.replace("\n", " ").replace("\r", " ").replace("\r\n", " ").encode('utf-8'))
                                      ))
            self.annotationGraph.add((self.annotationNamespaces['scope'][self.source_cell_qname],
                                      self.annotationNamespaces['oa']['hasTarget'],
                                      self.namespaces['scope'][self.source_cell_qname]
                                      ))
            self.annotationGraph.add((self.annotationNamespaces['scope'][self.source_cell_qname],
                                      self.annotationNamespaces['oa']['annotator'],
                                      Literal(self.annotations[(i,j)].author.encode('utf-8'))
                                      ))
            # Annotation date is approximated by the file's mtime
            self.annotationGraph.add((self.annotationNamespaces['scope'][self.source_cell_qname],
                                      self.annotationNamespaces['oa']['annotated'],
                                      Literal(datetime.datetime.fromtimestamp(os.path.getmtime(self.filename)).strftime("%Y-%m-%d"),datatype=self.annotationNamespaces['xsd']['date'])
                                      ))
            self.annotationGraph.add((self.annotationNamespaces['scope'][self.source_cell_qname],
                                      self.annotationNamespaces['oa']['generator'],
                                      URIRef("https://github.com/Data2Semantics/TabLinker")
                                      ))
            self.annotationGraph.add((self.annotationNamespaces['scope'][self.source_cell_qname],
                                      self.annotationNamespaces['oa']['generated'],
                                      Literal(datetime.datetime.now().strftime("%Y-%m-%d"), datatype=self.annotationNamespaces['xsd']['date'])
                                      ))
            self.annotationGraph.add((self.annotationNamespaces['scope'][self.source_cell_qname],
                                      self.annotationNamespaces['oa']['modelVersion'],
                                      URIRef("http://www.openannotation.org/spec/core/20120509.html")
                                      ))
        else:
            # Create triples according to Nanopublications model
            print "Nanopublications not implemented yet!"
class CondorFastqExtract(object):
    """Generate Condor submit scripts that rebuild fastq files from an
    archived-sequence RDF model (srf / qseq / split-fastq conversions)."""

    def __init__(self, host, sequences_path,
                 log_path='log',
                 model=None,
                 compression=None,
                 force=False):
        """Extract fastqs from results archive

        Args:
          host (str): root of the htsworkflow api server
          apidata (dict): id & key to post to the server
          sequences_path (str): root of the directory tree to scan for files
          log_path (str): where to put condor log files
          compression (str): one of 'gzip', 'bzip2'
          force (bool): do we force overwriting current files?
        """
        self.host = host
        # Allow callers to share an existing RDF model; otherwise start empty
        if model is None:
            self.model = ConjunctiveGraph()
        else:
            self.model = model

        self.sequences_path = sequences_path
        self.log_path = log_path
        self.compression=compression
        self.force = force
        LOGGER.info("CondorFastq host={0}".format(self.host))
        LOGGER.info("CondorFastq sequences_path={0}".format(self.sequences_path))
        LOGGER.info("CondorFastq log_path={0}".format(self.log_path))
        LOGGER.info("Compression {0}".format(self.compression))

    def create_scripts(self, result_map ):
        """
        Generate condor scripts to build any needed fastq files

        Args:
          result_map: htsworkflow.submission.results.ResultMap()
        """
        # one template per conversion type; rendered to <type>.condor files
        template_map = {'srf': 'srf.condor',
                        'qseq': 'qseq.condor',
                        'split_fastq': 'split_fastq.condor',
                        }

        env = None
        pythonpath = os.environ.get('PYTHONPATH', None)
        if pythonpath is not None:
            # Propagate the caller's PYTHONPATH into the condor jobs
            env = "PYTHONPATH=%s" % (pythonpath,)
        condor_entries = self.build_condor_arguments(result_map)
        for script_type in template_map.keys():
            template = loader.get_template(template_map[script_type])
            context = {'python': sys.executable,
                       'logdir': self.log_path,
                       'env': env,
                       'args': condor_entries[script_type],
                       'root_url': self.host,
                       }

            with open(script_type + '.condor','w+') as outstream:
                outstream.write(template.render(context))

    def build_condor_arguments(self, result_map):
        # Returns dict: conversion type -> list of per-target argument dicts
        condor_entries = {'srf': [],
                          'qseq': [],
                          'split_fastq': []}

        conversion_funcs = {'srf': self.condor_srf_to_fastq,
                            'qseq': self.condor_qseq_to_fastq,
                            'split_fastq': self.condor_desplit_fastq
                            }
        sequences = self.find_archive_sequence_files(result_map)
        needed_targets = self.update_fastq_targets(result_map, sequences)

        for target_pathname, available_sources in needed_targets.items():
            LOGGER.debug(' target : %s' % (target_pathname,))
            LOGGER.debug(' candidate sources: %s' % (available_sources,))
            for condor_type in available_sources.keys():
                conversion = conversion_funcs.get(condor_type, None)
                if conversion is None:
                    errmsg = "Unrecognized type: {0} for {1}"
                    LOGGER.error(errmsg.format(condor_type,
                                               pformat(available_sources)))
                    continue
                sources = available_sources.get(condor_type, None)

                if sources is not None:
                    condor_entries.setdefault(condor_type, []).append(
                        conversion(sources, target_pathname))
                else:
                    LOGGER.warning(" need file %s", target_pathname)

        return condor_entries

    def find_archive_sequence_files(self, result_map):
        """
        Find archived sequence files associated with our results.
        """
        self.import_libraries(result_map)
        flowcell_ids = self.find_relevant_flowcell_ids()
        self.import_sequences(flowcell_ids)

        # SPARQL over the imported model: one row per non-sequencer-result
        # file, joined with its flowcell metadata
        query_text = """
        prefix libns: <http://jumpgate.caltech.edu/wiki/LibraryOntology#>
        prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
        prefix xsd: <http://www.w3.org/2001/XMLSchema#>

        select ?filenode ?filetype ?cycle ?lane_number ?read
               ?library  ?library_id
               ?flowcell ?flowcell_id ?read_length
               ?flowcell_type ?flowcell_status
        where {
            ?filenode libns:cycle ?cycle ;
                      libns:lane_number ?lane_number ;
                      libns:read ?read ;
                      libns:flowcell ?flowcell ;
                      libns:flowcell_id ?flowcell_id ;
                      libns:library ?library ;
                      libns:library_id ?library_id ;
                      libns:file_type ?filetype ;
                      a libns:IlluminaResult .
            ?flowcell libns:read_length ?read_length ;
                      libns:flowcell_type ?flowcell_type .
            OPTIONAL { ?flowcell libns:flowcell_status ?flowcell_status }
            FILTER(?filetype != libns:sequencer_result)
        }
        """
        LOGGER.debug("find_archive_sequence_files query: %s",
                     query_text)
        results = []
        for r in self.model.query(query_text):
            library_id = r['library_id'].toPython()
            # Only keep files belonging to libraries we were asked about
            if library_id in result_map:
                seq = SequenceResult(r)
                LOGGER.debug("Creating sequence result for library %s: %s",
                             library_id,
                             repr(seq))
                results.append(seq)
        return results

    def import_libraries(self, result_map):
        # Pull library metadata for every requested library id
        for lib_id in result_map.keys():
            liburl = urljoin(self.host, 'library/%s/' % (lib_id,))
            library = URIRef(liburl)
            self.import_library(library)

    def import_library(self, library):
        """Import library data into our model if we don't have it already
        """
        q = (library, RDF['type'], libraryOntology['Library'])
        present = False
        # NOTE(review): `present` is True when the library was NOT already in
        # the model, i.e. it records "did we fetch it now".
        if q not in self.model:
            present = True
            self.model.parse(source=library, format='rdfa')
        LOGGER.debug("Did we import %s: %s", str(library), present)

    def find_relevant_flowcell_ids(self):
        """Generate set of flowcell ids that had samples of interest on them
        """
        flowcell_query = """
prefix libns: <http://jumpgate.caltech.edu/wiki/LibraryOntology#>

select distinct ?flowcell ?flowcell_id
WHERE {
  ?library a libns:Library ;
           libns:has_lane ?lane .
  ?lane libns:flowcell ?flowcell .
  ?flowcell libns:flowcell_id ?flowcell_id .
}"""
        flowcell_ids = set()
        for r in self.model.query(flowcell_query):
            flowcell_ids.add(r['flowcell_id'].toPython())
            imported = False
            a_lane = list(self.model.objects(r['flowcell'],
                                             libraryOntology['has_lane']))
            if len(a_lane) == 0:
                imported = True
                # we lack information about which lanes were on this flowcell
                self.model.parse(r['flowcell'], format='rdfa')
            LOGGER.debug("Did we imported %s: %s" % (str(r['flowcell']),
                                                     imported))

        return flowcell_ids

    def import_sequences(self, flowcell_ids):
        # Scan one directory per flowcell and record found sequences in the model
        seq_dirs = []
        for f in flowcell_ids:
            seq_dirs.append(os.path.join(self.sequences_path, str(f)))
        sequences = scan_for_sequences(seq_dirs)
        for seq in sequences:
            seq.save_to_model(self.model, self.host)
        update_model_sequence_library(self.model, self.host)

    def update_fastq_targets(self, result_map, raw_files):
        """Return list of fastq files that need to be built.

        Also update model with link between illumina result files
        and our target fastq file.
        """
        # NOTE(review): actually returns a dict
        # {target_pathname: {filetype: [SequenceResult, ...]}}
        # find what targets we're missing
        needed_targets = {}
        for seq in raw_files:
            if not seq.isgood:
                continue
            filename_attributes = {
                'flowcell': seq.flowcell_id,
                'lib_id': seq.library_id,
                'lane': seq.lane_number,
                'read': seq.read,
                'cycle': seq.cycle,
                'compression_extension': COMPRESSION_EXTENSIONS[self.compression],
                'is_paired': seq.ispaired
            }

            fqName = FastqName(**filename_attributes)

            result_dir = result_map[seq.library_id]
            target_pathname = os.path.join(result_dir, fqName.filename)
            # Only (re)build targets that are missing, unless forced
            if self.force or not os.path.exists(target_pathname):
                t = needed_targets.setdefault(target_pathname, {})
                t.setdefault(seq.filetype, []).append(seq)
            self.add_target_source_links(target_pathname, seq)
        return needed_targets

    def add_target_source_links(self, target, seq):
        """Add link between target pathname and the 'lane' that produced it
        (note lane objects are now post demultiplexing.)
        """
        target_uri = 'file://' + smart_str(target)
        target_node = URIRef(target_uri)
        source_stmt = (target_node, DC['source'], seq.filenode)
        self.model.add(source_stmt)

    def condor_srf_to_fastq(self, sources, target_pathname):
        # srf conversion accepts exactly one source file
        if len(sources) > 1:
            raise ValueError("srf to fastq can only handle one file")

        mid_point = None
        # Special case: this flowcell's reads must be split at cycle 76
        if sources[0].flowcell_id == '30DY0AAXX':
            mid_point = 76

        return {
            'sources': [sources[0].path],
            'pyscript': srf2fastq.__file__,
            'flowcell': sources[0].flowcell_id,
            'ispaired': sources[0].ispaired,
            'target': target_pathname,
            'target_right': target_pathname.replace('_r1.fastq', '_r2.fastq'),
            'mid': mid_point,
            'force': self.force,
        }

    def condor_qseq_to_fastq(self, sources, target_pathname):
        paths = []
        for source in sources:
            paths.append(source.path)
        paths.sort()
        compression_argument = self.format_compression_flag()

        return {
            'pyscript': qseq2fastq.__file__,
            'flowcell': sources[0].flowcell_id,
            'target': target_pathname,
            'compression': compression_argument,
            'sources': paths,
            'ispaired': sources[0].ispaired,
            'istar': len(sources) == 1,
        }

    def condor_desplit_fastq(self, sources, target_pathname):
        paths = []
        for source in sources:
            paths.append(source.path)
        paths.sort()
        compression_argument = self.format_compression_flag()
        return {
            'pyscript': desplit_fastq.__file__,
            'target': target_pathname,
            'compression': compression_argument,
            'sources': paths,
            'ispaired': sources[0].ispaired,
        }

    def format_compression_flag(self):
        # e.g. compression='gzip' -> '--gzip'; empty string when disabled
        return '--'+self.compression if self.compression else ''
class TabLinker(object):
    """Convert an annotated Excel workbook into RDF (Data Cube flavoured).

    Cells are classified by their Excel style name ('TL <type>'); each
    recognised type (Data, ColHeader, RowHeader, HRowHeader, RowProperty,
    RowLabel, Title) produces its own triples in `self.graph`, while cell
    comments go to the separate `self.annotationGraph`.

    NOTE(review): Python 2 only — uses `urllib.quote`, `unicode`,
    `dict.values()` indexing and a `print` statement.
    """

    # Base URI prefixes for minted resources and for annotations.
    defaultNamespacePrefix = "http://lod.cedar-project.nl/resource/"
    annotationsNamespacePrefix = "http://lod.cedar-project.nl/annotations/"

    # Namespaces bound on the data graph.
    namespaces = {
        "dcterms": Namespace("http://purl.org/dc/terms/"),
        "skos": Namespace("http://www.w3.org/2004/02/skos/core#"),
        "d2s": Namespace("http://lod.cedar-project.nl/core/"),
        "qb": Namespace("http://purl.org/linked-data/cube#"),
        "owl": Namespace("http://www.w3.org/2002/07/owl#"),
    }
    # Namespaces bound on the annotation graph.
    annotationNamespaces = {
        "np": Namespace("http://www.nanopub.org/nschema#"),
        "oa": Namespace("http://www.w3.org/ns/openannotation/core/"),
        "xsd": Namespace("http://www.w3.org/2001/XMLSchema#"),
        "dct": Namespace("http://purl.org/dc/terms/"),
    }

    def __init__(self, filename, config, level=logging.DEBUG):
        """TabLinker constructor

        Keyword arguments:
        filename -- String containing the name of the current Excel file being examined
        config -- Configuration object, loaded from .ini file
        level -- A logging level as defined in the logging module
        """
        self.config = config
        self.filename = filename
        self.log = logging.getLogger("TabLinker")
        self.log.setLevel(level)

        self.log.debug("Initializing Graphs")
        self.initGraphs()

        self.log.debug("Setting Scope")
        basename = os.path.basename(filename)
        # Strip the .xls extension to obtain the scope base name.
        basename = re.search("(.*)\.xls", basename).group(1)
        self.setScope(basename)

        self.log.debug("Loading Excel file {0}.".format(filename))
        # formatting_info=True is required so style names survive loading.
        self.rb = open_workbook(filename, formatting_info=True)

        self.log.debug("Reading styles")
        self.styles = Styles(self.rb)

        self.log.debug("Copied Workbook to writable copy")
        self.wb = copy(self.rb)

    def initGraphs(self):
        """Initialize the graphs, set default namespaces, and add schema information"""
        self.graph = ConjunctiveGraph()
        # Create a separate graph for annotations
        self.annotationGraph = ConjunctiveGraph()

        self.log.debug("Adding namespaces to graphs")
        # Bind namespaces to graphs
        for namespace in self.namespaces:
            self.graph.namespace_manager.bind(namespace, self.namespaces[namespace])
        # Same for annotation graph
        for namespace in self.annotationNamespaces:
            self.annotationGraph.namespace_manager.bind(namespace, self.annotationNamespaces[namespace])

        self.log.debug("Adding some schema information (dimension and measure properties) ")
        self.addDataCellProperty()
        self.graph.add((self.namespaces["d2s"]["dimension"], RDF.type, self.namespaces["qb"]["DimensionProperty"]))
        self.graph.add((self.namespaces["d2s"]["label"], RDF.type, RDF["Property"]))

    def addDataCellProperty(self):
        """Add definition of data cell resource to graph"""
        # Property name comes from the config, defaulting to 'hasValue'.
        if len(self.config.get("dataCell", "propertyName")) > 0:
            self.dataCellPropertyName = self.config.get("dataCell", "propertyName")
        else:
            self.dataCellPropertyName = "hasValue"

        self.graph.add(
            (self.namespaces["d2s"][self.dataCellPropertyName], RDF.type, self.namespaces["qb"]["MeasureProperty"])
        )

        # Take labels from config
        # Config format: 'lang-->text' entries joined by ':::'.
        if len(self.config.get("dataCell", "labels")) > 0:
            labels = self.config.get("dataCell", "labels").split(":::")
            for label in labels:
                labelProperties = label.split("-->")
                if len(labelProperties[0]) > 0 and len(labelProperties[1]) > 0:
                    self.graph.add(
                        (
                            self.namespaces["d2s"][self.dataCellPropertyName],
                            RDFS.label,
                            Literal(labelProperties[1], labelProperties[0]),
                        )
                    )

        if len(self.config.get("dataCell", "literalType")) > 0:
            self.graph.add(
                (
                    self.namespaces["d2s"][self.dataCellPropertyName],
                    RDFS.range,
                    URIRef(self.config.get("dataCell", "literalType")),
                )
            )

    def setScope(self, fileBasename):
        """Set the default namespace and base for all URIs of the current workbook"""
        self.fileBasename = fileBasename
        scopeNamespace = self.defaultNamespacePrefix + fileBasename + "/"

        # Annotations go to a different namespace
        annotationScopeNamespace = self.annotationsNamespacePrefix + fileBasename + "/"

        self.log.debug("Adding namespace for {0}: {1}".format(fileBasename, scopeNamespace))

        self.namespaces["scope"] = Namespace(scopeNamespace)
        self.annotationNamespaces["scope"] = Namespace(annotationScopeNamespace)
        # Bind the scope namespaces as the default ('') prefix on each graph.
        self.graph.namespace_manager.bind("", self.namespaces["scope"])
        self.annotationGraph.namespace_manager.bind("", self.annotationNamespaces["scope"])

    def doLink(self):
        """Start tablinker for all sheets in workbook"""
        self.log.info("Starting TabLinker for all sheets in workbook")

        for n in range(self.rb.nsheets):
            self.log.debug("Starting with sheet {0}".format(n))
            self.r_sheet = self.rb.sheet_by_index(n)
            self.w_sheet = self.wb.get_sheet(n)

            self.rowns, self.colns = self.getValidRowsCols()

            # Sheet name (whitespace replaced by '_') seeds all QNames.
            self.sheet_qname = urllib.quote(re.sub("\s", "_", self.r_sheet.name))
            self.log.debug("Base for QName generator set to: {0}".format(self.sheet_qname))

            self.log.debug("Starting parser")
            self.parseSheet()

    ###
    #    Utility Functions
    ###

    def insideMergeBox(self, i, j):
        """
        Check if the specified cell is inside a merge box

        Arguments:
        i -- row
        j -- column

        Returns:
        True/False -- depending on whether the cell is inside a merge box
        """
        self.merged_cells = self.r_sheet.merged_cells
        for crange in self.merged_cells:
            rlo, rhi, clo, chi = crange
            # xlrd ranges are half-open on the high end, hence the -1.
            if i <= rhi - 1 and i >= rlo and j <= chi - 1 and j >= clo:
                return True
        return False

    def getMergeBoxCoord(self, i, j):
        """
        Get the top-left corner cell of the merge box containing the specified cell

        Arguments:
        i -- row
        j -- column

        Returns:
        (k, l) -- Coordinates of the top-left corner of the merge box
        """
        if not self.insideMergeBox(i, j):
            return (-1, -1)

        self.merged_cells = self.r_sheet.merged_cells
        for crange in self.merged_cells:
            rlo, rhi, clo, chi = crange
            if i <= rhi - 1 and i >= rlo and j <= chi - 1 and j >= clo:
                return (rlo, clo)

    def getType(self, style):
        """Get type for a given excel style. Style name must be prefixed by 'TL '

        Arguments:
        style -- Style (string) to check type for

        Returns:
        String -- The type of this field. In case none is found, 'unknown'
        """
        typematch = re.search("TL\s(.*)", style)
        if typematch:
            cellType = typematch.group(1)
        else:
            cellType = "Unknown"
        return cellType

    def isEmpty(self, i, j):
        """Check whether cell is empty.

        Arguments:
        i -- row
        j -- column

        Returns:
        True/False -- depending on whether the cell is empty
        """
        if (
            self.r_sheet.cell(i, j).ctype == XL_CELL_EMPTY or self.r_sheet.cell(i, j).ctype == XL_CELL_BLANK
        ) or self.r_sheet.cell(i, j).value == "":
            return True
        else:
            return False

    def isEmptyRow(self, i, colns):
        """
        Determine whether the row 'i' is empty by iterating over all its cells

        Arguments:
        i     -- The index of the row to be checked.
        colns -- The number of columns to be checked

        Returns:
        true  -- if the row is empty
        false -- if the row is not empty
        """
        for j in range(0, colns):
            if not self.isEmpty(i, j):
                return False
        return True

    def isEmptyColumn(self, j, rowns):
        """
        Determine whether the column 'j' is empty by iterating over all its cells

        Arguments:
        j     -- The index of the column to be checked.
        rowns -- The number of rows to be checked

        Returns:
        true  -- if the column is empty
        false -- if the column is not empty
        """
        for i in range(0, rowns):
            if not self.isEmpty(i, j):
                return False
        return True

    def getValidRowsCols(self):
        """
        Determine the number of non-empty rows and columns in the Excel sheet

        Returns:
        rowns -- number of rows
        colns -- number of columns
        """
        colns = number_of_good_cols(self.r_sheet)
        rowns = number_of_good_rows(self.r_sheet)

        # Check whether the number of good columns and rows are correct
        # (trim trailing empty rows/columns that slipped past the heuristics).
        while self.isEmptyRow(rowns - 1, colns):
            rowns = rowns - 1
        while self.isEmptyColumn(colns - 1, rowns):
            colns = colns - 1

        self.log.debug("Number of rows with content:    {0}".format(rowns))
        self.log.debug("Number of columns with content: {0}".format(colns))
        return rowns, colns

    def getQName(self, names):
        """
        Create a valid QName from a string or dictionary of names

        Arguments:
        names -- Either dictionary of names or string of a name.

        Returns:
        qname -- a valid QName for the dictionary or string
        """
        if type(names) == dict:
            # Hierarchical name: join all levels under the sheet QName.
            qname = self.sheet_qname
            for k in names:
                qname = qname + "/" + self.processString(names[k])
        else:
            qname = self.sheet_qname + "/" + self.processString(names)

        self.log.debug("Minted new QName: {}".format(qname))
        return qname

    def processString(self, string):
        """
        Remove illegal characters (comma, brackets, etc) from string, and replace
        it with underscore. Useful for URIs

        Arguments:
        string -- The string representing the value of the source cell

        Returns:
        processedString -- The processed string
        """
        return urllib.quote(re.sub("\s|\(|\)|,|\.", "_", unicode(string).strip()).encode("utf-8", "ignore"))

    def addValue(self, source_cell_value, altLabel=None):
        """
        Add a "value" + optional label to the graph for a cell in the source
        Excel sheet. The value is typically the value stored in the source
        cell itself, but may also be a copy of another cell (e.g. in the case
        of 'idem.').

        Arguments:
        source_cell_value -- The string representing the value of the source cell

        Returns:
        source_cell_value_qname -- a valid QName for the value of the source cell
        """
        source_cell_value_qname = self.getQName(source_cell_value)
        self.graph.add(
            (
                self.namespaces["scope"][source_cell_value_qname],
                self.namespaces["qb"]["dataSet"],
                self.namespaces["scope"][self.sheet_qname],
            )
        )

        self.graph.add(
            (
                self.namespaces["scope"][self.source_cell_qname],
                self.namespaces["d2s"]["value"],
                self.namespaces["scope"][source_cell_value_qname],
            )
        )

        # If the source_cell_value is actually a dictionary (e.g. in the case of
        # HierarchicalRowHeaders), then use the last element of the row hierarchy
        # as prefLabel
        # Otherwise just use the source_cell_value as prefLabel
        if type(source_cell_value) == dict:
            self.graph.add(
                (
                    self.namespaces["scope"][source_cell_value_qname],
                    self.namespaces["skos"].prefLabel,
                    Literal(source_cell_value.values()[-1], "nl"),
                )
            )

            if altLabel and altLabel != source_cell_value.values()[-1]:
                # If altLabel has a value (typically for HierarchicalRowHeaders)
                # different from the last element in the row hierarchy, we add
                # it as alternative label.
                self.graph.add(
                    (
                        self.namespaces["scope"][source_cell_value_qname],
                        self.namespaces["skos"].altLabel,
                        Literal(altLabel, "nl"),
                    )
                )
        else:
            # Try to parse a date to add the appropriate datatype to the literal
            try:
                isodate.parse_datetime(source_cell_value)
                self.log.debug("Datetime on this cell: %s" % source_cell_value)
                self.graph.add(
                    (
                        self.namespaces["scope"][source_cell_value_qname],
                        self.namespaces["skos"].prefLabel,
                        Literal(source_cell_value, datatype=XSD.datetime),
                    )
                )
            except (ValueError, isodate.isoerror.ISO8601Error, AttributeError):
                # Not a datetime; fall back to a Dutch-language plain literal.
                self.log.debug("No datetime on this cell")
                self.graph.add(
                    (
                        self.namespaces["scope"][source_cell_value_qname],
                        self.namespaces["skos"].prefLabel,
                        Literal(source_cell_value, "nl"),
                    )
                )

            if altLabel and altLabel != source_cell_value:
                # If altLabel has a value (typically for HierarchicalRowHeaders)
                # different from the source_cell_value, we add it as alternative
                # label.
                self.graph.add(
                    (
                        self.namespaces["scope"][source_cell_value_qname],
                        self.namespaces["skos"].altLabel,
                        Literal(altLabel, "nl"),
                    )
                )

        return source_cell_value_qname

    def parseSheet(self):
        """
        Parses the currently selected sheet in the workbook, takes no arguments.
        Iterates over all cells in the Excel sheet and produces relevant RDF
        Triples.
        """
        self.log.info("Parsing {0} rows and {1} columns.".format(self.rowns, self.colns))

        self.column_dimensions = {}
        self.property_dimensions = {}
        self.row_dimensions = {}
        self.rowhierarchy = {}

        # Get dictionary of annotations
        self.annotations = self.r_sheet.cell_note_map

        for i in range(0, self.rowns):
            self.rowhierarchy[i] = {}

            for j in range(0, self.colns):
                # Parse cell data
                self.source_cell = self.r_sheet.cell(i, j)
                self.source_cell_name = cellname(i, j)
                self.style = self.styles[self.source_cell].name
                self.cellType = self.getType(self.style)
                self.source_cell_qname = self.getQName(self.source_cell_name)

                self.log.debug(
                    '({},{}) {}/{}: "{}"'.format(i, j, self.cellType, self.source_cell_name, self.source_cell.value)
                )

                # Try to parse ints to avoid ugly _0 URIs
                try:
                    if int(self.source_cell.value) == self.source_cell.value:
                        self.source_cell.value = int(self.source_cell.value)
                except ValueError:
                    self.log.debug("(%s.%s) No parseable int" % (i, j))

                # Parse annotation (if any)
                if self.config.get("annotations", "enabled") == "1":
                    if (i, j) in self.annotations:
                        self.parseAnnotation(i, j)

                # Parse even if empty
                if self.cellType == "HRowHeader":
                    self.updateRowHierarchy(i, j)
                if self.cellType == "Data":
                    self.parseData(i, j)
                if self.cellType == "ColHeader":
                    self.parseColHeader(i, j)
                if self.cellType == "RowProperty":
                    self.parseRowProperty(i, j)

                if not self.isEmpty(i, j):
                    self.graph.add(
                        (
                            self.namespaces["scope"][self.source_cell_qname],
                            RDF.type,
                            self.namespaces["d2s"][self.cellType],
                        )
                    )
                    self.graph.add(
                        (
                            self.namespaces["scope"][self.source_cell_qname],
                            self.namespaces["d2s"]["cell"],
                            Literal(self.source_cell_name),
                        )
                    )
                    # self.graph.add((self.namespaces['scope'][self.source_cell_qname],self.namespaces['d2s']['col'],Literal(colname(j))))
                    # self.graph.add((self.namespaces['scope'][self.source_cell_qname],self.namespaces['d2s']['row'],Literal(i+1)))
                    # self.graph.add((self.namespaces['scope'][self.source_cell_qname] isrow row
                    if self.cellType == "Title":
                        self.parseTitle(i, j)
                    elif self.cellType == "RowHeader":
                        self.parseRowHeader(i, j)
                    elif self.cellType == "HRowHeader":
                        self.parseHierarchicalRowHeader(i, j)
                    elif self.cellType == "RowLabel":
                        self.parseRowLabel(i, j)

        self.log.info("Done parsing...")

    def updateRowHierarchy(self, i, j):
        """
        Build up lists for hierarchical row headers. Cells marked as hierarchical
        row header are often empty meaning that their intended value is stored
        somewhere else in the Excel sheet.

        Keyword arguments:
        int i -- row number
        int j -- col number

        Returns:
        New row hierarchy dictionary
        """
        if self.isEmpty(i, j) or str(self.source_cell.value).lower().strip() == "id.":
            # If the cell is empty, and a HierarchicalRowHeader, add the value
            # of the row header above it.
            # If the cell above is not in the rowhierarchy, don't do anything.
            # If the cell is exactly 'id.', add the value of the row header
            # above it.
            try:
                self.rowhierarchy[i][j] = self.rowhierarchy[i - 1][j]
                self.log.debug("({},{}) Copied from above\nRow hierarchy: {}".format(i, j, self.rowhierarchy[i]))
            except:
                # REMOVED because of double slashes in uris
                # self.rowhierarchy[i][j] = self.source_cell.value
                self.log.debug("({},{}) Top row, added nothing\nRow hierarchy: {}".format(i, j, self.rowhierarchy[i]))
        elif str(self.source_cell.value).lower().startswith("id.") or str(self.source_cell.value).lower().startswith(
            "id "
        ):
            # If the cell starts with 'id.', add the value of the row above it,
            # and append the rest of the cell's value.
            suffix = self.source_cell.value[3:]
            try:
                self.rowhierarchy[i][j] = self.rowhierarchy[i - 1][j] + suffix
                self.log.debug("({},{}) Copied from above+suffix\nRow hierarchy {}".format(i, j, self.rowhierarchy[i]))
            except:
                self.rowhierarchy[i][j] = self.source_cell.value
                self.log.debug("({},{}) Top row, added value\nRow hierarchy {}".format(i, j, self.rowhierarchy[i]))
        elif not self.isEmpty(i, j):
            self.rowhierarchy[i][j] = self.source_cell.value
            self.log.debug("({},{}) Added value\nRow hierarchy {}".format(i, j, self.rowhierarchy[i]))
        return self.rowhierarchy

    def parseHierarchicalRowHeader(self, i, j):
        """
        Create relevant triples for the cell marked as HierarchicalRowHeader
        (i, j are row and column)
        """
        # Use the rowhierarchy to create a unique qname for the cell's contents,
        # give the source_cell's original value as extra argument
        self.log.debug("Parsing HierarchicalRowHeader")

        self.source_cell_value_qname = self.addValue(self.rowhierarchy[i], altLabel=self.source_cell.value)

        # Now that we know the source cell's value qname, add a d2s:isDimension
        # link and the skos:Concept type
        self.graph.add(
            (
                self.namespaces["scope"][self.source_cell_qname],
                self.namespaces["d2s"]["isDimension"],
                self.namespaces["scope"][self.source_cell_value_qname],
            )
        )
        self.graph.add((self.namespaces["scope"][self.source_cell_qname], RDF.type, self.namespaces["skos"].Concept))

        hierarchy_items = self.rowhierarchy[i].items()
        try:
            parent_values = dict(hierarchy_items[:-1])
            # NOTE(review): log calls below pass i, j as positional logging
            # args rather than formatting them into the message — looks like
            # a latent logging bug; confirm intent before relying on output.
            self.log.debug(i, j, "Parent value: " + str(parent_values))
            parent_value_qname = self.getQName(parent_values)
            self.graph.add(
                (
                    self.namespaces["scope"][self.source_cell_value_qname],
                    self.namespaces["skos"]["broader"],
                    self.namespaces["scope"][parent_value_qname],
                )
            )
        except:
            self.log.debug(i, j, "Top of hierarchy")

        # Get the properties to use for the row headers
        try:
            properties = []
            for dim_qname in self.property_dimensions[j]:
                properties.append(dim_qname)
        except KeyError:
            self.log.debug("({}.{}) No row dimension for cell".format(i, j))

        self.row_dimensions.setdefault(i, []).append((self.source_cell_value_qname, properties))

    def parseRowLabel(self, i, j):
        """
        Create relevant triples for the cell marked as Label (i, j are row and column)
        """
        self.log.debug("Parsing Row Label")

        # Get the QName of the HierarchicalRowHeader cell that this label
        # belongs to, based on the rowhierarchy for this row (i)
        hierarchicalRowHeader_value_qname = self.getQName(self.rowhierarchy[i])

        prefLabels = self.graph.objects(
            self.namespaces["scope"][hierarchicalRowHeader_value_qname], self.namespaces["skos"].prefLabel
        )
        for label in prefLabels:
            # If the hierarchicalRowHeader QName already has a preferred label,
            # turn it into a skos:altLabel
            self.graph.remove(
                (self.namespaces["scope"][hierarchicalRowHeader_value_qname], self.namespaces["skos"].prefLabel, label)
            )
            self.graph.add(
                (self.namespaces["scope"][hierarchicalRowHeader_value_qname], self.namespaces["skos"].altLabel, label)
            )
            self.log.debug(
                "Turned skos:prefLabel {} for {} into a skos:altLabel".format(label, hierarchicalRowHeader_value_qname)
            )

        # Add the value of the label cell as skos:prefLabel to the header cell
        self.graph.add(
            (
                self.namespaces["scope"][hierarchicalRowHeader_value_qname],
                self.namespaces["skos"].prefLabel,
                Literal(self.source_cell.value, "nl"),
            )
        )

        # Record that this source_cell_qname is the label for the
        # HierarchicalRowHeader cell
        self.graph.add(
            (
                self.namespaces["scope"][self.source_cell_qname],
                self.namespaces["d2s"]["isLabel"],
                self.namespaces["scope"][hierarchicalRowHeader_value_qname],
            )
        )

    def parseRowHeader(self, i, j):
        """
        Create relevant triples for the cell marked as RowHeader (i, j are row and column)
        """
        rowHeaderValue = ""

        # Don't attach the cell value to the namespace if it's already a URI
        isURI = urlparse(str(self.source_cell.value))
        if isURI.scheme and isURI.netloc:
            rowHeaderValue = URIRef(self.source_cell.value)
        else:
            self.source_cell_value_qname = self.addValue(self.source_cell.value)
            rowHeaderValue = self.namespaces["scope"][self.source_cell_value_qname]

        self.graph.add(
            (self.namespaces["scope"][self.source_cell_qname], self.namespaces["d2s"]["isDimension"], rowHeaderValue)
        )
        self.graph.add((rowHeaderValue, RDF.type, self.namespaces["d2s"]["Dimension"]))
        self.graph.add((rowHeaderValue, RDF.type, self.namespaces["skos"].Concept))

        # Get the properties to use for the row headers
        try:
            properties = []
            for dim_qname in self.property_dimensions[j]:
                properties.append(dim_qname)
        except KeyError:
            self.log.debug("({}.{}) No properties for cell".format(i, j))
        self.row_dimensions.setdefault(i, []).append((rowHeaderValue, properties))

        # Use the column dimensions dictionary to find the objects of the
        # d2s:dimension property
        try:
            for dim_qname in self.column_dimensions[j]:
                self.graph.add(
                    (rowHeaderValue, self.namespaces["d2s"]["dimension"], self.namespaces["scope"][dim_qname])
                )
        except KeyError:
            self.log.debug("({}.{}) No column dimension for cell".format(i, j))

        return

    def parseColHeader(self, i, j):
        """
        Create relevant triples for the cell marked as Header (i, j are row and column)
        """
        if self.isEmpty(i, j):
            if self.insideMergeBox(i, j):
                k, l = self.getMergeBoxCoord(i, j)
                # Use the value of the top-left cell of the merge box instead.
                self.source_cell_value_qname = self.addValue(self.r_sheet.cell(k, l).value)
            else:
                return
        else:
            self.source_cell_value_qname = self.addValue(self.source_cell.value)

        self.graph.add(
            (
                self.namespaces["scope"][self.source_cell_qname],
                self.namespaces["d2s"]["isDimension"],
                self.namespaces["scope"][self.source_cell_value_qname],
            )
        )
        self.graph.add(
            (self.namespaces["scope"][self.source_cell_value_qname], RDF.type, self.namespaces["d2s"]["Dimension"])
        )
        self.graph.add((self.namespaces["scope"][self.source_cell_qname], RDF.type, self.namespaces["skos"].Concept))

        # Add the value qname to the column_dimensions list for that column
        self.column_dimensions.setdefault(j, []).append(self.source_cell_value_qname)

        return

    def parseRowProperty(self, i, j):
        """
        Create relevant triples for the cell marked as Property (i, j are row and column)
        """
        if self.isEmpty(i, j):
            if self.insideMergeBox(i, j):
                k, l = self.getMergeBoxCoord(i, j)
                self.source_cell_value_qname = self.addValue(self.r_sheet.cell(k, l).value)
            else:
                return
        else:
            self.source_cell_value_qname = self.addValue(self.source_cell.value)

        self.graph.add(
            (
                self.namespaces["scope"][self.source_cell_qname],
                self.namespaces["d2s"]["isDimensionProperty"],
                self.namespaces["scope"][self.source_cell_value_qname],
            )
        )
        self.graph.add(
            (
                self.namespaces["scope"][self.source_cell_value_qname],
                RDF.type,
                self.namespaces["qb"]["DimensionProperty"],
            )
        )
        self.graph.add((self.namespaces["scope"][self.source_cell_value_qname], RDF.type, RDF["Property"]))

        self.property_dimensions.setdefault(j, []).append(self.source_cell_value_qname)

        return

    def parseTitle(self, i, j):
        """
        Create relevant triples for the cell marked as Title (i, j are row and column)
        """
        self.source_cell_value_qname = self.addValue(self.source_cell.value)
        self.graph.add(
            (
                self.namespaces["scope"][self.sheet_qname],
                self.namespaces["d2s"]["title"],
                self.namespaces["scope"][self.source_cell_value_qname],
            )
        )
        self.graph.add(
            (self.namespaces["scope"][self.source_cell_value_qname], RDF.type, self.namespaces["d2s"]["Dimension"])
        )

        return

    def parseData(self, i, j):
        """
        Create relevant triples for the cell marked as Data (i, j are row and column)
        """
        # Empty data cells only produce an observation when implicitZeros is on.
        if self.isEmpty(i, j) and self.config.get("dataCell", "implicitZeros") == "0":
            return

        observation = BNode()

        self.graph.add(
            (self.namespaces["scope"][self.source_cell_qname], self.namespaces["d2s"]["isObservation"], observation)
        )
        self.graph.add((observation, RDF.type, self.namespaces["qb"]["Observation"]))
        self.graph.add((observation, self.namespaces["qb"]["dataSet"], self.namespaces["scope"][self.sheet_qname]))
        if self.isEmpty(i, j) and self.config.get("dataCell", "implicitZeros") == "1":
            self.graph.add((observation, self.namespaces["d2s"][self.dataCellPropertyName], Literal(0)))
        else:
            self.graph.add(
                (observation, self.namespaces["d2s"][self.dataCellPropertyName], Literal(self.source_cell.value))
            )

        # Use the row dimensions dictionary to find the properties that link
        # data values to row headers
        try:
            for (dim_qname, properties) in self.row_dimensions[i]:
                for p in properties:
                    self.graph.add((observation, self.namespaces["d2s"][p], dim_qname))
        except KeyError:
            self.log.debug("({}.{}) No row dimension for cell".format(i, j))

        # Use the column dimensions dictionary to find the objects of the
        # d2s:dimension property
        try:
            for dim_qname in self.column_dimensions[j]:
                self.graph.add(
                    (observation, self.namespaces["d2s"]["dimension"], self.namespaces["scope"][dim_qname])
                )
        except KeyError:
            self.log.debug("({}.{}) No column dimension for cell".format(i, j))

    def parseAnnotation(self, i, j):
        """
        Create relevant triples for the annotation attached to cell (i, j)
        """
        if self.config.get("annotations", "model") == "oa":
            # Create triples according to Open Annotation model
            body = BNode()

            self.annotationGraph.add(
                (
                    self.annotationNamespaces["scope"][self.source_cell_qname],
                    RDF.type,
                    self.annotationNamespaces["oa"]["Annotation"],
                )
            )
            self.annotationGraph.add(
                (
                    self.annotationNamespaces["scope"][self.source_cell_qname],
                    self.annotationNamespaces["oa"]["hasBody"],
                    body,
                )
            )
            # Normalise newlines in the note text before storing it.
            self.annotationGraph.add(
                (
                    body,
                    RDF.value,
                    Literal(
                        self.annotations[(i, j)]
                        .text.replace("\n", " ")
                        .replace("\r", " ")
                        .replace("\r\n", " ")
                        .encode("utf-8")
                    ),
                )
            )
            self.annotationGraph.add(
                (
                    self.annotationNamespaces["scope"][self.source_cell_qname],
                    self.annotationNamespaces["oa"]["hasTarget"],
                    self.namespaces["scope"][self.source_cell_qname],
                )
            )
            self.annotationGraph.add(
                (
                    self.annotationNamespaces["scope"][self.source_cell_qname],
                    self.annotationNamespaces["oa"]["annotator"],
                    Literal(self.annotations[(i, j)].author.encode("utf-8")),
                )
            )
            # Annotation date: mtime of the workbook file.
            self.annotationGraph.add(
                (
                    self.annotationNamespaces["scope"][self.source_cell_qname],
                    self.annotationNamespaces["oa"]["annotated"],
                    Literal(
                        datetime.datetime.fromtimestamp(os.path.getmtime(self.filename)).strftime("%Y-%m-%d"),
                        datatype=self.annotationNamespaces["xsd"]["date"],
                    ),
                )
            )
            self.annotationGraph.add(
                (
                    self.annotationNamespaces["scope"][self.source_cell_qname],
                    self.annotationNamespaces["oa"]["generator"],
                    URIRef("https://github.com/Data2Semantics/TabLinker"),
                )
            )
            self.annotationGraph.add(
                (
                    self.annotationNamespaces["scope"][self.source_cell_qname],
                    self.annotationNamespaces["oa"]["generated"],
                    Literal(
                        datetime.datetime.now().strftime("%Y-%m-%d"), datatype=self.annotationNamespaces["xsd"]["date"]
                    ),
                )
            )
            self.annotationGraph.add(
                (
                    self.annotationNamespaces["scope"][self.source_cell_qname],
                    self.annotationNamespaces["oa"]["modelVersion"],
                    URIRef("http://www.openannotation.org/spec/core/20120509.html"),
                )
            )
        else:
            # Create triples according to Nanopublications model
            print "Nanopublications not implemented yet!"
def graph_plan(plan, fountain, agp):
    """Serialise an Agora search plan (dict) into an RDF ConjunctiveGraph.

    plan     -- dict with 'prefixes' and 'plan' (per-triple-pattern entries
                containing paths, pattern, hints and cycles).
    fountain -- type-index service; only get_type() is used here (for
                sub/super type relations).
    agp      -- agora graph pattern; only used to extract root variables.

    Returns the populated plan graph. The closures below all mutate the
    shared plan_graph/tree_graph and read loop variables (pattern, cycles,
    res) from the enclosing scope — NOTE(review): statement order matters
    throughout; do not reorder.
    """
    def extract_cycle_roots():
        # Map each described cycle id to the subtree of its expected types.
        c_roots = {}
        for c_id, c_node in described_cycles.items():
            c_root_types = set({})
            for crt in plan_graph.objects(c_node, AGORA.expectedType):
                crt_qname = plan_graph.qname(crt)
                c_root_types.update(_type_subtree(fountain, crt_qname))
            c_roots[c_id] = c_root_types
        return c_roots

    def inc_tree_length(tree, l):
        # Accumulate total path length per search tree.
        if tree not in tree_lengths:
            tree_lengths[tree] = 0
        tree_lengths[tree] += l

    def add_variable(p_node, vid, subject=True):
        # Attach a variable node ('?x' -> 'var_x') as subject or object of a
        # triple-pattern node.
        sub_node = BNode(str(vid).replace('?', 'var_'))
        if subject:
            plan_graph.add((p_node, AGORA.subject, sub_node))
        else:
            plan_graph.add((p_node, AGORA.object, sub_node))
        plan_graph.set((sub_node, RDF.type, AGORA.Variable))
        plan_graph.set((sub_node, RDFS.label, Literal(str(vid), datatype=XSD.string)))

    def describe_cycle(cycle_id, cg):
        # Describe one cycle as a chain of agora:next step nodes in its own
        # named context; returns the cycle's root node.
        c_node = BNode('cycle{}'.format(cycle_id))
        cg = cg.get_context(c_node)
        cg.add((c_node, RDF.type, AGORA.Cycle))
        previous_node = c_node
        c_steps = cycles[cycle_id]
        cycle_type = c_steps[0].get('type')
        for et in _type_subtree(fountain, cycle_type):
            cg.add((c_node, AGORA.expectedType, __extend_uri(prefixes, et)))
        for j, step in enumerate(c_steps):
            prop = step.get('property')
            b_node = BNode(previous_node.n3() + '/' + prop)
            cg.add((b_node, AGORA.onProperty, __extend_uri(prefixes, prop)))
            c_expected_type = step.get('type')
            cg.add((b_node, AGORA.expectedType, __extend_uri(prefixes, c_expected_type)))
            cg.add((previous_node, AGORA.next, b_node))
            previous_node = b_node
        return c_node

    def is_extensible(node, node_patterns):
        # A node is extensible when every subject of its (and its siblings')
        # patterns is an AGP root. NOTE(review): reads `res.n` from the
        # enclosing loop, not the `node` argument — confirm this is intended.
        extensible = True
        near_patterns = node_patterns.copy()
        for prev in tree_graph.subjects(AGORA.next, node):
            for sib_node in tree_graph.objects(prev, AGORA.next):
                if sib_node != res.n:
                    near_patterns.update(set(tree_graph.objects(sib_node, AGORA.byPattern)))

        subjects = set()
        for p_node in near_patterns:
            p_subject = list(plan_graph.objects(p_node, AGORA.subject)).pop()
            if not isinstance(p_subject, URIRef):
                subject_str = list(plan_graph.objects(p_subject, RDFS.label)).pop().toPython()
            else:
                subject_str = str(p_subject)
            subjects.add(subject_str)

        if subjects and set.difference(subjects, roots):
            extensible = False

        return extensible

    def enrich_type_patterns(node_patterns):
        # For rdf:type patterns, expand the node's expected types with their
        # full subtype subtree.
        for p_node in node_patterns:
            p_pred = list(plan_graph.objects(p_node, AGORA.predicate)).pop()
            if p_pred == RDF.type:
                p_type = list(plan_graph.objects(p_node, AGORA.object)).pop()
                if isinstance(p_type, URIRef):
                    for et in [et for et in expected_types if et == p_type]:
                        q_expected_types = _type_subtree(fountain, tree_graph.qname(et))
                        for et_q in q_expected_types:
                            tree_graph.add((res.n, AGORA.expectedType, __extend_uri(prefixes, et_q)))
            else:
                for et in expected_types:
                    q_expected_types = _type_subtree(fountain, tree_graph.qname(et))
                    for et_q in q_expected_types:
                        tree_graph.add((res.n, AGORA.expectedType, __extend_uri(prefixes, et_q)))

    def apply_cycle_extensions(c_roots, node_types):
        # Mark nodes whose expected types overlap a cycle's root types as
        # cycle start points; drop cycle contexts that extend nothing.
        for c_id, root_types in c_roots.items():
            found_extension = False
            for n, expected in node_types.items():
                if set.intersection(set(root_types), set(expected)):
                    tree_graph.add((n, AGORA.isCycleStartOf, described_cycles[c_id]))
                    found_extension = True

            if not found_extension:
                plan_graph.remove_context(plan_graph.get_context(described_cycles[c_id]))

    def include_path(elm, p_seeds, p_steps, cycles, check):
        # Add one seed path (tree id = md5 of its seeds) to the tree graph,
        # chaining its steps with agora:next and linking relevant cycles.
        m = hashlib.md5()
        for s in p_seeds:
            m.update(s)
        elm_uri = __extend_uri(prefixes, elm)
        b_tree = BNode(m.digest().encode('base64').strip())
        s_trees.add(b_tree)
        tree_graph.set((b_tree, RDF.type, AGORA.SearchTree))
        tree_graph.add((b_tree, AGORA.fromType, elm_uri))

        for seed in p_seeds:
            tree_graph.add((b_tree, AGORA.hasSeed, URIRef(seed)))

        for cycle_id in filter(lambda x: x not in described_cycles.keys(), cycles):
            c_node = describe_cycle(cycle_id, plan_graph)
            described_cycles[cycle_id] = c_node
            plan_graph.get_context(c_node).add((b_tree, AGORA.goesThroughCycle, c_node))

        previous_node = b_tree
        inc_tree_length(b_tree, len(p_steps))

        root_index = -1
        pp = []
        for j, step in enumerate(p_steps):
            prop = step.get('property')
            pp.append(prop)
            path_root = step.get('root', None)
            if path_root and root_index < 0:
                root_index = j
            base_id = path_root or b_tree
            base_id += '/'
            if j < len(p_steps) - 1 or (pattern[1] == RDF.type and isinstance(pattern[2], URIRef)):
                b_node = BNode(base_id + '/'.join(pp))
                tree_graph.add((b_node, AGORA.onProperty, __extend_uri(prefixes, prop)))
            else:
                b_node = BNode(base_id + '/'.join(pp))
            tree_graph.add((b_node, AGORA.expectedType, __extend_uri(prefixes, step.get('type'))))
            tree_graph.add((previous_node, AGORA.next, b_node))
            previous_node = b_node

        p_node = _get_pattern_node(pattern, patterns)
        if pattern[1] == RDF.type and isinstance(pattern[2], URIRef):
            b_id = '{}_{}_{}'.format(pattern[0].n3(plan_graph.namespace_manager),
                                     pattern[1].n3(plan_graph.namespace_manager),
                                     pattern[2].n3(plan_graph.namespace_manager))
            b_node = BNode(b_id)
            tree_graph.add((b_node, AGORA.expectedType, pattern[2]))
            tree_graph.add((previous_node, AGORA.next, b_node))
            tree_graph.add((b_node, AGORA.byPattern, p_node))
            if check:
                tree_graph.add((b_node, AGORA.checkType, Literal(check)))
        else:
            tree_graph.add((previous_node, AGORA.byPattern, p_node))

    plan_graph = ConjunctiveGraph()
    plan_graph.bind('agora', AGORA)
    prefixes = plan.get('prefixes')
    ef_plan = plan.get('plan')
    tree_lengths = {}
    s_trees = set([])
    patterns = {}
    described_cycles = {}

    for (prefix, u) in prefixes.items():
        plan_graph.bind(prefix, u)

    tree_graph = plan_graph.get_context('trees')

    for i, tp_plan in enumerate(ef_plan):
        paths = tp_plan.get('paths')
        pattern = tp_plan.get('pattern')
        hints = tp_plan.get('hints')
        cycles = {}
        for c in tp_plan.get('cycles'):
            cid = str(c['cycle'])
            c_steps = c['steps']
            cycles[cid] = c_steps
            # Cycles longer than one step are also registered reversed.
            if len(c_steps) > 1:
                cycles[cid + 'r'] = list(reversed(c_steps))
        context = BNode('space_{}'.format(tp_plan.get('context')))

        for path in paths:
            steps = path.get('steps')
            seeds = path.get('seeds')
            check = path.get('check', None)
            ty = None
            if not len(steps) and len(seeds):
                ty = pattern[2]
            elif len(steps):
                ty = steps[0].get('type')
            if ty:
                include_path(ty, seeds, steps, cycles, check)

        for t in s_trees:
            tree_graph.set((t, AGORA.length, Literal(tree_lengths.get(t, 0), datatype=XSD.integer)))

        pattern_node = _get_pattern_node(pattern, patterns)
        plan_graph.add((context, AGORA.definedBy, pattern_node))
        plan_graph.set((context, RDF.type, AGORA.SearchSpace))
        plan_graph.add((pattern_node, RDF.type, AGORA.TriplePattern))
        plan_graph.add((pattern_node, RDFS.label, Literal(pattern_node.toPython())))
        (sub, pred, obj) = pattern

        if isinstance(sub, BNode):
            add_variable(pattern_node, str(sub))
        elif isinstance(sub, URIRef):
            plan_graph.add((pattern_node, AGORA.subject, sub))

        if isinstance(obj, BNode):
            add_variable(pattern_node, str(obj), subject=False)
        elif isinstance(obj, Literal):
            node = BNode(str(obj).replace(' ', '').replace(':', ''))
            plan_graph.add((pattern_node, AGORA.object, node))
            plan_graph.set((node, RDF.type, AGORA.Literal))
            plan_graph.set((node, AGORA.value, obj))
        else:
            plan_graph.add((pattern_node, AGORA.object, obj))

        plan_graph.add((pattern_node, AGORA.predicate, pred))
        if pred == RDF.type:
            if 'check' in hints:
                plan_graph.add((pattern_node, AGORA.checkType, Literal(hints['check'], datatype=XSD.boolean)))

    expected_res = tree_graph.query("""SELECT DISTINCT ?n WHERE {
                                       ?n agora:expectedType ?type
                                    }""")
    node_types = {}
    roots = set(_extract_roots(agp))
    for res in expected_res:
        expected_types = list(tree_graph.objects(res.n, AGORA.expectedType))
        q_expected_types = set(map(lambda x: tree_graph.qname(x), expected_types))
        # Keep only the most general types (those with no supertype in the set).
        q_expected_types = filter(
            lambda x: not set.intersection(set(fountain.get_type(x)['super']), q_expected_types), q_expected_types)
        type_hierarchy = len(q_expected_types) == 1
        tree_graph.add((res.n, AGORA.typeHierarchy, Literal(type_hierarchy)))
        direct_patterns = set(tree_graph.objects(res.n, AGORA.byPattern))
        enrich_type_patterns(direct_patterns)
        if is_extensible(res.n, direct_patterns):
            node_types[res.n] = q_expected_types

    c_roots = extract_cycle_roots()
    apply_cycle_extensions(c_roots, node_types)

    for t in s_trees:
        tree_graph.set((t, AGORA.length, Literal(tree_lengths.get(t, 0), datatype=XSD.integer)))
        from_types = set([plan_graph.qname(x) for x in plan_graph.objects(t, AGORA.fromType)])
        # Keep only the most specific fromType values (no subtype in the set).
        def_from_types = filter(lambda x: not set.intersection(set(fountain.get_type(x)['sub']), from_types),
                                from_types)
        for dft in def_from_types:
            tree_graph.set((t, AGORA.fromType, __extend_uri(prefixes, dft)))

    # Promote URI pattern subjects directly reachable from a tree to seeds,
    # and drop their cycle-start marks.
    for res in plan_graph.query("""SELECT ?tree ?sub ?nxt WHERE {
                                      ?tree a agora:SearchTree ;
                                            agora:next ?nxt .
                                      ?nxt agora:byPattern [
                                             agora:subject ?sub
                                           ]
                                   }"""):
        if isinstance(res.sub, URIRef):
            plan_graph.set((res.tree, AGORA.hasSeed, res.sub))
            plan_graph.remove((res.nxt, AGORA.isCycleStartOf, None))

    _inform_on_inverses(plan_graph, fountain, prefixes)

    return plan_graph
def get_vocab_base(vocabfile):
    """Extract ``(identifier, base, prefix)`` metadata from a vocabulary file.

    The file is parsed with rdflib's default format first, falling back to
    N3 only if that fails.  Returns ``(None, None, None)`` when neither
    parse succeeds.  Any of the three returned elements may be ``None``.

    :param vocabfile: path or URL of the vocabulary to inspect
    :return: tuple ``(identifier, base, prefix)``
    """
    graph = Graph()
    try:
        graph.parse(vocabfile)
    except Exception:
        # BUGFIX: previously a successful default-format parse was thrown
        # away and the file was unconditionally re-parsed as N3, so valid
        # RDF/XML vocabularies came back as (None, None, None).  Now the
        # N3 fallback only runs when the first parse fails.
        graph = Graph()
        try:
            graph.parse(vocabfile, format="n3")
        except Exception:
            return (None, None, None)

    # dc:identifier wins over dcterms:identifier; the loop keeps the last
    # value found (original behaviour preserved).
    identifier = None
    for v in graph.objects(None, namespaces['dc']['identifier']):
        identifier = v
    if not identifier:
        for v in graph.objects(None, namespaces['dcterms']['identifier']):
            identifier = v

    # Candidate sources for the base URI, in decreasing priority.
    base = None
    if not base:
        for s in graph.subjects(namespaces['rdf']['type'], namespaces['owl']['Ontology']):
            base = s
            break
    if not base:
        for s in graph.subjects(namespaces['dc']['title'], None):
            base = s
            break
    if not base:
        for s in graph.subjects(namespaces['dcterms']['title'], None):
            base = s
            break
    if not base:
        for s in graph.subjects(namespaces['dc']['creator'], None):
            base = s
            break
    if not base:
        for s in graph.subjects(namespaces['dcterms']['creator'], None):
            base = s
            break
    if not base:
        for v in graph.objects(None, namespaces['vann']['preferredNamespaceUri']):
            base = v
            break
    if not base:
        # Last resort: the default ('') namespace binding, if any.
        for v in graph.namespaces():
            if v[0] == '':
                base = v[1]
                break

    # Prefix: explicit vann:preferredNamespacePrefix, then a namespace
    # binding whose URI matches the base, then the last path segment.
    prefix = None
    vocab_prefixes = graph.objects(None, namespaces['vann']['preferredNamespacePrefix'])
    for vp in vocab_prefixes:
        prefix = vp
    if not prefix and base:
        for v in graph.namespaces():
            if str(v[1]) == str(base):
                prefix = v[0]
                break
    if not prefix and base:
        prefix = base.strip().strip('/').split('/')[-1].strip('#').strip(' ')

    # Normalise the base so it always ends with a separator.
    if base:
        base = base.strip()
        if (base[-1] != "/" and base[-1] != "#"):
            base += "#"
    return (identifier, base, prefix)
class Inspector(object):
    """ Class that includes methods for querying an RDFS/OWL ontology """

    def __init__(self, uri, language=""):
        # Parse the ontology at *uri*, trying RDF/XML first and N3 second;
        # then derive a base URI and pre-compute the full class list.
        super(Inspector, self).__init__()
        self.rdfGraph = ConjunctiveGraph()
        try:
            self.rdfGraph.parse(uri, format="application/rdf+xml")
        except:
            try:
                self.rdfGraph.parse(uri, format="n3")
            except:
                raise exceptions.Error(
                    "Could not parse the file! Is it a valid RDF/OWL ontology?"
                )
        finally:
            # NOTE(review): this finally-block runs even when both parses
            # failed (before the Error propagates), so the queries below are
            # also executed against an empty graph in that case.
            self.baseURI = self.get_OntologyURI() or uri
            self.allclasses = self.__getAllClasses(includeDomainRange=True,
                                                   includeImplicit=True,
                                                   removeBlankNodes=False,
                                                   excludeRDF_OWL=False)

    def get_OntologyURI(self, return_as_string=True):
        """Return the subject typed as owl:Ontology, or None if absent.

        :param return_as_string: when True return ``str``, else the raw node
        """
        test = [
            x for x, y, z in self.rdfGraph.triples((None, RDF.type, Ontology))
        ]
        if test:
            if return_as_string:
                return str(test[0])
            else:
                return test[0]
        else:
            return None

    def __getAllClasses(self,
                        classPredicate="",
                        includeDomainRange=False,
                        includeImplicit=False,
                        removeBlankNodes=True,
                        addOWLThing=True,
                        excludeRDF_OWL=True):
        """Collect every class URI found in the graph.

        :param classPredicate: "rdfs", "owl", or "" for both vocabularies
        :param includeDomainRange: also count rdfs:domain / rdfs:range objects
        :param includeImplicit: also count subclass endpoints and rdf:type objects
        :param removeBlankNodes: drop blank-node classes from the result
        :param addOWLThing: seed the result with owl:Thing
        :param excludeRDF_OWL: skip RDF/RDFS/OWL vocabulary terms
        :return: list of classes sorted via sort_uri_list_by_name
        """
        rdfGraph = self.rdfGraph
        exit = {}  # dict used as an ordered set (values are all None)

        def addIfYouCan(x, mydict):
            # Filter out OWL / RDF / RDFS built-in vocabulary when requested.
            if excludeRDF_OWL:
                if x.startswith('http://www.w3.org/2002/07/owl#') or \
                   x.startswith("http://www.w3.org/1999/02/22-rdf-syntax-ns#") or \
                   x.startswith("http://www.w3.org/2000/01/rdf-schema#"):
                    return mydict
            if x not in mydict:
                mydict[x] = None
            return mydict

        if addOWLThing:
            exit = addIfYouCan(Thing, exit)

        # Explicitly declared classes.
        if classPredicate == "rdfs" or classPredicate == "":
            for s in rdfGraph.subjects(RDF.type, RDFS.Class):
                exit = addIfYouCan(s, exit)
        if classPredicate == "owl" or classPredicate == "":
            for s in rdfGraph.subjects(RDF.type, Class):
                exit = addIfYouCan(s, exit)

        # Classes implied by property domains / ranges.
        if includeDomainRange:
            for o in rdfGraph.objects(None, RDFS.domain):
                exit = addIfYouCan(o, exit)
            for o in rdfGraph.objects(None, RDFS.range):
                exit = addIfYouCan(o, exit)

        # Classes implied by subclass links or by being used as a type.
        if includeImplicit:
            for s, v, o in rdfGraph.triples((None, RDFS.subClassOf, None)):
                exit = addIfYouCan(s, exit)
                exit = addIfYouCan(o, exit)
            for o in rdfGraph.objects(None, RDF.type):
                exit = addIfYouCan(o, exit)

        # get a list
        exit = exit.keys()
        if removeBlankNodes:
            exit = [x for x in exit if not isBlankNode(x)]
        return sort_uri_list_by_name(exit)

    def __getTopclasses(self, classPredicate=''):
        """Return classes that have no direct superclass (tree roots)."""
        returnlist = []
        for eachclass in self.__getAllClasses(classPredicate):
            x = self.get_classDirectSupers(eachclass)
            if not x:
                returnlist.append(eachclass)
        return sort_uri_list_by_name(returnlist)

    def __getTree(self, father=None, out=None):
        """Build a {class: [children]} mapping by recursive descent.

        NOTE(review): relies on ``self.toplayer``, which is not assigned
        anywhere in this class as shown — presumably set elsewhere; verify.
        The recursive (father is not None) branch mutates *out* in place and
        returns None; only the root call returns the mapping.
        """
        if not father:
            out = {}
            topclasses = self.toplayer
            out[0] = topclasses
            for top in topclasses:
                children = self.get_classDirectSubs(top)
                out[top] = children
                for potentialfather in children:
                    self.__getTree(potentialfather, out)
            return out
        else:
            children = self.get_classDirectSubs(father)
            out[father] = children
            for ch in children:
                self.__getTree(ch, out)

    def __buildClassTree(self, father=None, out=None):
        """Like __getTree, but rooted at owl:Thing with sorted child lists."""
        if not father:
            out = {}
            topclasses = self.toplayer
            out[0] = [Thing]
            out[Thing] = sort_uri_list_by_name(topclasses)
            for top in topclasses:
                children = self.get_classDirectSubs(top)
                out[top] = sort_uri_list_by_name(children)
                for potentialfather in children:
                    self.__buildClassTree(potentialfather, out)
            return out
        else:
            children = self.get_classDirectSubs(father)
            out[father] = sort_uri_list_by_name(children)
            for ch in children:
                self.__buildClassTree(ch, out)

    # methods for getting ancestores and descendants of classes: by default, we do not include blank nodes

    def get_classDirectSupers(self, aClass, excludeBnodes=True, sortUriName=False):
        """Direct superclasses of *aClass* (owl:Thing excluded)."""
        returnlist = []
        for o in self.rdfGraph.objects(aClass, RDFS.subClassOf):
            if not (o == Thing):
                if excludeBnodes:
                    if not isBlankNode(o):
                        returnlist.append(o)
                else:
                    returnlist.append(o)
        if sortUriName:
            return sort_uri_list_by_name(remove_duplicates(returnlist))
        else:
            return remove_duplicates(returnlist)

    def get_classDirectSubs(self, aClass, excludeBnodes=True):
        """Direct subclasses of *aClass*, deduplicated and name-sorted."""
        returnlist = []
        for s, v, o in self.rdfGraph.triples((None, RDFS.subClassOf, aClass)):
            if excludeBnodes:
                if not isBlankNode(s):
                    returnlist.append(s)
            else:
                returnlist.append(s)
        return sort_uri_list_by_name(remove_duplicates(returnlist))

    def get_classSiblings(self, aClass, excludeBnodes=True):
        """Classes sharing at least one direct superclass with *aClass*."""
        returnlist = []
        for father in self.get_classDirectSupers(aClass, excludeBnodes):
            for child in self.get_classDirectSubs(father, excludeBnodes):
                if child != aClass:
                    returnlist.append(child)
        return sort_uri_list_by_name(remove_duplicates(returnlist))

    def entitySynonyms(self, anEntity, language=DEFAULT_LANGUAGE, getall=True):
        """Synonym literals attached to *anEntity*.

        With getall=True, returns every synonym object; otherwise the first
        synonym whose language tag equals *language* (or "" if none).
        """
        if getall:
            temp = []
            # Uberon synonyms
            for o in self.rdfGraph.objects(anEntity, Synonym):
                temp += [o]
            # EFO synonyms
            # NOTE(review): this second loop queries the same Synonym
            # predicate as the first, so results are duplicated — possibly
            # a copy-paste slip (an EFO-specific predicate may be intended).
            for o in self.rdfGraph.objects(anEntity, Synonym):
                temp += [o]
            # OBI synonyms
            for o in self.rdfGraph.objects(anEntity, OBO_Synonym):
                temp += [o]
            return temp
        else:
            for o in self.rdfGraph.objects(anEntity, Synonym):
                if getattr(o, 'language') and getattr(o, 'language') == language:
                    return o
            return ""

    def classFind(self, name, exact=False):
        """Find classes by (case-insensitive) exact match or substring."""
        temp = []
        if name:
            for x in self.allclasses:
                if exact:
                    if x.__str__().lower() == str(name).lower():
                        return [x]
                else:
                    if x.__str__().lower().find(str(name).lower()) >= 0:
                        temp.append(x)
        return temp
def graph_plan(plan, fountain):
    """Serialise an Agora search *plan* into an RDF graph.

    :param plan: dict with keys 'prefixes' (prefix -> namespace URI) and
        'plan' (list of triple-pattern plans, each with 'paths', 'pattern',
        'hints' and 'context')
    :param fountain: service exposing get_type(qname) -> {'super': [...], ...},
        used at the end to prune redundant expectedType triples
    :return: rdflib ConjunctiveGraph describing the plan in the AGORA vocabulary
    """
    plan_graph = ConjunctiveGraph()
    plan_graph.bind('agora', AGORA)
    prefixes = plan.get('prefixes')
    ef_plan = plan.get('plan')
    tree_lengths = {}   # search-tree BNode -> accumulated step count
    s_trees = set([])   # all search-tree BNodes created so far
    patterns = {}       # triple pattern -> its BNode in the plan graph

    for (prefix, u) in prefixes.items():
        plan_graph.bind(prefix, u)

    def __get_pattern_node(p):
        # One stable BNode per distinct pattern (tp_0, tp_1, ...).
        if p not in patterns:
            patterns[p] = BNode('tp_{}'.format(len(patterns)))
        return patterns[p]

    def __inc_tree_length(tree, l):
        # Accumulate total path length per search tree.
        if tree not in tree_lengths:
            tree_lengths[tree] = 0
        tree_lengths[tree] += l

    def __add_variable(p_node, vid, subject=True):
        # Attach a Variable node (named after the SPARQL var) to a pattern.
        sub_node = BNode(str(vid).replace('?', 'var_'))
        if subject:
            plan_graph.add((p_node, AGORA.subject, sub_node))
        else:
            plan_graph.add((p_node, AGORA.object, sub_node))
        plan_graph.set((sub_node, RDF.type, AGORA.Variable))
        plan_graph.set((sub_node, RDFS.label, Literal(str(vid), datatype=XSD.string)))

    def include_path(elm, p_seeds, p_steps):
        # Emit one SearchTree (in its own named context) rooted at *elm*,
        # chaining one node per step and ending at the pattern node.
        # NOTE: reads `pattern` from the enclosing scope by late binding —
        # it sees whatever the for-loop below most recently assigned, which
        # is correct only because calls happen inside that loop.
        elm_uri = __extend_uri(prefixes, elm)
        path_g = plan_graph.get_context(elm_uri)
        b_tree = BNode(elm_uri)
        s_trees.add(b_tree)
        path_g.set((b_tree, RDF.type, AGORA.SearchTree))
        path_g.set((b_tree, AGORA.fromType, elm_uri))
        for seed in p_seeds:
            path_g.add((b_tree, AGORA.hasSeed, URIRef(seed)))
        previous_node = b_tree
        __inc_tree_length(b_tree, len(p_steps))
        for j, step in enumerate(p_steps):
            prop = step.get('property')
            # BNode identity derives from the chain prefix + property,
            # so shared prefixes reuse the same node.
            b_node = BNode(previous_node.n3() + prop)
            # The last step's property is omitted unless this is a
            # rdf:type pattern.
            if j < len(p_steps) - 1 or pattern[1] == RDF.type:
                path_g.add((b_node, AGORA.onProperty, __extend_uri(prefixes, prop)))
            path_g.add((b_node, AGORA.expectedType, __extend_uri(prefixes, step.get('type'))))
            path_g.add((previous_node, AGORA.next, b_node))
            previous_node = b_node
        p_node = __get_pattern_node(pattern)
        path_g.add((previous_node, AGORA.byPattern, p_node))

    for i, tp_plan in enumerate(ef_plan):
        paths = tp_plan.get('paths')
        pattern = tp_plan.get('pattern')
        hints = tp_plan.get('hints')
        context = BNode('space_{}'.format(tp_plan.get('context')))
        for path in paths:
            steps = path.get('steps')
            seeds = path.get('seeds')
            if not len(steps) and len(seeds):
                # Seed-only path: root the tree at the pattern's object.
                include_path(pattern[2], seeds, steps)
            elif len(steps):
                # Root the tree at the first step's expected type.
                ty = steps[0].get('type')
                include_path(ty, seeds, steps)

        # Refresh tree lengths after processing this pattern's paths.
        for t in s_trees:
            plan_graph.set((t, AGORA.length, Literal(tree_lengths.get(t, 0), datatype=XSD.integer)))

        # Describe the triple pattern itself and link it to its search space.
        pattern_node = __get_pattern_node(pattern)
        plan_graph.add((context, AGORA.definedBy, pattern_node))
        plan_graph.set((context, RDF.type, AGORA.SearchSpace))
        plan_graph.add((pattern_node, RDF.type, AGORA.TriplePattern))
        (sub, pred, obj) = pattern
        if isinstance(sub, BNode):
            __add_variable(pattern_node, str(sub))
        elif isinstance(sub, URIRef):
            plan_graph.add((pattern_node, AGORA.subject, sub))
        if isinstance(obj, BNode):
            __add_variable(pattern_node, str(obj), subject=False)
        elif isinstance(obj, Literal):
            # Literal objects get a dedicated node carrying the value.
            node = BNode(str(obj).replace(' ', ''))
            plan_graph.add((pattern_node, AGORA.object, node))
            plan_graph.set((node, RDF.type, AGORA.Literal))
            plan_graph.set((node, AGORA.value, Literal(str(obj), datatype=XSD.string)))
        else:
            plan_graph.add((pattern_node, AGORA.object, obj))
        plan_graph.add((pattern_node, AGORA.predicate, pred))
        if pred == RDF.type:
            if 'check' in hints:
                plan_graph.add((pattern_node, AGORA.checkType,
                                Literal(hints['check'], datatype=XSD.boolean)))

    # Prune expectedType triples that are superclasses of another expected
    # type on the same node (the fountain provides the type hierarchy).
    sub_expected = plan_graph.subjects(predicate=AGORA.expectedType)
    for s in sub_expected:
        expected_types = list(plan_graph.objects(s, AGORA.expectedType))
        for et in expected_types:
            plan_graph.remove((s, AGORA.expectedType, et))
        q_expected_types = [plan_graph.qname(t) for t in expected_types]
        expected_types = [d for d in expected_types
                          if not set.intersection(
                              set(fountain.get_type(plan_graph.qname(d)).get('super')),
                              set(q_expected_types))]
        for et in expected_types:
            plan_graph.add((s, AGORA.expectedType, et))

    return plan_graph
def retrieve(request, graph):
    """Django view: rank publications relevant to a patient description.

    Parses *graph* (Turtle/N3 text) into an RDF graph, expands the patient's
    clinical features via owl:sameAs against the AERS endpoint, tallies
    publications mentioning those features on Linked Life Data, and returns
    the top 50 as JSON.

    :param request: Django request object (unused in the body)
    :param graph: N3/Turtle serialisation of a patient description
    :return: HttpResponse with a JSON list of (pubmed_uri, count) pairs
    """
    try:
        cg = ConjunctiveGraph().parse(data=graph, format='n3')
    except:
        # Input was not parseable as Turtle/N3.
        return not_turtle_response(graph)

    DRUG = Namespace('http://aers.data2semantics.org/resource/drug/')
    PO = Namespace('http://www.data2semantics.org/ontology/patient/')
    UMLS = Namespace('http://linkedlifedata.com/resource/umls/id/')
    LS = Namespace('http://linkedlifedata.com/resource/lifeskim/')
    cg.bind('drug', DRUG)
    cg.bind('po', PO)
    cg.bind('umls', UMLS)
    cg.bind('lifeskim', LS)

    try:
        # any=False makes rdflib raise when the value is ambiguous.
        patient = cg.value(predicate=RDF.type, object=PO['Patient'], any=False)
    except:
        # More than one patient
        return multiple_patients_response(cg.serialize(format='turtle'))

    if (cg.value(predicate=PO['hasIndication'], object=UMLS['C0027947']) and
            cg.value(predicate=PO['hasMeasurement'], object=UMLS['C0015967'])):
        # We now know the patient has Febrile Neutropenia
        cg.add((patient, PO['hasIndication'], UMLS['C0746883']))

    aers_sparql = SPARQLWrapper("http://eculture2.cs.vu.nl:5020/sparql/")
    aers_sparql.setReturnFormat(JSON)
    lld_sparql = SPARQLWrapper("http://linkedlifedata.com/sparql")
    lld_sparql.setReturnFormat(JSON)

    ranking = Counter()

    # Chain generators for all values for the attributes of the patient
    features = itertools.chain(cg.objects(subject=patient, predicate=PO['hasIndication']),
                               cg.objects(subject=patient, predicate=PO['hasMeasurement']),
                               cg.objects(subject=patient, predicate=PO['usesMedication']),
                               cg.objects(subject=patient, predicate=PO['hadPreviousIndication']),
                               cg.objects(subject=patient, predicate=PO['hadRecentTreatment']))

    exp_features = set()
    q_part = ""
    # First get all sameAs uris for the values
    for f in features:
        if str(f).startswith('http://linkedlifedata.com'):
            exp_features.add(str(f))
            q_part += "{?altname owl:sameAs <"+f+"> .} UNION { <"+f+"> owl:sameAs ?altname .} UNION \n"
    # Strip the trailing " UNION \n" (8 characters) from the last clause.
    q_part = q_part[:-8]

    q = """
        PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
        PREFIX owl: <http://www.w3.org/2002/07/owl#>

        SELECT ?altname WHERE { """ + q_part + """ }
        """
    aers_sparql.setQuery(q)
    results = aers_sparql.query().convert()

    # Only query LLD for stuff that LLD knows about (saves quite some time)
    for result in results["results"]["bindings"]:
        if result["altname"]["value"].startswith('http://linkedlifedata.com'):
            exp_features.add(result["altname"]["value"])

    # Then lookup the publications that mention these, and add them to a tally (Counter)
    for ef in exp_features:
        q = """
        PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
        PREFIX lifeskim: <http://linkedlifedata.com/resource/lifeskim/>

        SELECT ?pubmed WHERE {
            ?pubmed lifeskim:mentions <"""+ef+"""> .
        } LIMIT 250
        """
        lld_sparql.setQuery(q)
        results = lld_sparql.query().convert()
        for result in results["results"]["bindings"]:
            ranking[result["pubmed"]["value"]] += 1

    # Return only the 50 most frequent publications
    ranking_json = json.dumps(ranking.most_common(50))
    # print ranking_json
    # NOTE(review): mimetype= is the pre-Django-1.7 keyword; modern Django
    # requires content_type= — confirm the target Django version.
    return HttpResponse(ranking_json, mimetype='application/json')
print("creating mapping ...") res = es_con.indices.put_mapping(index=index_name, doc_type="doc", body=thesaurus_mapping) print("resonse: {}".format(res)) i = 0 for uri in graph.query(querystring): this_uri = uri[0] doc = {"uri": this_uri} j = 0 for lang in ['ar', 'zh', 'en', 'fr', 'ru', 'es']: pref_labels = [] for label in graph.preferredLabel(URIRef(this_uri), lang): pref_labels.append(label[1]) doc.update({"labels_{}".format(lang): pref_labels}) alt_labels = [] for label in graph.objects(URIRef(this_uri), SKOS.altLabel): if label.language == lang: alt_labels.append(label) doc.update({"alt_labels_{}".format(lang): alt_labels}) payload = json.dumps(doc) res = es_con.index(index=index_name, doc_type='doc', body=payload) doc = {"uri": this_uri} j += 1 i += j if i % 50 == 0: print("{} fields indexed".format(i))
class PreProcessor(object):
    """Loads a knowledge graph and builds integer id dictionaries for its
    entities and relations, plus optional event-sequence preprocessing.

    Attributes:
        kg_path: path of the knowledge-graph file to load.
        ent_dict: entity URI (unicode) -> integer embedding id.
        rel_dict: relation URI (unicode) -> integer embedding id.
        g: the rdflib ConjunctiveGraph holding the loaded triples.
        unique_msgs: subset of ent_dict treated as event messages.
        merged: event dataframe; only set when amberg_params is used.
    """

    def __init__(self, kg_path):
        self.kg_path = kg_path
        self.ent_dict = dict()
        self.rel_dict = dict()
        self.g = ConjunctiveGraph()
        self.unique_msgs = self.ent_dict.copy()

    def load_knowledge_graph(self,
                             format='xml',
                             exclude_rels=None,
                             clean_schema=True,
                             amberg_params=None,
                             excluded_entities=None):
        """Load the graph from kg_path and populate the id dictionaries.

        :param format: rdflib parse format of the file at kg_path
        :param exclude_rels: relations whose triples are removed (default: none)
        :param clean_schema: also remove schema-level (class-construct) relations
        :param amberg_params: optional (path_to_events, max_events) tuple for
            the Amberg event-log extension
        :param excluded_entities: entities whose triples are removed, if given
        """
        # Fixed mutable-default-argument pitfall: build a fresh list per call
        # instead of sharing one list object across all invocations.
        if exclude_rels is None:
            exclude_rels = []
        self.g.load(self.kg_path, format=format)
        # remove triples with excluded relation
        remove_rel_triples(self.g, exclude_rels)
        # remove triples with relations between class-level constructs
        if clean_schema:
            remove_rel_triples(self.g, schema_relations)
        if excluded_entities is not None:
            remove_ent_triples(self.g, excluded_entities)
        if amberg_params:
            path_to_events, max_events = amberg_params
            self.merged = get_merged_dataframe(path_to_events, max_events)
            self.unique_msgs, unique_vars, unique_mods, unique_fes = get_unique_entities(
                self.merged)
            update_amberg_ontology(self.g, self.ent_dict, self.unique_msgs,
                                   unique_mods, unique_fes, unique_vars,
                                   self.merged)
        self.update_entity_relation_dictionaries()

    def update_entity_relation_dictionaries(self):
        """Assign integer ids to every entity and relation in the graph.

        Pre-existing entries in ent_dict keep their ids; new entities get the
        smallest free counter values.  Relations are numbered by insertion
        order.  Sorting the URIs first makes the numbering deterministic.
        """
        ent_counter = 0
        # Ids already claimed by pre-loaded entities must not be reused.
        # (Also avoids shadowing the builtin `id` the old comprehension used.)
        fixed_ids = set(self.ent_dict.values())
        # sorting ensures equal random splits on equal seeds
        for h in sorted(
                set(self.g.subjects(None, None)).union(
                    set(self.g.objects(None, None)))):
            uni_h = unicode(h)
            if uni_h not in self.ent_dict:
                while ent_counter in fixed_ids:
                    ent_counter += 1
                self.ent_dict.setdefault(uni_h, ent_counter)
                ent_counter += 1
        # add new relations to dict
        for r in sorted(set(self.g.predicates(None, None))):
            uni_r = unicode(r)
            if uni_r not in self.rel_dict:
                self.rel_dict.setdefault(uni_r, len(self.rel_dict))

    def load_unique_msgs_from_txt(self, path, max_events=None):
        """Load an `entity,id` CSV into ent_dict / unique_msgs.

        :param path: csv text file with two columns (entity, embedding id)
        :param max_events: when given, keep only the max_events entries with
            the lowest ids and return the keys of the dropped ones
        :return: excluded event keys when max_events is set, else None
        """
        with open(path, "rb") as f:
            for line in f:
                split = line.split(',')
                try:
                    emb_id = int(split[1].strip())
                except (IndexError, ValueError):
                    # Narrowed from a bare except: only malformed rows
                    # (missing column / non-numeric id) are skipped.
                    print("Error reading id of {0} in given dictionary".format(
                        line))
                    # skip this event entitiy, treat it as common entitiy later on
                    continue
                self.ent_dict[split[0]] = emb_id
        # sort ascending w.r.t. embedding id, in case of later stripping
        # self.ent_dict = sorted(self.ent_dict.items(), key=operator.itemgetter(1), reverse=False)
        self.unique_msgs = self.ent_dict.copy()
        if max_events is not None:
            all_msgs = sorted(self.unique_msgs.items(),
                              key=operator.itemgetter(1),
                              reverse=False)
            self.unique_msgs = dict(all_msgs[:max_events])
            excluded_events = dict(all_msgs[max_events:]).keys()
            return excluded_events

    def prepare_sequences(self, path_to_input, use_dict=True):
        """Read comma-separated integer id sequences from a text file.

        :param path_to_input: file with one comma-separated id sequence per line
        :param use_dict: when True, keep only ids present in unique_msgs
        :return: list of integer lists, one per input line
        """
        print("Preparing sequential data...")
        # Hoisted out of the loop: membership tests against dict.values()
        # are O(n) each; a set makes them O(1).
        known_ids = set(self.unique_msgs.values()) if use_dict else None
        with open(path_to_input, "rb") as f:
            result = []
            for line in f:
                entities = line.split(',')
                if use_dict:
                    result.append([
                        int(e.strip()) for e in entities
                        if int(e.strip()) in known_ids
                    ])
                else:
                    result.append([int(e.strip()) for e in entities])
        print("Processed {0} sequences".format(len(result)))
        return result

    def get_vocab_size(self):
        """Number of known event messages."""
        return len(self.unique_msgs)

    def get_ent_dict(self):
        """Entity -> id mapping."""
        return self.ent_dict

    def get_rel_dict(self):
        """Relation -> id mapping."""
        return self.rel_dict

    def get_kg(self):
        """The underlying rdflib graph."""
        return self.g

    def get_unique_msgs(self):
        """Event-message subset of the entity dictionary."""
        return self.unique_msgs

    def get_merged(self):
        """Merged event dataframe (only set after amberg_params loading)."""
        return self.merged
class KB4ITGraph:
    """RDF graph wrapper for KB4IT documents.

    Stores one triple per document attribute and exposes sorted,
    de-duplicated query helpers over the underlying rdflib graph.
    """

    def __init__(self, path=None):
        """Create the graph.

        With *path*, a persistent Sleepycat store is opened under
        ``<path>/kb4it.graph``; without it, everything lives in memory.
        """
        if path is None:
            # Volatile store: nothing touches the disk.
            self.graph = ConjunctiveGraph('IOMemory')
        else:
            # Persistent store backed by Sleepycat on disk.
            self.path = path
            self.graph = ConjunctiveGraph('Sleepycat', URIRef("kb4it://"))
            self.graph.store.open(path + SEP + 'kb4it.graph')
        # Register the project prefixes on a dedicated namespace manager.
        manager = NamespaceManager(ConjunctiveGraph())
        for prefix, uri in NSBINDINGS.items():
            manager.bind(prefix, uri)
        self.graph.namespace_manager = manager

    def __uniq_sort(self, result):
        """Return *result* as a sorted list without duplicates."""
        return sorted(set(result))

    def subjects(self, predicate, object):
        """Sorted, unique subjects matching (predicate, object)."""
        return self.__uniq_sort(self.graph.subjects(predicate, object))

    def predicates(self, subject=None, object=None):
        """Sorted, unique predicates matching (subject, object)."""
        return self.__uniq_sort(self.graph.predicates(subject, object))

    def objects(self, subject, predicate):
        """Sorted, unique objects matching (subject, predicate)."""
        return self.__uniq_sort(self.graph.objects(subject, predicate))

    def value(self, subject=None, predicate=None, object=None, default=None,
              any=True):
        """Delegate straight to rdflib's ``Graph.value``."""
        return self.graph.value(subject, predicate, object, default, any)

    def add_document(self, doc):
        """Register *doc* as a kb4it Document."""
        self.graph.add((URIRef(doc), RDF['type'], URIRef(KB4IT['Document'])))

    def add_document_attribute(self, doc, attribute, value):
        """Attach ``has<attribute> = value`` to *doc* as a literal triple."""
        self.graph.add((URIRef(doc), KB4IT['has%s' % attribute],
                        Literal(value)))

    def get_attributes(self):
        """All predicates in use, minus rdf:type and kb4it:hasTitle."""
        excluded = {RDF['type'], KB4IT['hasTitle']}
        return sorted(set(self.graph.predicates(None, None)) - excluded)

    def serialize(self):
        """Serialize graph to pretty xml format."""
        return self.graph.serialize(format='pretty-xml')

    def close(self):
        """Close the underlying store (relevant for the persistent case).

        FIXME: check if it is open
        """
        self.graph.store.close()