def getDBpediaN3(self):
    """
    Test if DBpedia n-triples retrieval is working.
    :return: a dict mapping DBpedia n-triples urls to their fetched content
    """
    results = get_curling(
        'http://hypermedia.projectchronos.eu/sparql',
        {
            'query': 'SELECT * WHERE { ?planets <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://ontology.projectchronos.eu/astronomy/Planet> . }'
        }
    )
    results = json.loads(results)
    print results['results']['bindings']
    urls = [r['planets']['value'] for r in results['results']['bindings']]
    print urls[0]

    def get_link(url):
        rdf = get_curling(url, {'format': 'jsonld'})
        rdf = json.loads(rdf)
        sameas = rdf['owl:sameAs']
        return sameas

    from flankers.extCaching import dbpedia_url
    n3s = dict()
    for u in urls:
        l = get_link(u)
        j = dbpedia_url(l)
        n3s[j] = get_curling(j)
    return n3s
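# `get_curling` (imported elsewhere from scripts.remote.remote, see
# test_crawl_local below) is the GET helper every test here relies on. Its
# implementation is not part of this module; a minimal sketch under the
# assumption that it url-encodes optional query params, issues a GET and
# returns the response body as a string:
def get_curling(url, params=None):
    # Hypothetical stand-in for the real helper defined in scripts.remote.remote
    import urllib
    if params:
        url = url + '?' + urllib.urlencode(params)
    return urllib.urlopen(url).read()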
def test_articles_api_type_view(self):
    """
    Test the Articles JSON API: /articles/<version>/?type_of=
    """
    _VERSION = "v04"
    print "Running test_articles TYPE_OF"
    import urllib
    env = self.test_env
    base_url = _ENV[env]['_SERVICE'] + "/articles/" + _VERSION + "/by?type=feed"
    first = get_curling(base_url)
    first = test_integrity(first)
    bookmark = first['next']
    print bookmark
    for i in range(0, 600):  # raise the upper bound of the range to test more pages
        print i
        if bookmark:
            count_ = 0
            response = urllib.urlopen(bookmark).read()
            response = test_integrity(response)
            for a in response['articles']:
                # print a['uuid']
                count_ += 1
            bookmark = response['next']
            print count_, i, bookmark
        else:
            print 'Articles by_type finished'
            return None
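# `test_integrity` is referenced throughout but not defined in this module.
# From its call sites it takes the raw JSON body of a response, verifies it
# parses, and returns the parsed object. A minimal sketch under that
# assumption (the real helper may check more than this):
def test_integrity(raw_response):
    # Hypothetical stand-in: json.loads raises ValueError on a malformed body
    parsed = json.loads(raw_response)
    assert isinstance(parsed, (dict, list)), "unexpected JSON payload"
    return parsed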
def test_json(self):
    """
    Test content of the /database/cots/ endpoint and if contained urls are reachable
    :param env: 'offline' for localhost, 'online' for remote
    """
    print "Running test_json"
    import urllib
    env = self.test_env
    base_url = _ENV[env]['_SERVICE'] + "/database/cots/"
    response = get_curling(base_url)
    response = json.loads(response)
    props = [
        "go_to_collection",
        "collection_ld+json_description",
        "collection_n-triples_description",
        "name"
    ]
    # `families` is assumed to be defined at module level
    names = [f.split('_')[1] for f in families]
    for r in response:
        assert all(rp in props for rp in r.keys())
    print "Testing urls in the response. Wait..."
    for r in response:
        assert all(
            urllib.urlopen(v).getcode() in (200, 301) if k != 'name' else v in names
            for k, v in r.items()
        )
def create_concepts_triples(self, uuid):
    """
    Fetch concepts related to a webresource and create triples
    :param uuid: the unique id of the webresource
    :return: a list of (subject, predicate, object) rdflib triples
    """
    from rdflib import URIRef
    concepts = get_curling(
        _ENV[self.test_env]['_SERVICE'] + '/datastore/concepts',
        {
            'retrieve': uuid,
            'token': _CLIENT_TOKEN
        }
    )
    concepts = json.loads(concepts)['concepts']
    triples = []
    if concepts:
        for c in concepts:
            subject = URIRef(_SERVICE + '/data/webresource/' + str(uuid))
            predicate = URIRef('http://ontology.projectchronos.eu/chronos/relConcept')
            robject = URIRef(_SERVICE + '/data/concept/' + c)
            triples.append((subject, predicate, robject))
    return triples
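# Usage sketch: the triples returned above fit straight into an rdflib Graph,
# which can then be serialized, e.g. as n-triples. The method name and uuid
# below are hypothetical; only create_concepts_triples exists in this module.
def dump_triples_example(self, uuid='some-uuid'):
    from rdflib import Graph
    g = Graph()
    for t in self.create_concepts_triples(uuid):
        g.add(t)  # rdflib expects exactly the (s, p, o) tuples built above
    print g.serialize(format='nt')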
def download_ids_generator(self, results=(), bookmark='start', environment='offline'):
    """
    Recursive - Fetch and collect all the resources' ids in the datastore
    :param results: list of the collected ids
    :param bookmark: bookmark to fetch different datastore's pages
    :return: results list

    # USAGE
    iterated = download_ids_generator(environment='offline')
    for uuid in iterated:
        print uuid
    """
    import itertools
    to_append = get_curling(
        _ENV[self.test_env]['_SERVICE'] + '/datastore/index',
        {'token': _CLIENT_TOKEN,
         'bookmark': bookmark if bookmark != 'start' else ''}
    )
    to_append = json.loads(to_append)
    if not to_append['next']:
        return itertools.chain(results, iter(to_append['articles']))
    return self.download_ids_generator(
        results=itertools.chain(results, iter(to_append['articles'])),
        bookmark=to_append['next']
    )
def fetch_and_dump_webresources(self, bookmark='start'):
    """
    Recursive - Fetch all the resources' ids in the datastore and dump them
    into the triple store. Dump also the related keywords/concepts in the datastore.
    :param bookmark: bookmark to fetch different datastore's pages
    :return: None
    """
    print "Fetching page: " + bookmark
    to_append = get_curling(
        _ENV[self.test_env]['_SERVICE'] + '/datastore/index',
        {'token': _CLIENT_TOKEN,
         'bookmark': bookmark if bookmark != 'start' else ''}
    )
    to_append = json.loads(to_append)
    shard_url = _ENV[self.test_env]['_SERVICE'] + '/sparql'
    # dump the single WebResource in the dedicated graph
    self.dump_webresources_to_graph(list_of_ids=to_append['articles'], url=shard_url)
    # dump the concepts related to this webresource in the dedicated graph
    self.dump_concepts_to_graph(list_of_ids=to_append['articles'], url=shard_url)
    if not to_append['next']:
        return None
    return self.fetch_and_dump_webresources(bookmark=to_append['next'])
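# The dump helpers called above are not shown in this module. A plausible
# minimal sketch of dump_concepts_to_graph, assuming the /sparql shard
# accepts SPARQL Update strings posted with post_curling (an assumption:
# these tests only show get_curling calls against /sparql):
def dump_concepts_to_graph(self, list_of_ids, url):
    for uuid in list_of_ids:
        triples = self.create_concepts_triples(uuid)
        statements = ' '.join(
            '<%s> <%s> <%s> .' % (s, p, o) for s, p, o in triples
        )
        if statements:
            post_curling(url, {'query': 'INSERT DATA { %s }' % statements,
                               'token': _CLIENT_TOKEN})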
def set_typeof_and_ingraph_properties(self, bookmark='start'):
    print "Fetching page: " + bookmark
    articles = get_curling(
        _ENV[self.test_env]['_SERVICE'] + '/datastore/index',
        {'token': _CLIENT_TOKEN,
         'bookmark': bookmark if bookmark != 'start' else ''}
    )
    articles = json.loads(articles)
    # fetch single article
    for a in articles['articles']:
        res = get_curling(
            _ENV[self.test_env]['_SERVICE'] + '/datastore/webresource',
            {
                'retrieve': a,
                'token': _CLIENT_TOKEN
            }
        )
        res = json.loads(res)
        # was `if not res['type_of'] in res.keys()`, which raises KeyError
        # when the key is missing; the intent is to test for the key itself
        if 'type_of' not in res:
            try:
                int(res['title'])
                update = {'type_of': 'tweet', 'in_graph': False}
            except Exception:
                if res['title'] == '':
                    if res['url'].endswith(('jpg', 'jpeg', 'png', 'mp3', 'mp4')):
                        update = {'type_of': 'media', 'in_graph': False}
                    else:
                        update = {'type_of': 'link', 'in_graph': False}
                else:
                    update = {'type_of': 'feed', 'in_graph': False}
            print update
            post_curling(
                _ENV[self.test_env]['_SERVICE'] + '/datastore/webresource',
                {
                    'token': _CLIENT_TOKEN,
                    'update': a,
                    'properties': json.dumps(update)
                },
                display=True
            )
    if not articles['next']:
        return None
    # recurse on this method (the original called fetch_and_dump_webresources,
    # which looks like a copy-paste slip)
    return self.set_typeof_and_ingraph_properties(bookmark=articles['next'])
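# The classification rules above could be factored into a small pure helper,
# which makes them easy to unit-test without touching the datastore. A sketch
# (the helper name is hypothetical, not part of the codebase):
def classify_webresource(title, url):
    # all-digit titles are treated as tweet ids
    try:
        int(title)
        return 'tweet'
    except (TypeError, ValueError):
        pass
    if title == '':
        # untitled resources: media if the url points to a known media file
        if url.endswith(('jpg', 'jpeg', 'png', 'mp3', 'mp4')):
            return 'media'
        return 'link'
    return 'feed'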
def test_indexer_base(self):
    _VERSION = "v04"
    print "Running test_indexer_base"
    env = self.test_env
    base_url = _ENV[env]['_SERVICE'] + "/articles/" + _VERSION + "/indexer"
    response = get_curling(base_url)
    test_integrity(response)
    print "Counted keywords: " + str(json.loads(response)['n_indexed'])
def test_sparql(self):
    """
    Test the /sparql endpoint against a set of sample queries
    """
    print "Running test_sparql"
    env = self.test_env
    base_url = _ENV[env]['_SERVICE'] + "/sparql"
    queries = [
        "SELECT * WHERE { ?satellites <http://ontology.projectchronos.eu/astronomy/orbitsPlanet> ?planets. }",
        "SELECT * WHERE { ?planets <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://ontology.projectchronos.eu/astronomy/TerrestrialPlanet> . }",
        "SELECT * WHERE { ?satellites <http://ontology.projectchronos.eu/astronomy/orbitsPlanet> <http://ontology.projectchronos.eu/solarsystem/Saturn>. }"
    ]
    responses = [get_curling(base_url, {'query': q}) for q in queries]
    for i, r in enumerate(responses):
        print i, r
        test_integrity(r)
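# A small helper sketch for unpacking the standard SPARQL JSON results
# layout returned by the endpoint (the helper name is hypothetical; the
# structure matches what getDBpediaN3 reads above):
def print_bindings(raw_response):
    bindings = json.loads(raw_response)['results']['bindings']
    for b in bindings:
        # each binding maps a variable name to a {'type': ..., 'value': ...} dict
        print dict((k, v['value']) for k, v in b.items())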
def test_articles(self):
    """
    Test the NL API: /visualize/articles/?api=true
    """
    print "Running test_articles"
    import urllib
    env = self.test_env
    base_url = _ENV[env]['_SERVICE'] + "/visualize/articles/"
    first = get_curling(base_url, {'api': 'true'})
    first = test_integrity(first)
    bookmark = first['next']
    print bookmark
    for i in range(0, 5):
        print i
        response = urllib.urlopen(bookmark).read()
        response = test_integrity(response)
        bookmark = response['next']
        print i, bookmark
def test_articles(self):
    """
    Test the NL API: /articles/?api=true
    """
    print "Running test_articles"
    import urllib
    env = self.test_env
    base_url = _ENV[env]['_SERVICE'] + "/articles/"
    first = get_curling(base_url, {'api': 'true'})
    first = test_integrity(first)
    bookmark = first['next']
    print bookmark
    for i in range(0, 5):
        print i
        response = urllib.urlopen(bookmark).read()
        response = test_integrity(response)
        bookmark = response['next']
        print i, bookmark
def test_crawl_local(self):
    url = "http://localhost:8080/cron/startcrawling"
    from scripts.remote.remote import get_curling
    res = get_curling(url)
    print res
def get_link(url):
    rdf = get_curling(url, {'format': 'jsonld'})
    rdf = json.loads(rdf)
    sameas = rdf['owl:sameAs']
    return sameas
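# Usage sketch (the resource url below is hypothetical): fetch the JSON-LD
# description of a hypermedia resource and read its owl:sameAs link, which
# getDBpediaN3 above then resolves to a DBpedia n-triples url.
link = get_link('http://hypermedia.projectchronos.eu/data/solarsystem/Mars')
print link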