def test_query_ok(self):
    """Ingest the mapped board-game fixture into the 'master' virtuoso and
    check that a script defining nodes() and slice() gets an HTTP 200."""
    from webui.cnmain.utils import get_virtuoso

    mapped_prefix = settings.TRIPLE_DATABASE['PREFIXES']['data_graph_mapped']
    master = get_virtuoso('master')
    fixture_path = self._get_test_file(
        'boardgamegeek-games-mapped.nt', 'scheduler')
    master.ingest(fixture_path, graph=mapped_prefix + 'test_graph')

    # the script sent to the endpoint, kept apart from the call for clarity
    script = """
        def nodes() {
            return g.V('type', 'sd:BoardGame').id.collect{it}
        }
        def slice(nodes_id) {
            m = []
            nodes_id.each{ node_id ->
                g.v(node_id).transform{ node ->
                    data = [acheneID: node['sd:acheneID']]
                    data.provenance =
                        node.out('bristle').out('source').name.collect{
                            it}.join(',')
                    return data
                }.fill(m)
            }
            return m
        }
    """
    response = self._test_query(script)

    self.assertEqual(response.status_code, 200)
def test_query_ok(self):
    """A valid nodes()/slice() script, run after ingesting the mapped
    board-game fixture into 'master', must be answered with status 200."""
    from webui.cnmain.utils import get_virtuoso

    prefix = settings.TRIPLE_DATABASE['PREFIXES']['data_graph_mapped']
    store = get_virtuoso('master')
    nt_file = self._get_test_file(
        'boardgamegeek-games-mapped.nt', 'scheduler'
    )
    store.ingest(nt_file, graph=prefix + 'test_graph')

    query_text = """
        def nodes() {
            return g.V('type', 'sd:BoardGame').id.collect{it}
        }
        def slice(nodes_id) {
            m = []
            nodes_id.each{ node_id ->
                g.v(node_id).transform{ node ->
                    data = [acheneID: node['sd:acheneID']]
                    data.provenance =
                        node.out('bristle').out('source').name.collect{
                            it}.join(',')
                    return data
                }.fill(m)
            }
            return m
        }
    """
    reply = self._test_query(query_text)

    self.assertEqual(reply.status_code, 200)
def _clear_graphs():
    """Call clear_regex with the configured 'graph' prefix on both the
    'default' and the 'master' virtuoso instances."""
    from webui.cnmain.utils import get_virtuoso

    for instance_name in ('default', 'master'):
        store = get_virtuoso(instance_name)
        store.clear_regex(settings.TRIPLE_DATABASE['PREFIXES']['graph'])
def _clear_graphs():
    """Hand the configured 'graph' prefix to clear_regex, first on the
    'default' virtuoso instance and then on 'master'."""
    from webui.cnmain.utils import get_virtuoso

    for name in ('default', 'master'):
        instance = get_virtuoso(name)
        graph_pattern = settings.TRIPLE_DATABASE['PREFIXES']['graph']
        instance.clear_regex(graph_pattern)
def test_source_with_refine_rdf_rule(self):
    """Process the BoardGameTournament test source, attach the refine rule
    fixture to its archive item, process the source again and check the
    name stored for mapped rows "0" and "1"."""
    source = Source.objects.get(name='BoardGameTournament (test)')
    process_source.delay(source)

    rule_path = self._get_test_file(
        "boardgametournament_refine_rules.json", "cnmain")
    with open(rule_path) as rule_file:
        rule = rule_file.read()

    games = source.datasets.get(name="boardgametournament-games")
    archive_item = games.archive_items.get()
    archive_item.rule = RuleFactory(rule=rule, hash=archive_item.file_hash)
    archive_item.save(force_update=True)

    # second run: this time the archive item carries the rule
    process_source.delay(source)

    from webui.cnmain.utils import get_virtuoso
    virtuoso = get_virtuoso()

    expected_names = (("0", "Dominion"), ("1", "Carcassonne"))
    for row_number, game_name in expected_names:
        row_id = archive_item.datagraph_mapped_row_id(row_number)
        self._assert_description(virtuoso, row_id, [
            ("http://ontologies.venturi.eu/v1#name", game_name),
        ])
def test_source_scraperwiki(self):
    """Process the 'trentinocultura' source starting from empty Scheduler
    and ArchiveItem tables, then check the produced archive item and the
    metadata descriptions of the source and of its dataset."""
    Scheduler.objects.all().delete()
    ArchiveItem.objects.all().delete()

    source = Source.objects.get(name='trentinocultura')
    process_source.delay(source)

    dataset = source.datasets.get()
    # reuse the dataset fetched above instead of querying it a second time
    archive_item = dataset.archive_items.get()
    self._assert_archive_item(
        archive_item,
        (u'category', u'city', u'title', u'url', u'price', u'hours',
         u'website', u'phone', u'location', u'address', u'date', u'notes',
         u'email', u'organizer', u'other_info', u'fax'),
        49
    )

    from webui.cnmain.utils import get_virtuoso
    virtuoso = get_virtuoso()
    source_meta_id = source.metagraph_resource_id
    dataset_meta_id = dataset.metagraph_resource_id

    from rdflib import Namespace
    METAPROP = Namespace(settings.TRIPLE_DATABASE['PREFIXES']['meta'])
    SDOWL = Namespace(settings.TRIPLE_DATABASE['PREFIXES']['sdowl'])
    RDF_TYPE = 'http://www.w3.org/1999/02/22-rdf-syntax-ns#type'

    self._assert_description(virtuoso, source_meta_id, [
        (METAPROP['description'], source.description),
        (RDF_TYPE, SDOWL['Source'], 'iri'),
    ])
    self._assert_description(virtuoso, dataset_meta_id, [
        (METAPROP['download'], dataset.download),
        (RDF_TYPE, SDOWL['Dataset'], 'iri'),
        (SDOWL['belongs_to_source'], source_meta_id, 'iri'),
    ])
def test_query_ok(self):
    """After ingesting the mapped board-game fixture into 'master', check
    that the cleaned slicer yields a list header followed by at least one
    row, and that the plain slicer yields dicts."""
    mapped_prefix = settings.TRIPLE_DATABASE['PREFIXES']['data_graph_mapped']
    master = get_virtuoso('master')
    fixture_path = self._get_test_file(
        'boardgamegeek-games-mapped.nt', 'scheduler')
    master.ingest(fixture_path, graph=mapped_prefix + 'test_graph')

    cleaned_rows = get_cleaned_sliced_data(
        query=THE_QUERY, fields='acheneID,provenance', with_header=True)
    first_row = next(cleaned_rows)
    self.assertIsInstance(first_row, list)
    remaining = list(cleaned_rows)
    self.assertGreater(len(remaining), 0)

    plain_rows = list(get_sliced_data(
        query=THE_QUERY, fields='acheneID,provenance', with_header=False))
    self.assertIsInstance(plain_rows[0], dict)
def handle(self, *args, **options): """ entry point """ for instance in ('default', 'master'): print "Installing on virtuoso", instance virtuoso = get_virtuoso(instance) virtuoso.install_extensions() print
def main(args): """ the body of the script """ tmpdir = mkdtemp() if args.file.startswith(('http://', 'https://')): print "The file is in the net, downloading it..." file_basename = os.path.basename( urllib2.urlparse.urlsplit(args.file).path ) result = envoy.run('wget "{}" -O {}'.format(args.file, file_basename)) if result.status_code: print_error_result( result, "Error while downloading RDF data {}. Aborting".format( args.file ) ) exit(1) filename = os.path.join( tmpdir, file_basename ) else: print "The file is local, moving it..." shutil.copy(args.file, tmpdir) filename = os.path.join(tmpdir, os.path.basename(args.file)) print "handling file", filename filename_cropped, extension = os.path.splitext(filename) if extension == '.bz2': print "Got a bz2 file, need to convert it with gzip" gzip_filename = filename_cropped + '.gz' result = envoy.run('bunzip2 "{}" -c | gzip > "{}'.format( filename, gzip_filename )) if result.status_code: print_error_result(result, "Error while converting file, aborting") exit(2) filename = gzip_filename print "File converted successfully, now handling", filename print "Ingesting file in virtuoso" virtuoso = get_virtuoso() virtuoso.clear(args.graph) print "Ingestion completed", virtuoso.ingest(filename, graph=args.graph)
def test_query_ok(self):
    """With the mapped board-game fixture ingested into 'master', the
    cleaned slicer must yield a list header plus at least one row and the
    plain slicer must yield dicts."""
    prefix = settings.TRIPLE_DATABASE['PREFIXES']['data_graph_mapped']
    store = get_virtuoso('master')
    nt_file = self._get_test_file(
        'boardgamegeek-games-mapped.nt', 'scheduler'
    )
    store.ingest(nt_file, graph=prefix + 'test_graph')

    with_header = get_cleaned_sliced_data(
        query=THE_QUERY, fields='acheneID,provenance', with_header=True
    )
    header_row = next(with_header)
    self.assertIsInstance(header_row, list)
    data_rows = list(with_header)
    self.assertGreater(len(data_rows), 0)

    without_header = list(get_sliced_data(
        query=THE_QUERY, fields='acheneID,provenance', with_header=False
    ))
    self.assertIsInstance(without_header[0], dict)
def handle(self, *args, **options): """ Resets the virtuoso graph for this project. """ from webui.cnmain.utils import get_virtuoso got_graph_settings = self.get_graph_settings(*args, **options) if not got_graph_settings: raise CommandError("The --router option is mandatory") return virtuoso = get_virtuoso(self.router) cleared = virtuoso.clear_regex(r'.*') print "Cleared {} graphs".format(cleared)
def refresh_sources(source_id=None):
    """
    generate a .trig file for the source, and ingest it into virtuoso

    With a truthy ``source_id`` only that source is exported, otherwise all
    of them. Returns the number of triples written to the file.
    """
    if source_id:
        sources = [Source.objects.get(pk=source_id)]
        filename = 'source-{}.trig'.format(source_id)
    else:
        sources = Source.objects.all()
        filename = 'source-{}.trig'.format('all')

    def add_all(graph, quads):
        """ add every quad to graph, return how many were added """
        added = 0
        for quad in quads:
            graph.add_triple(quad)
            added += 1
        return added

    n_triples = 0
    clear_graphs = []
    with closing(TrigFile(filename)) as trig:
        meta_graph = trig.add_graph(PREFIXES['meta_graph'])
        for source in sources:
            # source metadata
            n_triples += add_all(meta_graph, source_meta_quads(source))
            for dataset in source.datasets.all():
                # dataset metadata
                n_triples += add_all(meta_graph, dataset_meta_quads(dataset))
                for archive_item in dataset.archive_items.all():
                    # archive item metadata
                    n_triples += add_all(
                        meta_graph, archive_item_meta_quads(archive_item))
                    # archive item data, in a graph of its own
                    data_graph = trig.add_graph(
                        archive_item.datagraph_raw_name)
                    clear_graphs.append(data_graph.name)
                    n_triples += add_all(
                        data_graph, archive_item_data_quads(archive_item))

    from webui.cnmain.utils import get_virtuoso
    virtuoso = get_virtuoso()
    logger.debug('ingesting {} into virtuoso'.format(filename))
    # the per-item data graphs are emptied before the fresh file is loaded
    virtuoso.clear(clear_graphs)
    virtuoso.ingest(filename)
    return n_triples
def main(args): """ the body of the script """ tmpdir = mkdtemp() if args.file.startswith(('http://', 'https://')): print "The file is in the net, downloading it..." file_basename = os.path.basename( urllib2.urlparse.urlsplit(args.file).path) result = envoy.run('wget "{}" -O {}'.format(args.file, file_basename)) if result.status_code: print_error_result( result, "Error while downloading RDF data {}. Aborting".format( args.file)) exit(1) filename = os.path.join(tmpdir, file_basename) else: print "The file is local, moving it..." shutil.copy(args.file, tmpdir) filename = os.path.join(tmpdir, os.path.basename(args.file)) print "handling file", filename filename_cropped, extension = os.path.splitext(filename) if extension == '.bz2': print "Got a bz2 file, need to convert it with gzip" gzip_filename = filename_cropped + '.gz' result = envoy.run('bunzip2 "{}" -c | gzip > "{}'.format( filename, gzip_filename)) if result.status_code: print_error_result(result, "Error while converting file, aborting") exit(2) filename = gzip_filename print "File converted successfully, now handling", filename print "Ingesting file in virtuoso" virtuoso = get_virtuoso() virtuoso.clear(args.graph) print "Ingestion completed", virtuoso.ingest(filename, graph=args.graph)
def refresh_sources(source_id=None):
    """
    generate a .trig file for the source, and ingest it into virtuoso

    A truthy ``source_id`` restricts the export to that source; otherwise
    every source is exported. The return value is the count of triples
    written.
    """
    if source_id:
        sources = [Source.objects.get(pk=source_id)]
        label = source_id
    else:
        sources = Source.objects.all()
        label = 'all'
    filename = 'source-{}.trig'.format(label)

    def write_quads(graph, quads):
        """ push the quads into graph and report how many there were """
        written = 0
        for quad in quads:
            graph.add_triple(quad)
            written += 1
        return written

    n_triples = 0
    clear_graphs = []
    with closing(TrigFile(filename)) as trig:
        meta_graph = trig.add_graph(PREFIXES['meta_graph'])
        for source in sources:
            # metadata of the source itself
            n_triples += write_quads(meta_graph, source_meta_quads(source))
            for dataset in source.datasets.all():
                # metadata of each dataset
                n_triples += write_quads(
                    meta_graph, dataset_meta_quads(dataset))
                for archive_item in dataset.archive_items.all():
                    # metadata of each archive item
                    n_triples += write_quads(
                        meta_graph, archive_item_meta_quads(archive_item))
                    # the item's data goes into its own raw data graph
                    data_graph = trig.add_graph(
                        archive_item.datagraph_raw_name)
                    clear_graphs.append(data_graph.name)
                    n_triples += write_quads(
                        data_graph, archive_item_data_quads(archive_item))

    from webui.cnmain.utils import get_virtuoso
    virtuoso = get_virtuoso()
    logger.debug('ingesting {} into virtuoso'.format(filename))
    # empty the per-item data graphs, then load the freshly written file
    virtuoso.clear(clear_graphs)
    virtuoso.ingest(filename)
    return n_triples
def test_source_archive(self):
    """Process the 'in-giro (locale)' source starting from empty Scheduler
    and ArchiveItem tables, then check both produced archive items and the
    metadata descriptions of the source and of its dataset."""
    Scheduler.objects.all().delete()
    ArchiveItem.objects.all().delete()

    source = Source.objects.get(name='in-giro (locale)')
    dataset = source.datasets.get()
    process_source.delay(source)

    # exactly two items are expected, told apart by their file_hash order
    items_by_hash = dataset.archive_items.all().order_by("file_hash")
    events_item, poi_event = items_by_hash

    poi_fields = (
        u'website', u'city', u'name', u'url', u'phone', u'address',
        u'location_type', u'description', u'province')
    self._assert_archive_item(poi_event, poi_fields, 158)

    event_fields = (
        u'city', u'description', u'url', u'date', u'location', u'genre',
        u'location_url')
    self._assert_archive_item(events_item, event_fields, 497)

    from webui.cnmain.utils import get_virtuoso
    virtuoso = get_virtuoso()
    source_meta_id = source.metagraph_resource_id
    dataset_meta_id = dataset.metagraph_resource_id

    from rdflib import Namespace
    prefixes = settings.TRIPLE_DATABASE['PREFIXES']
    METAPROP = Namespace(prefixes['meta'])
    SDOWL = Namespace(prefixes['sdowl'])
    RDF_TYPE = 'http://www.w3.org/1999/02/22-rdf-syntax-ns#type'

    expected_source = [
        (METAPROP['description'], source.description),
        (RDF_TYPE, SDOWL['Source'], 'iri'),
    ]
    self._assert_description(virtuoso, source_meta_id, expected_source)

    expected_dataset = [
        (METAPROP['download'], dataset.download),
        (RDF_TYPE, SDOWL['Dataset'], 'iri'),
        (SDOWL['belongs_to_source'], source_meta_id, 'iri'),
    ]
    self._assert_description(virtuoso, dataset_meta_id, expected_dataset)
def __aggregator_process_archiveitem(aggregator_archive_item, scheduler,
                                     tmpdir, context):
    """Run the aggregation workflow for one aggregator/archive-item pair.

    When the aggregator has a silk rule, a SILK configuration is rendered
    into ``tmpdir`` and SILK is run on it while its stderr is relayed to the
    logger. Then the archive item's mapped graph is dumped from the default
    virtuoso instance and loaded into the 'master' one, the SILK output (if
    any) is ingested there, and the workflow-success timestamps of
    ``aggregator_archive_item`` are updated and saved.

    ``scheduler.status`` is set to INCOMPLETE when there is no silk rule
    and to FAIL when a SILK log line is an exception or has a level listed
    in LEVEL_OUT. Returns None early when SILK reported an exception.
    Raises Exception when the dump or the load reports an error.
    """
    import envoy
    from django.template.loader import render_to_string
    from webui.cnmain.utils import get_virtuoso

    virtuoso_simple = get_virtuoso()
    virtuoso_master = get_virtuoso('master')
    loggy = local.logger
    aggregator = aggregator_archive_item.aggregator
    archive_item = aggregator_archive_item.archiveitem

    #
    # PART 1: generate XML file
    #
    loggy.debug("Processing " + unicode(archive_item))
    output_filename = None
    if not aggregator.silk_rule:
        loggy.warning('No silk rule found, skipping')
        scheduler.status = Scheduler.INCOMPLETE
    else:
        output_filename = os.path.join(tmpdir, archive_item.file_hash + '.nt')
        conf_filename = os.path.join(tmpdir,
                                     archive_item.file_hash + '_conf.xml')
        silk_conf_xml = render_to_string(
            'controller/aggregator/silk_rules.xml',
            dict(context, archive_item=archive_item,
                 output_filename=output_filename))
        with open(conf_filename, 'w') as fconf:
            fconf.write(silk_conf_xml)

        #
        # PART 2: execute SILK
        #
        loggy.info("Executing SILK on %s", unicode(archive_item))
        result = envoy.connect(
            'java -Xmx{} -DconfigFile={} -Dthreads={} '
            '-cp "{}:{}/*" de.fuberlin.wiwiss.silk.Silk'.format(
                settings.SILK_SINGLE_MACHINE_HEAP,
                conf_filename,
                settings.SILK_SINGLE_MACHINE_THREADS,
                SILK_JAR_PATH,
                SILK_LIB_PATH,
            ))
        # `level` is only replaced when a line starts with a known level,
        # so lines without one are handled with the level seen last
        level = None
        # 0: nothing wrong so far, 1: a LEVEL_OUT line seen, 2: exception
        status = 0
        titan_log_cnt = 0
        # pylint: disable=W0212
        # relay stderr line by line for as long as the process is running
        while result._process.poll() is None:
            line = result._process.stderr.readline()\
                .strip().replace('%', '%%')
            if not line:
                continue
            tmplevel = line.split(":", 1)[0]
            if tmplevel in LEVEL_LIST:
                level = tmplevel
            if line.startswith("Exception in thread"):
                level = "EXCEPTION"
            if level == "EXCEPTION":
                status = 2
                loggy.error("S> " + line)
            elif level in LEVEL_OUT:
                status = 1
                loggy.warn("S> " + line)
            elif re.search(r"Finished writing \d+ entities", line) or \
                    re.search(r"Got \d+ vertices", line) or \
                    re.search(r"Wrote \d+ links", line):
                loggy.info("S> " + line)
            elif re.search(r"Getting data for vertices", line):
                # only one in every 200 of these lines reaches the log
                if titan_log_cnt % 200 == 0:
                    loggy.info("S> " + line)
                titan_log_cnt += 1
        # pylint: enable=W0212
        if status:
            loggy.error("SILK failed on %s", unicode(archive_item))
            scheduler.status = Scheduler.FAIL
            # an exception aborts the workflow; status 1 carries on
            if status == 2:
                return
        else:
            loggy.info("SILK executed successfully")
            # loggy.debug("Generated file: %s", output_filename)

    #
    # PART 3: dump graph data
    #
    dump_dir = '{}/'.format(archive_item.file_hash)
    loggy.info("Creating a dump of the namedgraph {}".format(
        archive_item.datagraph_mapped_name))
    error = virtuoso_simple.dump_graph(archive_item.datagraph_mapped_name,
                                       dump_dir, create_dir=True)
    if error:
        loggy.error("Dump failed:")
        for line in error:
            loggy.error(line)
        raise Exception("Dump of the namedgraph failed: {}".format(error))

    #
    # PART 4: load graph data in the master virtuoso instance
    #
    # we are assuming that the two virtuoso are on the same machine
    loggy.info("Loading dump in the master graph as {}".format(
        archive_item.datagraph_mapped_name))
    # clear the entire named database before ingesting the data
    # since we're on titan we don't want this anymore
    # virtuoso_master.clear(archive_item.datagraph_mapped_name)

    # loggy.warning("Leaving data dump available for testing purposes")
    # error = virtuoso_master.load_graphs(dump_dir, remove_dir=False)
    error = virtuoso_master.load_graphs(dump_dir, remove_dir=True)
    if error:
        loggy.error("Load failed:")
        # the error is either a single string or an iterable of lines
        if isinstance(error, basestring):
            loggy.error(error)
        else:
            for line in error:
                loggy.error(line)
        raise Exception("Load of the namedgraph failed: {}".format(error))

    if aggregator.silk_rule:
        #
        # PART 5: load SILK generated tuples
        #
        loggy.info("Loading SILK generated tuples")
        virtuoso_master.ingest(
            output_filename,
            settings.TRIPLE_DATABASE['PREFIXES']['silk_graph'],
        )

    # the first success is recorded once, the last one on every run
    now = timezone.now()
    aggregator_archive_item.last_workflow_success = now
    if aggregator_archive_item.first_workflow_success is None:
        aggregator_archive_item.first_workflow_success = now
    aggregator_archive_item.save()
def get_context_data(self, **kwargs):
    """Build the context for the mapped-graph statistics of an archive item.

    On top of the parent context this adds ``archiveitems`` (an empty
    list), ``object`` (the archive item) and, for every statistic below,
    an entry named after it holding the rows fetched for its SPARQL query
    against the archive item's mapped graph.
    """
    from webui.cnmain.utils import get_virtuoso

    # pylint: disable=W0201
    self.object = archive_item = self.get_object()

    context = super(ArchiveItemMappedStatsView, self).get_context_data(
        **kwargs
    )
    context['archiveitems'] = []
    context['object'] = archive_item

    graph = archive_item.datagraph_mapped_name
    virtuoso = get_virtuoso()

    # (context key, SPARQL query) pairs; every key is listed only once so
    # that no query is run twice for the same context entry
    queries = [
        ('no_type', """
            SELECT (count(distinct ?resource) as ?count)
            WHERE {
              GRAPH <%s> {
                ?resource ?b ?c .
                OPTIONAL { ?resource rdf:type ?d . } .
                FILTER (!BOUND(?d)) .
              }
            }
        """ % graph),
        ('poi_no_achene', """
            PREFIX sd:<http://ontologies.venturi.eu/v1#>
            SELECT (count(distinct ?resource) as ?count)
            WHERE {
              GRAPH <%s> {
                ?resource rdf:type sd:POI .
                OPTIONAL { ?resource sd:acheneID ?achene . } .
                FILTER (!BOUND(?achene)) .
              }
            }
        """ % graph),
        ('poi_no_category', """
            PREFIX sd:<http://ontologies.venturi.eu/v1#>
            SELECT (count(distinct ?resource) as ?count)
            WHERE {
              GRAPH <%s> {
                ?resource rdf:type sd:POI .
                OPTIONAL { ?resource sd:category ?cat . } .
                FILTER (!BOUND(?cat)) .
              }
            }
        """ % graph),
        ('poi_old_style_category', """
            PREFIX sd:<http://ontologies.venturi.eu/v1#>
            SELECT (count(distinct ?resource) as ?count)
            WHERE {
              GRAPH <%s> {
                ?resource sd:category ?cat .
                FILTER (0 = regex(?cat, "%s[0-9a-f]{40}")) .
              }
            }
        """ % (graph, settings.TRIPLE_DATABASE['PREFIXES']['sdres'])),
        ('poi_latlon_and_geom', """
            PREFIX sd:<http://ontologies.venturi.eu/v1#>
            SELECT (count(distinct ?resource) as ?count)
            WHERE {
              GRAPH <%s> {
                { ?resource sd:latitude ?b }
                UNION { ?resource sd:longitude ?b }
                UNION { ?resource sd:geometry ?b }
              }
            }
        """ % graph),
        ('poi_without_any_geometry', """
            PREFIX sd:<http://ontologies.venturi.eu/v1#>
            SELECT (count(distinct ?resource) as ?count)
            WHERE {
              GRAPH <%s> {
                ?resource a sd:POI .
                OPTIONAL {?resource sd:geomPoint ?g1} .
                OPTIONAL {?resource sd:geomComplex ?g2} .
                FILTER (!BOUND(?g1))
                FILTER (!BOUND(?g2))
              }
            }
        """ % graph),
        ('poi_point_without_extra_info', """
            PREFIX sd:<http://ontologies.venturi.eu/v1#>
            SELECT (count(distinct ?resource) as ?count)
            WHERE {
              GRAPH <%s> {
                ?resource sd:geomPoint ?b .
                OPTIONAL {?resource sd:geomPointProvenance ?prov} .
                OPTIONAL {?resource sd:geomPointAccuracy ?acc} .
                FILTER (!BOUND(?prov))
                FILTER (!BOUND(?acc))
              }
            }
        """ % graph),
        ('poi_complex_without_extra_info', """
            PREFIX sd:<http://ontologies.venturi.eu/v1#>
            SELECT (count(distinct ?resource) as ?count)
            WHERE {
              GRAPH <%s> {
                ?resource sd:geomComplex ?b .
                OPTIONAL {?resource sd:geomComplexProvenance ?prov} .
                OPTIONAL {?resource sd:geomComplexAccuracy ?acc} .
                FILTER (!BOUND(?prov))
                FILTER (!BOUND(?acc))
              }
            }
        """ % graph),
        # the sd: prefix is declared here like in every other query that
        # refers to sd:POI
        ('poi_no_label', """
            PREFIX sd:<http://ontologies.venturi.eu/v1#>
            SELECT (count(distinct ?resource) as ?count)
            WHERE {
              GRAPH <%s> {
                ?resource rdf:type sd:POI .
                OPTIONAL { ?resource rdfs:label ?label . } .
                FILTER (!BOUND(?label)) .
              }
            }
        """ % graph),
        ('poi_no_name', """
            PREFIX sd:<http://ontologies.venturi.eu/v1#>
            SELECT (count(distinct ?resource) as ?count)
            WHERE {
              GRAPH <%s> {
                ?resource rdf:type sd:POI .
                OPTIONAL { ?resource sd:name ?name . } .
                FILTER (!BOUND(?name)) .
              }
            }
        """ % graph),
        ('poi_no_isinnuts', """
            PREFIX sd:<http://ontologies.venturi.eu/v1#>
            SELECT (count(distinct ?resource) as ?count)
            WHERE {
              GRAPH <%s> {
                ?resource rdf:type sd:POI .
                OPTIONAL { ?resource sd:isInNUTS ?nuts . } .
                FILTER (!BOUND(?nuts)) .
              }
            }
        """ % graph),
        ('poi_isinnuts_type', """
            PREFIX sd:<http://ontologies.venturi.eu/v1#>
            SELECT ?nutsType (count(distinct ?resource) AS ?cnt)
            WHERE {
              GRAPH <%s> { ?resource a sd:POI } .
              ?resource sd:isInNUTS ?nuts .
              OPTIONAL {?nuts a ?nutsType}
            }
            GROUP BY ?nutsType
        """ % graph),
    ]

    results = {
        key: virtuoso.client_query(query).fetchall()
        for key, query in queries
    }
    context.update(results)
    return context
def __aggregator_process_archiveitem(
        aggregator_archive_item, scheduler, tmpdir, context):
    """Run the aggregation workflow for one aggregator/archive-item pair.

    When the aggregator has a silk rule, a SILK configuration is rendered
    into ``tmpdir`` and SILK is run on it while its stderr is relayed to the
    logger. Then the archive item's mapped graph is dumped from the default
    virtuoso instance and loaded into the 'master' one, the SILK output (if
    any) is ingested there, and the workflow-success timestamps of
    ``aggregator_archive_item`` are updated and saved.

    ``scheduler.status`` is set to INCOMPLETE when there is no silk rule
    and to FAIL when a SILK log line is an exception or has a level listed
    in LEVEL_OUT. Returns None early when SILK reported an exception.
    Raises Exception when the dump or the load reports an error.
    """
    import envoy
    from django.template.loader import render_to_string
    from webui.cnmain.utils import get_virtuoso

    virtuoso_simple = get_virtuoso()
    virtuoso_master = get_virtuoso('master')
    loggy = local.logger
    aggregator = aggregator_archive_item.aggregator
    archive_item = aggregator_archive_item.archiveitem

    #
    # PART 1: generate XML file
    #
    loggy.debug("Processing " + unicode(archive_item))
    output_filename = None
    if not aggregator.silk_rule:
        loggy.warning('No silk rule found, skipping')
        scheduler.status = Scheduler.INCOMPLETE
    else:
        output_filename = os.path.join(
            tmpdir, archive_item.file_hash + '.nt'
        )
        conf_filename = os.path.join(
            tmpdir, archive_item.file_hash + '_conf.xml'
        )
        silk_conf_xml = render_to_string(
            'controller/aggregator/silk_rules.xml',
            dict(context, archive_item=archive_item,
                 output_filename=output_filename)
        )
        with open(conf_filename, 'w') as fconf:
            fconf.write(silk_conf_xml)

        #
        # PART 2: execute SILK
        #
        loggy.info("Executing SILK on %s", unicode(archive_item))
        result = envoy.connect(
            'java -Xmx{} -DconfigFile={} -Dthreads={} '
            '-cp "{}:{}/*" de.fuberlin.wiwiss.silk.Silk'.format(
                settings.SILK_SINGLE_MACHINE_HEAP,
                conf_filename,
                settings.SILK_SINGLE_MACHINE_THREADS,
                SILK_JAR_PATH,
                SILK_LIB_PATH,
            )
        )
        # `level` is only replaced when a line starts with a known level,
        # so lines without one are handled with the level seen last
        level = None
        # 0: nothing wrong so far, 1: a LEVEL_OUT line seen, 2: exception
        status = 0
        titan_log_cnt = 0
        # pylint: disable=W0212
        # relay stderr line by line for as long as the process is running
        while result._process.poll() is None:
            line = result._process.stderr.readline()\
                .strip().replace('%', '%%')
            if not line:
                continue
            tmplevel = line.split(":", 1)[0]
            if tmplevel in LEVEL_LIST:
                level = tmplevel
            if line.startswith("Exception in thread"):
                level = "EXCEPTION"
            if level == "EXCEPTION":
                status = 2
                loggy.error("S> " + line)
            elif level in LEVEL_OUT:
                status = 1
                loggy.warn("S> " + line)
            elif re.search(r"Finished writing \d+ entities", line) or \
                    re.search(r"Got \d+ vertices", line) or \
                    re.search(r"Wrote \d+ links", line):
                loggy.info("S> " + line)
            elif re.search(r"Getting data for vertices", line):
                # only one in every 200 of these lines reaches the log
                if titan_log_cnt % 200 == 0:
                    loggy.info("S> " + line)
                titan_log_cnt += 1
        # pylint: enable=W0212
        if status:
            loggy.error("SILK failed on %s", unicode(archive_item))
            scheduler.status = Scheduler.FAIL
            # an exception aborts the workflow; status 1 carries on
            if status == 2:
                return
        else:
            loggy.info("SILK executed successfully")
            # loggy.debug("Generated file: %s", output_filename)

    #
    # PART 3: dump graph data
    #
    dump_dir = '{}/'.format(archive_item.file_hash)
    loggy.info("Creating a dump of the namedgraph {}".format(
        archive_item.datagraph_mapped_name))
    error = virtuoso_simple.dump_graph(
        archive_item.datagraph_mapped_name, dump_dir, create_dir=True)
    if error:
        loggy.error("Dump failed:")
        for line in error:
            loggy.error(line)
        raise Exception("Dump of the namedgraph failed: {}".format(
            error
        ))

    #
    # PART 4: load graph data in the master virtuoso instance
    #
    # we are assuming that the two virtuoso are on the same machine
    loggy.info("Loading dump in the master graph as {}".format(
        archive_item.datagraph_mapped_name))
    # clear the entire named database before ingesting the data
    # since we're on titan we don't want this anymore
    # virtuoso_master.clear(archive_item.datagraph_mapped_name)

    # loggy.warning("Leaving data dump available for testing purposes")
    # error = virtuoso_master.load_graphs(dump_dir, remove_dir=False)
    error = virtuoso_master.load_graphs(dump_dir, remove_dir=True)
    if error:
        loggy.error("Load failed:")
        # the error is either a single string or an iterable of lines
        if isinstance(error, basestring):
            loggy.error(error)
        else:
            for line in error:
                loggy.error(line)
        raise Exception("Load of the namedgraph failed: {}".format(
            error
        ))

    if aggregator.silk_rule:
        #
        # PART 5: load SILK generated tuples
        #
        loggy.info("Loading SILK generated tuples")
        virtuoso_master.ingest(
            output_filename,
            settings.TRIPLE_DATABASE['PREFIXES']['silk_graph'],
        )

    # the first success is recorded once, the last one on every run
    now = timezone.now()
    aggregator_archive_item.last_workflow_success = now
    if aggregator_archive_item.first_workflow_success is None:
        aggregator_archive_item.first_workflow_success = now
    aggregator_archive_item.save()
def get_context_data(self, **kwargs):
    """Build the context for the mapped-graph statistics of an archive item.

    On top of the parent context this adds ``archiveitems`` (an empty
    list), ``object`` (the archive item) and, for every statistic below,
    an entry named after it holding the rows fetched for its SPARQL query
    against the archive item's mapped graph.
    """
    from webui.cnmain.utils import get_virtuoso

    # pylint: disable=W0201
    self.object = archive_item = self.get_object()

    context = super(ArchiveItemMappedStatsView,
                    self).get_context_data(**kwargs)
    context['archiveitems'] = []
    context['object'] = archive_item

    graph = archive_item.datagraph_mapped_name
    virtuoso = get_virtuoso()

    # (context key, SPARQL query) pairs; every key is listed only once so
    # that no query is run twice for the same context entry
    queries = [
        ('no_type', """
            SELECT (count(distinct ?resource) as ?count)
            WHERE {
              GRAPH <%s> {
                ?resource ?b ?c .
                OPTIONAL { ?resource rdf:type ?d . } .
                FILTER (!BOUND(?d)) .
              }
            }
        """ % graph),
        ('poi_no_achene', """
            PREFIX sd:<http://ontologies.venturi.eu/v1#>
            SELECT (count(distinct ?resource) as ?count)
            WHERE {
              GRAPH <%s> {
                ?resource rdf:type sd:POI .
                OPTIONAL { ?resource sd:acheneID ?achene . } .
                FILTER (!BOUND(?achene)) .
              }
            }
        """ % graph),
        ('poi_no_category', """
            PREFIX sd:<http://ontologies.venturi.eu/v1#>
            SELECT (count(distinct ?resource) as ?count)
            WHERE {
              GRAPH <%s> {
                ?resource rdf:type sd:POI .
                OPTIONAL { ?resource sd:category ?cat . } .
                FILTER (!BOUND(?cat)) .
              }
            }
        """ % graph),
        ('poi_old_style_category', """
            PREFIX sd:<http://ontologies.venturi.eu/v1#>
            SELECT (count(distinct ?resource) as ?count)
            WHERE {
              GRAPH <%s> {
                ?resource sd:category ?cat .
                FILTER (0 = regex(?cat, "%s[0-9a-f]{40}")) .
              }
            }
        """ % (graph, settings.TRIPLE_DATABASE['PREFIXES']['sdres'])),
        ('poi_latlon_and_geom', """
            PREFIX sd:<http://ontologies.venturi.eu/v1#>
            SELECT (count(distinct ?resource) as ?count)
            WHERE {
              GRAPH <%s> {
                { ?resource sd:latitude ?b }
                UNION { ?resource sd:longitude ?b }
                UNION { ?resource sd:geometry ?b }
              }
            }
        """ % graph),
        ('poi_without_any_geometry', """
            PREFIX sd:<http://ontologies.venturi.eu/v1#>
            SELECT (count(distinct ?resource) as ?count)
            WHERE {
              GRAPH <%s> {
                ?resource a sd:POI .
                OPTIONAL {?resource sd:geomPoint ?g1} .
                OPTIONAL {?resource sd:geomComplex ?g2} .
                FILTER (!BOUND(?g1))
                FILTER (!BOUND(?g2))
              }
            }
        """ % graph),
        ('poi_point_without_extra_info', """
            PREFIX sd:<http://ontologies.venturi.eu/v1#>
            SELECT (count(distinct ?resource) as ?count)
            WHERE {
              GRAPH <%s> {
                ?resource sd:geomPoint ?b .
                OPTIONAL {?resource sd:geomPointProvenance ?prov} .
                OPTIONAL {?resource sd:geomPointAccuracy ?acc} .
                FILTER (!BOUND(?prov))
                FILTER (!BOUND(?acc))
              }
            }
        """ % graph),
        ('poi_complex_without_extra_info', """
            PREFIX sd:<http://ontologies.venturi.eu/v1#>
            SELECT (count(distinct ?resource) as ?count)
            WHERE {
              GRAPH <%s> {
                ?resource sd:geomComplex ?b .
                OPTIONAL {?resource sd:geomComplexProvenance ?prov} .
                OPTIONAL {?resource sd:geomComplexAccuracy ?acc} .
                FILTER (!BOUND(?prov))
                FILTER (!BOUND(?acc))
              }
            }
        """ % graph),
        # the sd: prefix is declared here like in every other query that
        # refers to sd:POI
        ('poi_no_label', """
            PREFIX sd:<http://ontologies.venturi.eu/v1#>
            SELECT (count(distinct ?resource) as ?count)
            WHERE {
              GRAPH <%s> {
                ?resource rdf:type sd:POI .
                OPTIONAL { ?resource rdfs:label ?label . } .
                FILTER (!BOUND(?label)) .
              }
            }
        """ % graph),
        ('poi_no_name', """
            PREFIX sd:<http://ontologies.venturi.eu/v1#>
            SELECT (count(distinct ?resource) as ?count)
            WHERE {
              GRAPH <%s> {
                ?resource rdf:type sd:POI .
                OPTIONAL { ?resource sd:name ?name . } .
                FILTER (!BOUND(?name)) .
              }
            }
        """ % graph),
        ('poi_no_isinnuts', """
            PREFIX sd:<http://ontologies.venturi.eu/v1#>
            SELECT (count(distinct ?resource) as ?count)
            WHERE {
              GRAPH <%s> {
                ?resource rdf:type sd:POI .
                OPTIONAL { ?resource sd:isInNUTS ?nuts . } .
                FILTER (!BOUND(?nuts)) .
              }
            }
        """ % graph),
        ('poi_isinnuts_type', """
            PREFIX sd:<http://ontologies.venturi.eu/v1#>
            SELECT ?nutsType (count(distinct ?resource) AS ?cnt)
            WHERE {
              GRAPH <%s> { ?resource a sd:POI } .
              ?resource sd:isInNUTS ?nuts .
              OPTIONAL {?nuts a ?nutsType}
            }
            GROUP BY ?nutsType
        """ % graph),
    ]

    results = {
        key: virtuoso.client_query(query).fetchall()
        for key, query in queries
    }
    context.update(results)
    return context
def setUpClass(cls):
    """Store on the class a handle to the default virtuoso instance
    (``virtuoso``) and one to the 'master' instance (``virtuoso_master``)."""
    from webui.cnmain.utils import get_virtuoso

    default_store = get_virtuoso()
    cls.virtuoso = default_store
    master_store = get_virtuoso('master')
    cls.virtuoso_master = master_store