def update_dataset_archives(sender, update_definition, store_graphs, when, **kwargs):
    for store in store_graphs:
        graph_names = store_graphs[store]
        endpoint = Endpoint(store.query_endpoint)

        if DATASET_NOTATION:
            notation_clause = """
    OPTIONAL {{ ?dataset skos:notation ?notation . FILTER (DATATYPE(?notation) = {0}) }}""".format(DATASET_NOTATION.n3())
        else:
            notation_clause = ""

        query = """
    SELECT ?dataset ?notation WHERE {{
      VALUES ?graph {{ {0} }}
      ?graph void:inDataset ?dataset .
      {1}
    }}""".format(" ".join(g.n3() for g in graph_names), notation_clause)

        datasets = dict(endpoint.query(query))
        logger.debug("Found %d datasets to archive", len(datasets))

        for dataset in datasets:
            logger.debug("Archiving dataset: %s", dataset)
            notation = datasets[dataset]
            archiver = DatasetArchiver(store, dataset, notation, when)
            archiver.archive()
def execute(self, transform_manager, input):
    transform_manager.start(self, [input])
    logger.debug("Starting upload of %r", input)

    client = self.get_redis_client()

    extension = input.rsplit('.', 1)[-1]
    try:
        serializer = self.formats[extension]
    except KeyError:
        logger.exception("Unrecognized RDF extension: %r", extension)
        raise

    graph = rdflib.ConjunctiveGraph()
    graph.parse(open(input, 'r'), format=serializer, publicID=self.graph_name)

    logger.debug("Parsed graph")

    datetime_now = self.site_timezone.localize(datetime.datetime.now().replace(microsecond=0))
    modified = graph.value(self.graph_name, NS['dcterms'].modified,
                           default=rdflib.Literal(datetime_now))
    created = graph.value(self.graph_name, NS['dcterms'].created)
    if not created:
        logger.debug("Getting created date from %r", transform_manager.store.query_endpoint)
        endpoint = Endpoint(transform_manager.store.query_endpoint)
        results = list(endpoint.query(self.created_query % {'graph': self.graph_name.n3()}))
        if results:
            created = results[0].date
        else:
            created = modified

    graph += (
        (self.graph_name, NS['dcterms'].modified, modified),
        (self.graph_name, NS['dcterms'].created, created),
    )

    logger.debug("About to serialize")

    output = transform_manager('rdf')
    with open(output, 'w') as f:
        graph.serialize(f)

    logger.debug("Serialization done; about to upload")

    uploader = Uploader()
    uploader.upload(stores=(transform_manager.store,),
                    graph_name=self.graph_name,
                    filename=output,
                    method=self.method,
                    mimetype='application/rdf+xml')

    logger.debug("Upload complete")

    transform_manager.end([self.graph_name])
    transform_manager.touched_graph(self.graph_name)
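# The `created_query` class attribute interpolated above is not shown in this
# snippet. A minimal sketch of what it might look like -- an assumption, not
# the project's actual query -- it only needs to bind ?date for the graph,
# since the code reads results[0].date:
created_query = """
    SELECT ?date WHERE {
      %(graph)s dcterms:created ?date
    }
"""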
def execute(self, transform_manager):
    endpoint = Endpoint(transform_manager.store.query_endpoint,
                        preferred_media_types=('text/plain',))

    if isinstance(self.query, basestring):
        query = self.query
    else:
        query_filename = self.query.execute(transform_manager)
        with open(query_filename, 'r') as query_file:
            query = query_file.read()

    with open(transform_manager('nt'), 'w') as output:
        transform_manager.start(self, [])
        serialize(endpoint.query(query, defer=True), output)
        transform_manager.end([output.name])
        return output.name
def update_dataset_archives(update_log, graphs, updated):
    if not getattr(settings, 'ARCHIVE_PATH', None):
        return
    updated = updated.replace(microsecond=0)

    for store_slug in graphs:
        store = Store.objects.get(slug=store_slug)
        graph_names = graphs[store.slug]
        endpoint = Endpoint(store.query_endpoint)

        query = "SELECT ?dataset WHERE { %s }" % " UNION ".join(
            "{ %s void:inDataset ?dataset }" % g.n3()
            for s, g in graph_names if s is None)
        datasets = set(r['dataset'] for r in endpoint.query(query))

        for dataset in datasets:
            # Collect every graph in the dataset, not just the ones that were
            # updated, so the archive covers the whole dataset.
            query = "SELECT ?graph WHERE { ?graph void:inDataset %s }" % dataset.n3()
            dataset_graphs = set(r['graph'] for r in endpoint.query(query))
            update_dataset_archive(dataset, store, dataset_graphs, updated)
def update_for_store(self, index, store):
    hash_key = 'humfrey:elasticsearch:indices:%s:%s' % (index.slug, store.slug)

    endpoint = Endpoint(store.query_endpoint)
    results = endpoint.query(index.query)

    try:
        urllib2.urlopen(index.get_index_status_url(store))
        index_exists = True
    except urllib2.HTTPError, e:
        if e.code == httplib.NOT_FOUND:
            index_exists = False
            index.update_mapping = True
            request = urllib2.Request(index.get_index_url(store))
            request.get_method = lambda: 'PUT'
            urllib2.urlopen(request)
        else:
            raise
def execute(self, transform_manager):
    endpoint = Endpoint(transform_manager.store.query_endpoint)

    if isinstance(self.query, basestring):
        query = self.query
    else:
        query_filename = self.query.execute(transform_manager)
        with open(query_filename, 'r') as query_file:
            query = query_file.read()

    with open(transform_manager('nt'), 'w') as output:
        transform_manager.start(self, [])
        result = endpoint.query(query)
        result.serialize(output, 'nt')
        transform_manager.end([output.name])
        return output.name
class DatasetArchiver(object):
    def __init__(self, store, dataset, notation, updated):
        self.store = store
        self.dataset = dataset
        self.notation = notation
        self.updated = updated.replace(microsecond=0)
        self.endpoint = Endpoint(store.query_endpoint)

    @property
    def graph_names(self):
        if not hasattr(self, '_graphs'):
            query = "SELECT ?graph WHERE {{ ?graph void:inDataset/^void:subset* {0} }}".format(self.dataset.n3())
            self._graphs = set(r['graph'] for r in self.endpoint.query(query))
        return self._graphs

    def _graph_created(self, graph_name):
        query = "SELECT ?created WHERE {{ {0} dcterms:created ?created }}".format(graph_name.n3())
        results = self.endpoint.query(query)
        if results:
            return results[0].created
        else:
            return rdflib.Literal(self.updated)

    def _graph_triples(self, out, graph_name):
        url = '%s?%s' % (self.store.graph_store_endpoint,
                         urllib.urlencode({'graph': graph_name}))
        request = urllib2.Request(url)
        request.add_header('Accept', 'text/plain')

        try:
            response = urllib2.urlopen(request)
        except urllib2.HTTPError, e:
            if e.code == 404:
                logger.warning("Graph not found: %s", graph_name)
            else:
                logger.exception("HTTPError %d for %s: %s", e.code, graph_name, e.read())
            return

        while True:
            chunk = response.read(4096)
            if not chunk:
                break
            out.write(chunk)
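# DatasetArchiver.archive(), called by the signal handler above, is not part of
# this snippet. A minimal sketch of what such a method might do, assuming it
# dumps every graph in the dataset to an N-Triples file under
# settings.ARCHIVE_PATH; the directory layout and file naming here are
# illustrative guesses, and `os` is assumed to be imported at module level:
    def archive(self):
        name = unicode(self.notation) if self.notation else self.dataset.rsplit('/', 1)[-1]
        archive_dir = os.path.join(settings.ARCHIVE_PATH, name)
        if not os.path.exists(archive_dir):
            os.makedirs(archive_dir)
        archive_file = os.path.join(archive_dir,
                                    '%s.nt' % self.updated.strftime('%Y%m%d-%H%M%S'))
        # Stream each graph's triples straight into the archive file.
        with open(archive_file, 'w') as out:
            for graph_name in self.graph_names:
                self._graph_triples(out, graph_name)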
def update_for_store(self, index, store):
    hash_key = 'humfrey:elasticsearch:indices:%s:%s' % (index.slug, store.slug)

    endpoint = Endpoint(store.query_endpoint)
    logger.debug("Performing SPARQL query.", extra={'query': index.query})
    results = endpoint.query(index.query, defer=True).get_bindings()
    logger.debug("SPARQL server started returning results.")

    try:
        urllib2.urlopen(index.get_index_status_url(store))
        index_exists = True
    except urllib2.HTTPError, e:
        if e.code == httplib.NOT_FOUND:
            logger.info("Index %s/%s did not previously exist", store.slug, index.slug)
            index_exists = False
            index.update_mapping = True
            request = urllib2.Request(index.get_index_url(store))
            request.get_method = lambda: 'PUT'
            urllib2.urlopen(request)
        else:
            raise
def endpoint(self):
    if not hasattr(self, '_endpoint'):
        if isinstance(self, ContentNegotiatedView):
            self.set_renderers(early=True)
            preferred_media_types = [m.value
                                     for r in self.request.renderers
                                     for m in r.mimetypes]
        else:
            preferred_media_types = ()
        self._endpoint = Endpoint(self.store.query_endpoint,
                                  preferred_media_types=preferred_media_types)
    return self._endpoint
def execute(self, transform_manager, input):
    transform_manager.start(self, [])

    endpoint = Endpoint(transform_manager.store.query_endpoint)
    for normalization in self.normalizations:
        normalization.endpoint = endpoint
        normalization.store = transform_manager.store

    while self.normalizations:
        with open(input, 'r') as source:
            pipeline = parse(source).get_triples()
            for normalization in self.normalizations:
                pipeline = normalization(pipeline)
            with open(transform_manager('rdf'), 'w') as target:
                serialize(pipeline, target)
            input = target.name
        self.normalizations = [n for n in self.normalizations if not n.done]

    return input
def execute(self, transform_manager, input):
    transform_manager.start(self, [input])
    logger.debug("Starting upload of %r", input)

    extension = input.rsplit('.', 1)[-1]
    try:
        serializer = self.formats[extension]
    except KeyError:
        logger.exception("Unrecognized RDF extension: %r", extension)
        raise

    graph = rdflib.ConjunctiveGraph()
    graph.parse(open(input, 'r'), format=serializer, publicID=self.graph_name)

    logger.debug("Parsed graph")

    datetime_now = self.site_timezone.localize(
        datetime.datetime.now().replace(microsecond=0))
    modified = graph.value(self.graph_name, NS['dcterms'].modified,
                           default=rdflib.Literal(datetime_now))
    created = graph.value(self.graph_name, NS['dcterms'].created)
    if not created:
        logger.debug("Getting created date from %r", transform_manager.store.query_endpoint)
        endpoint = Endpoint(transform_manager.store.query_endpoint)
        results = list(
            endpoint.query(self.created_query % {'graph': self.graph_name.n3()}))
        if results:
            created = results[0].date
        else:
            created = modified

    graph += (
        (self.graph_name, NS.rdf.type, NS.sd.Graph),
        (self.graph_name, NS.dcterms.modified, modified),
        (self.graph_name, NS.dcterms.created, created),
    )

    logger.debug("About to serialize")

    output = transform_manager('rdf')
    with open(output, 'w') as f:
        graph.serialize(f)

    logger.debug("Serialization done; about to upload")

    uploader = Uploader()
    uploader.upload(stores=(transform_manager.store,),
                    graph_name=self.graph_name,
                    filename=output,
                    method=self.method,
                    mimetype='application/rdf+xml')

    logger.debug("Upload complete")

    transform_manager.end([self.graph_name])
    transform_manager.touched_graph(self.graph_name)
def upload_dataset_metadata(sender, store, graphs, when, **kwargs):
    if store.slug != DEFAULT_STORE_SLUG:
        return
    if not graphs:
        logger.debug("No graphs updated for %r; aborting", store.slug)
        return
    if not getattr(settings, 'CKAN_API_KEY', None):
        logger.debug("No CKAN_API_KEY setting, not doing anything.")
        return

    client = ckanclient.CkanClient(api_key=settings.CKAN_API_KEY)
    endpoint = Endpoint(settings.ENDPOINT_QUERY)

    query = _dataset_query % ' \n'.join('(%s)' % rdflib.URIRef(g).n3() for g in graphs)
    graph = endpoint.query(query)

    datasets = list(graph.subjects(NS.rdf.type, NS.void.Dataset))
    if len(datasets) != 1:
        logger.debug("Expected one dataset; got %d", len(datasets))
        return
    dataset = Resource(datasets[0], graph, endpoint)

    find = functools.partial(_find, graph, dataset._identifier)
    patterns = settings.CKAN_PATTERNS

    package_name = find('skos:notation', HUMFREY.theDataHubDatasetName)
    if not package_name:
        return
    #package_name = patterns.get('name', '%s') % slug
    package_title = patterns.get('title', '%s') % dataset.label

    author = find('dcterms:publisher/foaf:name|rdfs:label|dc:title|skos:prefLabel|dcterms:title')
    if author:
        author = patterns.get('author', '%s') % author

    description = find('rdfs:comment|dcterms:description',
                       (NS.xtypes['Fragment-Markdown'], NS.xtypes['Fragment-PlainText'], None))

    maintainer = find('oo:contact/foaf:name|rdfs:label|dc:title|skos:prefLabel|dcterms:title')
    if maintainer:
        maintainer = patterns.get('maintainer', '%s') % maintainer

    maintainer_email = find('oo:contact/foaf:mbox|v:email')
    if maintainer_email:
        maintainer_email = maintainer_email.replace('mailto:', '')

    license = find('dcterms:license|cc:license')
    if license:
        license = _licenses.get(unicode(license))

    sparql_endpoint = find('void:sparqlEndpoint')
    if sparql_endpoint:
        sparql_endpoint = unicode(sparql_endpoint)
    else:
        sparql_endpoint = 'http:' + reverse_full('data', 'sparql:endpoint')

    tags = find('humfrey:theDataHubDatasetTag', all=True)
    groups = find('humfrey:theDataHubDatasetGroup', all=True)

    url = doc_forward(dataset.uri)

    logger.debug("Fetching existing record for %r", package_name)
    try:
        package_entity = client.package_entity_get(package_name)
        logger.debug("Record successfully retrieved")
    except ckanclient.CkanApiNotFoundError:
        package_entity = {'name': package_name}
        client.package_register_post(package_entity)
        logger.debug("No record found; starting from empty")

    original = copy.deepcopy(package_entity)

    package_entity.update({'name': package_name,
                           'title': package_title,
                           'url': url,
                           'notes': description,
                           'license_id': license,
                           'author': author,
                           'maintainer': maintainer,
                           'maintainer_email': maintainer_email})

    package_entity['groups'] = list(settings.CKAN_GROUPS | set(package_entity.get('groups', ())) | groups)
    package_entity['tags'] = list(settings.CKAN_TAGS | set(package_entity.get('tags', ())) | tags)

    resources = collections.defaultdict(dict,
                                        ((r.get('name'), r) for r in package_entity.get('resources', ())))
    resources['SPARQL endpoint'].update({'name': 'SPARQL endpoint',
                                         'format': 'api/sparql',
                                         'url': sparql_endpoint})
    package_entity['resources'] = resources.values()

    if original != package_entity:
        logger.info("Updating %r at thedatahub.org", package_name)
        client.package_entity_put(package_entity)
        logger.debug("Updated CKAN record")
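# The `_licenses` mapping used above (license URI -> CKAN license id) is not
# part of this snippet. A minimal illustrative sketch -- the entries here are
# assumptions about which licenses the project maps, not its actual table:
_licenses = {
    u'http://creativecommons.org/licenses/by/3.0/': 'cc-by',
    u'http://creativecommons.org/licenses/by-sa/3.0/': 'cc-by-sa',
    u'http://www.nationalarchives.gov.uk/doc/open-government-licence/': 'uk-ogl',
}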