def load_vocabulary(self, transform_manager, prefix, uri):
    """Download the vocabulary identified by *prefix* and push it to the store.

    The URI may be replaced (or suppressed entirely, by mapping to a falsy
    value) through the VOCABULARY_URL_OVERRIDES setting.
    """
    uri = getattr(settings, "VOCABULARY_URL_OVERRIDES", {}).get(prefix, uri)
    if not uri:
        # An override to None/'' turns this vocabulary off.
        return

    filename, headers = retrieve(uri)
    if not filename:
        logger.error("Unable to retrieve: %s", headers.get("message"))
        return

    acceptable_types = ("application/rdf+xml", "text/n3", "text/plain", "text/turtle")
    try:
        logger.debug("About to fetch %r for vocabulary %r", uri, prefix)
        if headers["status"] != httplib.OK:
            logger.error("Failed to retrieve %r for vocabulary %r", uri, prefix,
                         extra={"headers": headers})
            return
        # Drop any parameters (e.g. charset) from the Content-Type header.
        mimetype = headers["content-type"].split(";")[0]
        if mimetype not in acceptable_types:
            logger.error("Unexpected content-type: %r", mimetype)
            return
        Uploader.upload(stores=(transform_manager.store,),
                        graph_name=settings.GRAPH_BASE + "vocabulary/" + prefix,
                        filename=filename,
                        mimetype=mimetype)
    finally:
        # presumably retrieve() marks temporary downloads for deletion via
        # this header — TODO confirm against retrieve()'s contract.
        if headers["delete-after"]:
            os.unlink(filename)
def accept_pingback(pingback):
    """Publish an accepted pingback's RDF data to its target store and mark
    it as published.
    """
    uploader = Uploader()
    # BUG FIX: Uploader.upload is called everywhere else in this file with the
    # keyword `stores` (an iterable of stores) — e.g. stores=(store,) or a
    # positional list. The singular `store=` keyword used here does not match
    # that signature and would raise a TypeError at call time.
    uploader.upload(stores=(pingback.store,),
                    graph_name=pingback.graph_name,
                    data=pingback.data,
                    mimetype='text/n3')
    pingback.mark_published()
def execute(self, transform_manager, input):
    """Parse *input* as RDF, attach dcterms:created/modified timestamps to the
    graph, serialize it, and upload it to the transform manager's store.

    Raises KeyError when the file extension is not a recognized RDF format.
    """
    transform_manager.start(self, [input])
    logger.debug("Starting upload of %r", input)

    # FIX: removed `client = self.get_redis_client()` — the client was never
    # used anywhere in this method (the parallel implementation of execute()
    # in this file omits it too).
    extension = input.rsplit('.', 1)[-1]
    try:
        serializer = self.formats[extension]
    except KeyError:
        logger.exception("Unrecognized RDF extension: %r", extension)
        raise

    graph = rdflib.ConjunctiveGraph()
    # FIX: close the input file deterministically instead of leaking the
    # handle until garbage collection (matches the `with open(...)` style
    # already used for the output file below).
    with open(input, 'r') as source:
        graph.parse(source, format=serializer, publicID=self.graph_name)
    logger.debug("Parsed graph")

    datetime_now = self.site_timezone.localize(
        datetime.datetime.now().replace(microsecond=0))
    # Prefer timestamps already present in the data; fall back to "now" for
    # dcterms:modified ...
    modified = graph.value(self.graph_name, NS['dcterms'].modified,
                           default=rdflib.Literal(datetime_now))
    created = graph.value(self.graph_name, NS['dcterms'].created)
    if not created:
        # ... and to the creation date previously recorded in the store for
        # dcterms:created, defaulting to the modification date if absent.
        logger.debug("Getting created date from %r",
                     transform_manager.store.query_endpoint)
        endpoint = Endpoint(transform_manager.store.query_endpoint)
        results = list(endpoint.query(self.created_query % {'graph': self.graph_name.n3()}))
        if results:
            created = results[0].date
        else:
            created = modified

    graph += (
        (self.graph_name, NS['dcterms'].modified, modified),
        (self.graph_name, NS['dcterms'].created, created),
    )

    logger.debug("About to serialize")
    output = transform_manager('rdf')
    with open(output, 'w') as f:
        graph.serialize(f)

    logger.debug("Serialization done; about to upload")
    uploader = Uploader()
    uploader.upload(stores=(transform_manager.store,),
                    graph_name=self.graph_name,
                    filename=output,
                    method=self.method,
                    mimetype='application/rdf+xml')
    logger.debug("Upload complete")

    transform_manager.end([self.graph_name])
    transform_manager.touched_graph(self.graph_name)
def archive(self): notation = self.notation or hashlib.sha1(self.dataset).hexdigest() archive_path = os.path.join(SOURCE_DIRECTORY, 'archive', self.store.slug, notation.replace('/', '-')) archive_graph_name = rdflib.URIRef('{0}archive/{1}'.format(settings.GRAPH_BASE, notation)) data_dump_url = rdflib.URIRef('{0}archive/{1}/{2}/latest.rdf'.format(SOURCE_URL, self.store.slug, notation.replace('/', '-'))) if not os.path.exists(archive_path): os.makedirs(archive_path, 0755) nt_fd, nt_name = tempfile.mkstemp('.nt') rdf_fd, rdf_name = tempfile.mkstemp('.rdf') try: nt_out, rdf_out = os.fdopen(nt_fd, 'w'), os.fdopen(rdf_fd, 'w') for graph_name in self.graph_names: self._graph_triples(nt_out, graph_name) nt_out.close() sort = subprocess.Popen(['sort', '-u', nt_name], stdout=subprocess.PIPE) try: triples = itertools.chain(self._get_metadata(rdflib.URIRef(''), archive_graph_name), parse(sort.stdout, 'nt').get_triples()) serialize(triples, rdf_out, rdf_name) finally: # Make sure stdout gets closed so that if the try block raises # an exception we don't keep a sort process hanging around. sort.stdout.close() sort.wait() rdf_out.close() previous_name = os.path.join(archive_path, 'latest.rdf') # Only update if the file has changed, or hasn't been archived before. if not os.path.exists(previous_name) or not filecmp._do_cmp(previous_name, rdf_name): new_name = os.path.join(archive_path, self.updated.astimezone(pytz.utc).isoformat() + '.rdf') shutil.move(rdf_name, new_name) os.chmod(new_name, 0644) if os.path.exists(previous_name): os.unlink(previous_name) os.symlink(new_name, previous_name) # Upload the metadata to the store using an absolute URI. metadata = self._get_metadata(data_dump_url, archive_graph_name) Uploader.upload([self.store], archive_graph_name, graph=metadata) finally: os.unlink(nt_name) if os.path.exists(rdf_name): os.unlink(rdf_name) self.filter_old_archives(archive_path)
def load_vocabulary(self, transform_manager, prefix, uri):
    """Fetch the vocabulary named *prefix* from *uri* and upload it to the
    transform manager's store, honoring VOCABULARY_URL_OVERRIDES.
    """
    # Site settings may remap a vocabulary prefix to an alternative URL.
    overrides = getattr(settings, 'VOCABULARY_URL_OVERRIDES', {})
    uri = overrides.get(prefix, uri)
    # A falsy override disables loading this vocabulary entirely.
    if not uri:
        return
    filename, headers = retrieve(uri)
    if not filename:
        logger.error("Unable to retrieve: %s", headers.get('message'))
        return
    try:
        logger.debug("About to fetch %r for vocabulary %r", uri, prefix)
        if headers['status'] != httplib.OK:
            logger.error("Failed to retrieve %r for vocabulary %r", uri, prefix,
                         extra={'headers': headers})
            return
        # Strip any parameters (e.g. '; charset=utf-8') from the Content-Type.
        content_type = headers['content-type'].split(';')[0]
        # Only recognized RDF serializations are accepted.
        if content_type not in ('application/rdf+xml', 'text/n3', 'text/plain', 'text/turtle'):
            logger.error('Unexpected content-type: %r', content_type)
            return
        graph_name = settings.GRAPH_BASE + 'vocabulary/' + prefix
        Uploader.upload(stores=(transform_manager.store, ),
                        graph_name=graph_name,
                        filename=filename,
                        mimetype=content_type)
    finally:
        # NOTE(review): presumably retrieve() sets this header on temporary
        # downloads that the caller must delete — confirm its contract.
        if headers['delete-after']:
            os.unlink(filename)
def execute(self, transform_manager, input):
    """Parse *input* as RDF, record it as an sd:Graph with dcterms
    created/modified timestamps, serialize it, and upload it to the store.

    Raises KeyError for an unrecognized RDF file extension.
    """
    transform_manager.start(self, [input])
    logger.debug("Starting upload of %r", input)

    extension = input.rsplit('.', 1)[-1]
    try:
        rdf_format = self.formats[extension]
    except KeyError:
        logger.exception("Unrecognized RDF extension: %r", extension)
        raise

    graph = rdflib.ConjunctiveGraph()
    source = open(input, 'r')
    graph.parse(source, format=rdf_format, publicID=self.graph_name)
    logger.debug("Parsed graph")

    localized_now = self.site_timezone.localize(
        datetime.datetime.now().replace(microsecond=0))
    # Prefer timestamps already present in the data; default modified to now.
    modified = graph.value(self.graph_name, NS['dcterms'].modified,
                           default=rdflib.Literal(localized_now))
    created = graph.value(self.graph_name, NS['dcterms'].created)
    if not created:
        # Fall back to the creation date already recorded in the store, and
        # finally to the modification date.
        logger.debug("Getting created date from %r",
                     transform_manager.store.query_endpoint)
        sparql = Endpoint(transform_manager.store.query_endpoint)
        rows = list(sparql.query(self.created_query % {'graph': self.graph_name.n3()}))
        created = rows[0].date if rows else modified

    graph += (
        (self.graph_name, NS.rdf.type, NS.sd.Graph),
        (self.graph_name, NS.dcterms.modified, modified),
        (self.graph_name, NS.dcterms.created, created),
    )

    logger.debug("About to serialize")
    output_filename = transform_manager('rdf')
    with open(output_filename, 'w') as stream:
        graph.serialize(stream)

    logger.debug("Serialization done; about to upload")
    Uploader().upload(stores=(transform_manager.store,),
                      graph_name=self.graph_name,
                      filename=output_filename,
                      method=self.method,
                      mimetype='application/rdf+xml')
    logger.debug("Upload complete")

    transform_manager.end([self.graph_name])
    transform_manager.touched_graph(self.graph_name)
def archive(self): notation = self.notation or hashlib.sha1(self.dataset).hexdigest() archive_path = os.path.join(SOURCE_DIRECTORY, 'archive', self.store.slug, notation.replace('/', '-')) archive_graph_name = rdflib.URIRef('{0}archive/{1}'.format(settings.GRAPH_BASE, notation)) data_dump_url = rdflib.URIRef('{0}archive/{1}/{2}/latest.rdf'.format(SOURCE_URL, self.store.slug, notation.replace('/', '-'))) data_dump_with_labels_url = rdflib.URIRef('{0}archive/{1}/{2}/latest-with-labels.rdf'.format(SOURCE_URL, self.store.slug, notation.replace('/', '-'))) if not os.path.exists(archive_path): os.makedirs(archive_path, 0755) nt_fd, nt_name = tempfile.mkstemp('.nt') rdf_fd, rdf_name = tempfile.mkstemp('.rdf') rdf_with_labels_fd, rdf_with_labels_name = tempfile.mkstemp('.rdf') try: nt_out, rdf_out = os.fdopen(nt_fd, 'w'), os.fdopen(rdf_fd, 'w') rdf_with_labels_out = os.fdopen(rdf_with_labels_fd, 'w') for graph_name in self.graph_names: self._graph_triples(nt_out, graph_name) nt_out.close() with tempfile.TemporaryFile() as sorted_triples: subprocess.call(['sort', '-u', nt_name], stdout=sorted_triples) sorted_triples.seek(0) triples = itertools.chain(self._get_metadata(rdflib.URIRef(''), data_dump_with_labels_url, archive_graph_name), parse(sorted_triples, 'nt').get_triples()) serialize(triples, rdf_out, 'rdf') rdf_out.close() sorted_triples.seek(0) triples = itertools.chain(self._get_metadata(rdflib.URIRef(''), data_dump_with_labels_url, archive_graph_name), self.with_labels(parse(sorted_triples, 'nt').get_triples())) serialize(triples, rdf_with_labels_out, 'rdf') rdf_with_labels_out.close() previous_name = os.path.join(archive_path, 'latest.rdf') # Only update if the file has changed, or hasn't been archived before. 
if not os.path.exists(previous_name) or not filecmp._do_cmp(previous_name, rdf_name): new_name = os.path.join(archive_path, self.updated.astimezone(pytz.utc).isoformat() + '.rdf') shutil.move(rdf_name, new_name) os.chmod(new_name, 0644) if os.path.exists(previous_name): os.unlink(previous_name) os.symlink(new_name, previous_name) new_with_labels_name = os.path.join(archive_path, 'latest-with-labels.rdf') shutil.move(rdf_with_labels_name, new_with_labels_name) os.chmod(new_with_labels_name, 0644) # Upload the metadata to the store using an absolute URI. metadata = self._get_metadata(data_dump_url, data_dump_with_labels_url, archive_graph_name) Uploader.upload([self.store], archive_graph_name, graph=metadata) finally: os.unlink(nt_name) if os.path.exists(rdf_name): os.unlink(rdf_name) self.filter_old_archives(archive_path)