Example #1
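A signal handler that, for each store, queries which datasets the updated graphs belong to, optionally fetching a `skos:notation` for each, and archives every match via `DatasetArchiver`.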
def update_dataset_archives(sender, update_definition, store_graphs, when, **kwargs):
    for store, graph_names in store_graphs.items():
        endpoint = Endpoint(store.query_endpoint)

        if DATASET_NOTATION:
            notation_clause = """
                OPTIONAL {{ ?dataset skos:notation ?notation .
                FILTER (DATATYPE(?notation) = {0} ) }}""".format(DATASET_NOTATION.n3())
        else:
            notation_clause = ""

        query = """
        SELECT ?dataset ?notation WHERE {{
          VALUES ?graph {{ {0} }}
          ?graph void:inDataset ?dataset .
          {1}
        }}""".format(" ".join(g.n3() for g in graph_names),
                     notation_clause)
        datasets = dict(endpoint.query(query))

        logger.debug("Found %d datasets to archive", len(datasets))
        for dataset, notation in datasets.items():
            logger.debug("Archiving dataset: %s", dataset)
            archiver = DatasetArchiver(store, dataset, notation, when)
            archiver.archive()
Example #2
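A pipeline transform that parses an uploaded RDF file, ensures the graph carries `dcterms:modified` and `dcterms:created` timestamps (falling back first to a SPARQL lookup and then to the modified date), re-serializes it, and uploads the result to the store.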
    def execute(self, transform_manager, input):
        transform_manager.start(self, [input])

        logger.debug("Starting upload of %r", input)

        client = self.get_redis_client()

        extension = input.rsplit('.', 1)[-1]
        try:
            serializer = self.formats[extension]
        except KeyError:
            logger.exception("Unrecognized RDF extension: %r", extension)
            raise

        graph = rdflib.ConjunctiveGraph()
        with open(input, 'r') as f:
            graph.parse(f,
                        format=serializer,
                        publicID=self.graph_name)

        logger.debug("Parsed graph")

        datetime_now = self.site_timezone.localize(datetime.datetime.now().replace(microsecond=0))
        modified = graph.value(self.graph_name, NS['dcterms'].modified,
                               default=rdflib.Literal(datetime_now))
        created = graph.value(self.graph_name, NS['dcterms'].created)
        if not created:
            logger.debug("Getting created date from %r", transform_manager.store.query_endpoint)
            endpoint = Endpoint(transform_manager.store.query_endpoint)
            results = list(endpoint.query(self.created_query % {'graph': self.graph_name.n3()}))
            if results:
                created = results[0].date
            else:
                created = modified

        graph += (
            (self.graph_name, NS['dcterms'].modified, modified),
            (self.graph_name, NS['dcterms'].created, created),
        )

        logger.debug("About to serialize")

        output = transform_manager('rdf')
        with open(output, 'w') as f:
            graph.serialize(f)

        logger.debug("Serialization done; about to upload")

        uploader = Uploader()
        uploader.upload(stores=(transform_manager.store,),
                        graph_name=self.graph_name,
                        filename=output,
                        method=self.method,
                        mimetype='application/rdf+xml')

        logger.debug("Upload complete")

        transform_manager.end([self.graph_name])
        transform_manager.touched_graph(self.graph_name)
Example #3
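A transform that resolves its query, given either as a literal string or as another transform's output file, runs it against the store's endpoint preferring plain-text results, and serializes the deferred results to an N-Triples file.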
    def execute(self, transform_manager):

        endpoint = Endpoint(transform_manager.store.query_endpoint, preferred_media_types=('text/plain',))

        if isinstance(self.query, basestring):
            query = self.query
        else:
            query_filename = self.query.execute(transform_manager)
            with open(query_filename, 'r') as query_file:
                query = query_file.read()

        with open(transform_manager('nt'), 'w') as output:
            transform_manager.start(self, [])
            serialize(endpoint.query(query, defer=True), output)
            transform_manager.end([output.name])
        return output.name
Example #4
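Another take on the archive hook: for each store it resolves the updated graphs' `void:inDataset` targets and archives the affected datasets.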
def update_dataset_archives(update_log, graphs, updated):
    if not getattr(settings, 'ARCHIVE_PATH', None):
        return

    updated = updated.replace(microsecond=0)
    
    for store_slug, graph_names in graphs.items():
        store = Store.objects.get(slug=store_slug)
        endpoint = Endpoint(store.query_endpoint)

        query = "SELECT ?dataset WHERE { %s }" % " UNION ".join("{ %s void:inDataset ?dataset }" % g.n3() for s, g in graph_names if s is None)
        datasets = set(r['dataset'] for r in endpoint.query(query))
    
        for dataset in datasets:
            query = "SELECT ?graph WHERE { ?graph void:inDataset %s }" % dataset.n3()
            graphs = set(r['graph'] for r in endpoint.query(query))
            update_dataset_archive(dataset, store, graph_names, updated)
Example #5
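Runs an index's SPARQL query against a store, then probes the Elasticsearch index status URL and creates the index with an HTTP `PUT` when the probe returns 404.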
    def update_for_store(self, index, store):
        hash_key = 'humfrey:elasticsearch:indices:%s:%s' % (index.slug, store.slug)
        endpoint = Endpoint(store.query_endpoint)
        results = endpoint.query(index.query)

        try:
            urllib2.urlopen(index.get_index_status_url(store))
            index_exists = True
        except urllib2.HTTPError, e:
            if e.code == httplib.NOT_FOUND:
                index_exists = False
                index.update_mapping = True

                request = urllib2.Request(index.get_index_url(store))
                request.get_method = lambda: 'PUT'
                urllib2.urlopen(request)
            else:
                raise
Example #6
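Like Example #3, but without a preferred media type: the materialized query result is serialized to N-Triples via `result.serialize(output, 'nt')`.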
    def execute(self, transform_manager):

        endpoint = Endpoint(transform_manager.store.query_endpoint)

        if isinstance(self.query, basestring):
            query = self.query
        else:
            query_filename = self.query.execute(transform_manager)
            with open(query_filename, 'r') as query_file:
                query = query_file.read()

        with open(transform_manager('nt'), 'w') as output:
            transform_manager.start(self, [])
            result = endpoint.query(query)

            result.serialize(output, 'nt')

            transform_manager.end([output.name])
        return output.name
Example #7
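`DatasetArchiver` gathers the graphs that make up a dataset (following `void:subset` chains), looks up each graph's `dcterms:created` date, and streams each graph's triples in chunks from the store's graph store endpoint.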
class DatasetArchiver(object):

    def __init__(self, store, dataset, notation, updated):
        self.store = store
        self.dataset = dataset
        self.notation = notation
        self.updated = updated.replace(microsecond=0)
        self.endpoint = Endpoint(store.query_endpoint)

    @property
    def graph_names(self):
        if not hasattr(self, '_graphs'):
            query = "SELECT ?graph WHERE {{ ?graph void:inDataset/^void:subset* {0} }}".format(self.dataset.n3())
            self._graphs = set(r['graph'] for r in self.endpoint.query(query))
        return self._graphs

    def _graph_created(self, graph_name):
        query = "SELECT ?created WHERE {{ {0} dcterms:created ?created }}".format(graph_name.n3())
        results = list(self.endpoint.query(query))
        if results:
            return results[0].created
        else:
            return rdflib.Literal(self.updated)

    def _graph_triples(self, out, graph_name):
        url = '%s?%s' % (self.store.graph_store_endpoint,
                         urllib.urlencode({'graph': graph_name}))
        request = urllib2.Request(url)
        request.add_header('Accept', 'text/plain')
        try:
            response = urllib2.urlopen(request)
        except urllib2.HTTPError, e:
            if e.code == 404:
                logger.warning("Graph not found: %s", graph_name)
            else:
                logger.exception("HTTPError %d for %s: %s", e.code, graph_name, e.read())
            return
        while True:
            chunk = response.read(4096)
            if not chunk:
                break
            out.write(chunk)
Example #8
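A variant of the index updater from Example #5 that defers the SPARQL query, fetches its bindings lazily, and logs its progress.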
    def update_for_store(self, index, store):
        hash_key = 'humfrey:elasticsearch:indices:%s:%s' % (index.slug, store.slug)
        endpoint = Endpoint(store.query_endpoint)

        logger.debug("Performing SPARQL query.", extra={'query': index.query})
        results = endpoint.query(index.query, defer=True).get_bindings()
        logger.debug("SPARQL server started returning results.")

        try:
            urllib2.urlopen(index.get_index_status_url(store))
            index_exists = True
        except urllib2.HTTPError, e:
            if e.code == httplib.NOT_FOUND:
                logger.info("Index %s/%s did not previously exist", store.slug, index.slug)
                index_exists = False
                index.update_mapping = True

                request = urllib2.Request(index.get_index_url(store))
                request.get_method = lambda: 'PUT'
                urllib2.urlopen(request)
            else:
                raise
Example #9
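A lazily cached `Endpoint` accessor for a view; when the view supports content negotiation, the renderers' media types are passed along as `preferred_media_types`.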
    def endpoint(self):
        if not hasattr(self, '_endpoint'):
            if isinstance(self, ContentNegotiatedView):
                self.set_renderers(early=True)
                preferred_media_types = [
                    m.value for r in self.request.renderers
                    for m in r.mimetypes
                ]
            else:
                preferred_media_types = ()
            self._endpoint = Endpoint(
                self.store.query_endpoint,
                preferred_media_types=preferred_media_types)
        return self._endpoint
Example #10
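A transform that pipes the parsed input through a chain of normalizations, serializing after each pass and looping until every normalization reports itself done.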
    def execute(self, transform_manager, input):
        transform_manager.start(self, [])

        endpoint = Endpoint(transform_manager.store.query_endpoint)

        for normalization in self.normalizations:
            normalization.endpoint = endpoint
            normalization.store = transform_manager.store

        while self.normalizations:
            with open(input, 'r') as source:
                pipeline = parse(source).get_triples()
                for normalization in self.normalizations:
                    pipeline = normalization(pipeline)
                with open(transform_manager('rdf'), 'w') as target:
                    serialize(pipeline, target)

            input = target.name
            self.normalizations = [
                n for n in self.normalizations if not n.done
            ]

        return input
Example #11
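A fuller version of the upload transform in Example #2, which additionally types the graph as `sd:Graph` before serializing and uploading.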
    def execute(self, transform_manager, input):
        transform_manager.start(self, [input])

        logger.debug("Starting upload of %r", input)

        extension = input.rsplit('.', 1)[-1]
        try:
            serializer = self.formats[extension]
        except KeyError:
            logger.exception("Unrecognized RDF extension: %r", extension)
            raise

        graph = rdflib.ConjunctiveGraph()
        with open(input, 'r') as f:
            graph.parse(f,
                        format=serializer,
                        publicID=self.graph_name)

        logger.debug("Parsed graph")

        datetime_now = self.site_timezone.localize(
            datetime.datetime.now().replace(microsecond=0))
        modified = graph.value(self.graph_name,
                               NS['dcterms'].modified,
                               default=rdflib.Literal(datetime_now))
        created = graph.value(self.graph_name, NS['dcterms'].created)
        if not created:
            logger.debug("Getting created date from %r",
                         transform_manager.store.query_endpoint)
            endpoint = Endpoint(transform_manager.store.query_endpoint)
            results = list(
                endpoint.query(self.created_query %
                               {'graph': self.graph_name.n3()}))
            if results:
                created = results[0].date
            else:
                created = modified

        graph += (
            (self.graph_name, NS.rdf.type, NS.sd.Graph),
            (self.graph_name, NS.dcterms.modified, modified),
            (self.graph_name, NS.dcterms.created, created),
        )

        logger.debug("About to serialize")

        output = transform_manager('rdf')
        with open(output, 'w') as f:
            graph.serialize(f)

        logger.debug("Serialization done; about to upload")

        uploader = Uploader()
        uploader.upload(stores=(transform_manager.store, ),
                        graph_name=self.graph_name,
                        filename=output,
                        method=self.method,
                        mimetype='application/rdf+xml')

        logger.debug("Upload complete")

        transform_manager.end([self.graph_name])
        transform_manager.touched_graph(self.graph_name)
Example #12
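A hook that mirrors dataset metadata to CKAN (thedatahub.org): it queries the default store for the dataset's description, maps its properties onto a CKAN package (title, author, license, tags, groups, and a SPARQL endpoint resource), and pushes the package back only if something changed.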
def upload_dataset_metadata(sender, store, graphs, when, **kwargs):
    if store.slug != DEFAULT_STORE_SLUG:
        return

    if not graphs:
        logger.debug("No graphs updated for %r; aborting", store.slug)
        return

    if not getattr(settings, 'CKAN_API_KEY', None):
        logger.debug("No CKAN_API_KEY setting, not doing anything.")
        return

    client = ckanclient.CkanClient(api_key=settings.CKAN_API_KEY)

    endpoint = Endpoint(settings.ENDPOINT_QUERY)
    query = _dataset_query % '      \n'.join('(%s)' % rdflib.URIRef(g).n3() for g in graphs)
    graph = endpoint.query(query)

    datasets = list(graph.subjects(NS.rdf.type, NS.void.Dataset))
    if len(datasets) != 1:
        logger.debug("Expected one dataset; got %d", len(datasets))
        return
    dataset = Resource(datasets[0], graph, endpoint)

    find = functools.partial(_find, graph, dataset._identifier)

    patterns = settings.CKAN_PATTERNS

    package_name = find('skos:notation', HUMFREY.theDataHubDatasetName)
    if not package_name:
        return
        #package_name = patterns.get('name', '%s') % slug

    package_title = patterns.get('title', '%s') % dataset.label

    author = find('dcterms:publisher/foaf:name|rdfs:label|dc:title|skos:prefLabel|dcterms:title')
    if author:
        author = patterns.get('author', '%s') % author

    description = find('rdfs:comment|dcterms:description',
                       (NS.xtypes['Fragment-Markdown'],
                        NS.xtypes['Fragment-PlainText'],
                        None))

    maintainer = find('oo:contact/foaf:name|rdfs:label|dc:title|skos:prefLabel|dcterms:title')
    if maintainer:
        maintainer = patterns.get('maintainer', '%s') % maintainer

    maintainer_email = find('oo:contact/foaf:mbox|v:email')
    if maintainer_email:
        maintainer_email = maintainer_email.replace('mailto:', '')

    license = find('dcterms:license|cc:license')
    if license:
        license = _licenses.get(unicode(license))

    sparql_endpoint = find('void:sparqlEndpoint')
    if sparql_endpoint:
        sparql_endpoint = unicode(sparql_endpoint)
    else:
        sparql_endpoint = 'http:' + reverse_full('data', 'sparql:endpoint')

    tags = find('humfrey:theDataHubDatasetTag', all=True)
    groups = find('humfrey:theDataHubDatasetGroup', all=True)

    url = doc_forward(dataset.uri)

    logger.debug("Fetching existing record for %r", package_name)
    try:
        package_entity = client.package_entity_get(package_name)
        logger.debug("Record successfully retrieved")
    except ckanclient.CkanApiNotFoundError:
        package_entity = {'name': package_name}
        client.package_register_post(package_entity)
        logger.debug("No record found; starting from empty")
    original = copy.deepcopy(package_entity)

    package_entity.update({'name': package_name,
                           'title': package_title,
                           'url': url,
                           'notes': description,
                           'license_id': license,
                           'author': author,
                           'maintainer': maintainer,
                           'maintainer_email': maintainer_email})

    package_entity['groups'] = list(settings.CKAN_GROUPS
                                  | set(package_entity.get('groups', ()))
                                  | groups)
    package_entity['tags'] = list(settings.CKAN_TAGS
                                | set(package_entity.get('tags', ()))
                                | tags)

    resources = collections.defaultdict(dict, ((r.get('name'), r) for r in package_entity.get('resources', ())))

    resources['SPARQL endpoint'].update({'name': 'SPARQL endpoint',
                                         'format': 'api/sparql',
                                         'url': sparql_endpoint})

    package_entity['resources'] = resources.values()

    logger.debug("Updated CKAN record")

    if original != package_entity:
        logger.info("Updating %r at thedatahub.org", package_name)
        client.package_entity_put(package_entity)