Beispiel #1
0
 def _get_inline_preview(self, link, store=None):
     """Collect the configured inline-preview fields for the graph of ``link``.

     The graph is read from the local record when
     ``settings.RDF_USE_LOCAL_GRAPH`` is set, otherwise from the named graph
     ``<link>/graph`` of the RDF store.

     :param link: URI of the resource to preview.
     :param store: optional RDF store; ``get_rdfstore()`` is used when omitted.
     :return: dict mapping the field names configured in
         ``settings.EDM_API_INLINE_PREVIEW`` to stringified object values
         (the last value wins when a predicate occurs more than once), or
         ``None`` when no graph is available for ``link``.
     """
     try:
         if settings.RDF_USE_LOCAL_GRAPH:
             record = RDFRecord(source_uri=link)
             if not record.exists():
                 raise UnknownGraph("unable to find {}".format(link))
             graph = record.get_graph()
         else:
             if not store:
                 store = get_rdfstore()
             graph_store = store.get_graph_store
             named_graph = "{}/graph".format(link.rstrip('/'))
             graph = graph_store.get(named_graph=named_graph, as_graph=True)
     except UnknownGraph:
         logger.warning("Unable to find Graph for: {}".format(link))
         return None
     if graph is None:
         # Treat a missing graph like an unknown one instead of failing with
         # an AttributeError further down.
         logger.warning("Unable to find Graph for: {}".format(link))
         return None

     # keys are predicate URIs, values are the output field names
     preview_fields = settings.EDM_API_INLINE_PREVIEW
     preview_predicates = {URIRef(pred) for pred in preview_fields}
     inline_dict = {}
     for pred, obj in graph.predicate_objects():
         if pred in preview_predicates:
             inline_dict[preview_fields[str(pred)]] = str(obj)
     if 'delving_hubId' in preview_fields.values():
         hub_id, _spec = self.get_hub_id()
         inline_dict['delving_hubId'] = hub_id
     return inline_dict
Beispiel #2
0
def test__get_rdf_base_url__return_base_url_from_settings(settings):
    """``RDFRecord.get_rdf_base_url`` follows ``settings.RDF_BASE_URL``."""
    settings.RDF_BASE_URL = "http://testserver"

    # Default call: the returned value carries no scheme.
    without_scheme = RDFRecord.get_rdf_base_url()
    assert without_scheme
    assert without_scheme == "testserver"

    # An explicitly requested scheme is prepended.
    with_scheme = RDFRecord.get_rdf_base_url(prepend_scheme=True, scheme="https")
    assert with_scheme
    assert with_scheme == "https://testserver"
Beispiel #3
0
    def get_context_data(self, **kwargs):
        """Build the template context for a document (foldout) view.

        The graph is looked up by hub id when the request URI contains
        ``detail/foldout/`` and by source URI otherwise. Depending on the
        resolved URI and settings the graph may then be replaced by a cached
        resource graph or by a context graph from the RDF store.

        Context keys set here: ``resources``, ``absolute_uri``, ``points`` and,
        when ``settings.RDF_USE_LOCAL_GRAPH`` applies, ``acceptance``.
        ``self.template_name`` is switched to the first foldout template
        configured in ``settings.RDF_CONTENT_FOLDOUTS`` for one of the
        resource's RDF types.

        :raises UnknownGraph: when no graph can be found for the request.
        """
        context = super(NaveDocumentTemplateView, self).get_context_data(**kwargs)
        absolute_uri = self.request.build_absolute_uri()
        target_uri = RDFRecord.get_internal_rdf_base_uri(absolute_uri)
        slug = self.kwargs.get('slug')
        record = ElasticSearchRDFRecord(hub_id=slug)

        if "detail/foldout/" in target_uri:
            graph = record.get_graph_by_id(slug)
            if graph is not None:
                target_uri = record.source_uri
            else:
                logger.warning("Unable to find source_uri for slug: {}".format(slug))
        else:
            graph = record.get_graph_by_source_uri(target_uri)
        if graph is None:
            raise UnknownGraph("URI {} is not known in our graph store".format(target_uri))

        if "/resource/cache/" in target_uri:
            target_uri = target_uri.rstrip('/')
            cache_resource = CacheResource.objects.filter(document_uri=target_uri)
            if cache_resource.exists():
                graph = cache_resource.first().get_graph()
        elif settings.RDF_USE_LOCAL_GRAPH:
            # NOTE(review): HttpRequest.REQUEST only exists on older Django
            # releases - confirm the Django version this view targets.
            mode = self.request.REQUEST.get('mode', 'default')
            context['acceptance'] = mode == 'acceptance'
        elif '/resource/aggregation' in target_uri:
            target_named_graph = "{}/graph".format(target_uri.rstrip('/'))
            graph, _nr_levels = RDFModel.get_context_graph(
                store=rdfstore.get_rdfstore(),
                named_graph=target_named_graph
            )
        else:
            graph, _nr_levels = RDFModel.get_context_graph(
                store=rdfstore.get_rdfstore(),
                target_uri=target_uri
            )
        # todo: remove: should no longer be necessary with the addition of common.middleware.ForceLangMiddleware
        language = self.request.GET.get('lang', None)
        if language:
            activate(language)
        bindings = GraphBindings(
            about_uri=target_uri,
            graph=graph,
            excluded_properties=settings.RDF_EXCLUDED_PROPERTIES
        )
        context['resources'] = bindings
        context['absolute_uri'] = RDFRecord.get_external_rdf_url(target_uri, self.request)
        # The first RDF type with a configured foldout template decides the template.
        for rdf_type in bindings.get_about_resource().get_types():
            search_label = rdf_type.search_label.lower()
            content_template = settings.RDF_CONTENT_FOLDOUTS.get(search_label)
            if content_template:
                self.template_name = content_template
                break

        context['points'] = RDFModel.get_geo_points(graph)

        return context
Beispiel #4
0
    def _process_action(self, action):
        """Apply a single bulk action dictionary.

        Supported values of ``action["action"]``:

        * ``clear_orphans``: remove orphans of the dataset older than
          ``modification_date`` (skipped when no date is given),
        * ``disable_index``: delete the dataset from the index,
        * ``drop_dataset``: delete the dataset from the index and delete the
          ``DataSet`` rows with that spec,
        * anything else: parse ``action["graph"]`` into a record and queue its
          index action (and SPARQL update when ``settings.RDF_STORE_TRIPLES``).

        :param action: dict with at least ``dataset`` and ``action`` keys;
            record actions additionally need ``graphUri`` and ``graph``.
        :return: the created record for record actions, otherwise ``None``.
            ``None`` is also returned when a required key is missing or the
            RDF cannot be parsed; the error is collected in
            ``self.json_errors`` / ``self.rdf_errors`` respectively.
        """
        try:
            self.spec = action["dataset"]
            process_verb = action["action"]
            record = None
            if process_verb == "clear_orphans":
                purge_date = action.get("modification_date")
                if purge_date:
                    orphans_removed = RDFRecord.remove_orphans(spec=self.spec, timestamp=purge_date)
                    logger.info("Deleted {} orphans for {} before {}".format(orphans_removed, self.spec, purge_date))
            elif process_verb == "disable_index":
                RDFRecord.delete_from_index(self.spec)
                logger.info("Deleted dataset {} from index. ".format(self.spec))
            elif process_verb == "drop_dataset":
                RDFRecord.delete_from_index(self.spec)
                DataSet.objects.filter(spec=self.spec).delete()
                logger.info("Deleted dataset {} from index. ".format(self.spec))
            else:
                record_graph_uri = action["graphUri"]
                graph_ntriples = action["graph"]
                # Accept both the string form ("true"/"false") and a JSON
                # boolean; None and everything else count as False.
                acceptance_mode = action.get("acceptanceMode", "false")
                acceptance = str(acceptance_mode).lower() == "true"
                content_hash = action.get("contentHash", None)
                from lod.utils.resolver import ElasticSearchRDFRecord

                record = ElasticSearchRDFRecord(spec=self.spec, rdf_string=graph_ntriples)
                # RDF/XML payloads are recognised by their root element.
                rdf_format = "xml" if "<rdf:RDF" in graph_ntriples else record.DEFAULT_RDF_FORMAT
                try:
                    record.from_rdf_string(
                        rdf_string=graph_ntriples, named_graph=record_graph_uri, input_format=rdf_format
                    )
                except ParseError as e:
                    self.rdf_errors.append((e, action))
                    # Use explicit placeholders: passing ``action`` as the
                    # only logging argument made it a %-format mapping that
                    # was never rendered.
                    logger.error("Unable to parse RDF for %s: %s", record_graph_uri, e)
                    return None
                self.records_stored += 1
                action_key = (record.hub_id, content_hash)
                self.es_actions[action_key] = record.create_es_action(
                    action=process_verb,
                    store=self.store,
                    context=True,
                    flat=True,
                    exclude_fields=None,
                    acceptance=acceptance,
                    doc_type="void_edmrecord",
                    record_type="mdr",
                    content_hash=content_hash,
                )
                if settings.RDF_STORE_TRIPLES:
                    self.sparql_update_queries[action_key] = record.create_sparql_update_query(
                        acceptance=acceptance
                    )
            return record
        except KeyError as ke:
            self.json_errors.append((ke, action))
            self.records_with_errors += 1
            return None
Beispiel #5
0
    def identify(self):
        """Return the OAI-PMH Identify request.

        Sets ``self.template_name`` to the identify template and renders it
        with the repository description assembled below.

        See http://www.openarchives.org/OAI/openarchivesprotocol.html#Identify
        """
        self.template_name = 'oaipmh/identify.xml'
        identify_data = {
            'name': 'OAI-PMH repository for {}'.format(settings.SITE_NAME),
            # perhaps an oai_admins method with default logic settings.admins?
            # A list (not a generator) so the value can be iterated more than once.
            'admins': [email for _name, email in settings.ADMINS],
            'earliest_date': '1990-02-01T12:00:00Z',  # placeholder
            # should probably be a class variable/configuration
            'deleted': 'no',  # no, transient, persistent (?)
            # class-level variable/configuration (may affect templates also)
            'granularity': 'YYYY-MM-DDThh:mm:ssZ',  # or YYYY-MM-DD
            # class-level config?
            'compression': 'deflate',  # gzip?  - optional
            # description - optional
            # (place-holder values from OAI docs example)
            'identifier_scheme': 'oai',
            'repository_identifier': "{}".format(RDFRecord.get_rdf_base_url(prepend_scheme=True)),
            'identifier_delimiter': '_',
            'sample_identifier': '{}_spec_localId'.format(settings.SITE_NAME)
        }
        return self.render_to_response(identify_data)
Beispiel #6
0
 def test_store_remote_cached_resource(self):
     """A fetched remote resource can be stored in the cache graph.

     Before storing, the test store knows no triples about the resource;
     afterwards it does, but without a ``cacheUrl`` triple.
     """
     test_uri = "http://nl.dbpedia.org/resource/Ton_Smits"
     resource = CacheResource.get_remote_lod_resource(test_uri)
     store = rdfstore._rdfstore_test
     assert len(resource) > 0
     store._clear_all()
     graph_store = store.get_graph_store
     cache_graph = "http://{}/resource/cache#graph".format(RDFRecord.get_rdf_base_url())
     any_triple_query = "where {{<{}> ?p ?o}}".format(test_uri)
     cache_url_query = "where {{<{}> <http://schemas.delving.org/nave/terms/cacheUrl> ?o}}".format(
         test_uri)

     # nothing is known about the subject in the freshly cleared store
     self.assertFalse(store.ask(query=any_triple_query))

     response = CacheResource.store_remote_cached_resource(resource, graph_store, cache_graph)
     assert response is not None
     assert response

     # the subject is present after storing
     self.assertTrue(store.ask(query=any_triple_query))
     #  cacheUrl is no longer being added
     self.assertFalse(store.ask(query=cache_url_query))
Beispiel #7
0
 def generate_proxyfield_uri(self, label, language=None):
     """Return the dataset-scoped resource URI for ``label``.

     Spaces in ``label`` become underscores; when ``language`` is given it is
     inserted as a path segment in front of the label. The result has the
     form ``<base>/resource/dataset/<spec>/[<language>/]<label>``.
     """
     path = label.replace(' ', '_')
     if language:
         path = "{}/{}".format(language, path)
     base_url = RDFRecord.get_rdf_base_url(prepend_scheme=True)
     return "{}/resource/dataset/{}/{}".format(base_url, self.spec, path)
Beispiel #8
0
    def get(self, request, *args, **kwargs):
        """Serve the serialized graph (or index document) for the requested URI.

        The request URI is mapped from ``/data/`` to ``/resource/`` and the
        content is taken from, in order of precedence: the cached resources
        (``/resource/cache/`` URIs), the local record when
        ``settings.RDF_USE_LOCAL_GRAPH`` is set, or ``self.store``.

        For the local record the mode returned by ``self.get_mode(request)``
        selects the output: ``context`` serializes the context graph, ``api``
        and ``api-flat`` return a JSON index document, anything else
        serializes the plain record graph.

        :raises UnknownGraph: when the URI is not known to the selected source.
        """
        target_uri = os.path.splitext(request.build_absolute_uri())[0].replace('/data/', '/resource/')
        if not self.request.path.startswith("/data"):
            # drop a two-letter path segment in front of /resource/
            # (presumably a language prefix - confirm against the URL conf)
            target_uri = re.sub('/[a-z]{2}/resource/', '/resource/', target_uri, count=1)
        if target_uri.endswith('graph'):
            # only strip the trailing segment, not every "/graph" in the URI
            target_uri = re.sub(r"/graph$", "", target_uri)
        extension_ = self.kwargs.get('extension')

        rdf_format = mime_to_extension(get_lod_mime_type(extension_, self.request))
        if rdf_format == "rdf":
            rdf_format = "xml"

        resolved_uri = RDFRecord.get_internal_rdf_base_uri(target_uri)
        if "/resource/cache/" in target_uri:
            # old lookup rdfstore.get_rdfstore().get_cached_source_uri(target_uri)
            target_uri = target_uri.split('/resource/cache/')[-1]
            if 'geonames.org' in target_uri:
                target_uri = '{}/'.format(target_uri)
            cache_object = CacheResource.objects.filter(document_uri=target_uri).first()
            if cache_object is None:
                raise UnknownGraph("URI {} is not known in our graph store".format(target_uri))
            content = cache_object.get_graph().serialize(format=rdf_format)
        elif settings.RDF_USE_LOCAL_GRAPH:
            local_object = ElasticSearchRDFRecord(source_uri=resolved_uri)
            local_object.get_graph_by_source_uri(uri=resolved_uri)
            if not local_object.exists():
                # todo: temporary work around for EDMRecords not saved with subjects
                logger.warning("Unable to find graph for: {}".format(resolved_uri))
                raise UnknownGraph("URI {} is not known in our graph store".format(resolved_uri))
            mode = self.get_mode(request)
            if mode in ['context', 'api', 'api-flat']:
                # get_graph(with_mappings=True, include_mapping_target=True, acceptance=acceptance)
                context_graph = local_object.get_context_graph(with_mappings=True, include_mapping_target=True)
                if mode == 'context':
                    content = context_graph.serialize(format=rdf_format)
                else:
                    bindings = GraphBindings(about_uri=resolved_uri, graph=context_graph)
                    index_doc = bindings.to_index_doc() if mode == 'api' else bindings.to_flat_index_doc()
                    content = json.dumps(index_doc)
                    rdf_format = 'json-ld'
            else:
                content = local_object.get_graph().serialize(format=rdf_format)
        elif self.store.ask(uri=resolved_uri):
            content = self.get_content(resolved_uri, rdf_format, request)
        else:
            # Without this branch ``content`` was unbound and the view failed
            # with an UnboundLocalError.
            raise UnknownGraph("URI {} is not known in our graph store".format(resolved_uri))
        return HttpResponse(
            content,
            content_type='{}; charset=utf8'.format(result_extension_to_mime(rdf_format))
        )
Beispiel #9
0
def get_resolved_uri(context, uri):
    """Returns resolved uri, or Cached URI.

    * A ``uri`` without a network location (relative or empty) is returned
      unchanged.
    * When the request host is one of ``settings.RDF_ROUTED_ENTRY_POINTS``
      and the host of ``uri`` is part of the RDF base URL, that host is
      replaced by the request host.
    * When the host of ``uri`` is not part of the request host, the cache URL
      from ``get_cache_url`` is returned.
    * Otherwise ``uri`` is returned unchanged.

    :param context: template context; must contain ``request`` for any
        ``uri`` that has a network location.
    :param uri: the URI to resolve.
    """
    rdf_base = urlparse(uri).netloc
    if not rdf_base:
        # An empty netloc is a substring of every string; the host
        # replacement below would then insert the request host between all
        # characters of the uri.
        return uri
    request = context['request']
    request_base = urlparse(request.build_absolute_uri()).netloc
    if request_base in settings.RDF_ROUTED_ENTRY_POINTS and rdf_base in RDFRecord.get_rdf_base_url():
        return uri.replace(rdf_base, request_base)
    if rdf_base not in request_base:
        return get_cache_url(uri)
    return uri
Beispiel #10
0
 def __init__(self, *args, **kwargs):
     """Initialise the URI bases and namespace bookkeeping of the instance.

     Always sets ``base_uri``, ``ns_dict`` and ``graph`` (``None``). When
     ``get_namespace_prefix()`` returns a value, ``ns`` and ``rdf_type_base``
     are set as well and the prefix is bound on ``namespace_manager`` unless
     the namespace is listed in ``settings.RDF_SUPPORTED_NAMESPACES``.
     """
     super().__init__(*args, **kwargs)
     self.base_uri = '{}/resource'.format(RDFRecord.get_rdf_base_url(prepend_scheme=True))
     prefix = self.get_namespace_prefix()
     if prefix:
         # NOTE(review): only an "http://" prefix is stripped from
         # RDF_BASE_URL before "http://" is prepended again; an "https://"
         # base URL would end up embedded in the namespace - confirm.
         namespace_string = 'http://{}/resource/ns/{}/'.format(
             RDF_BASE_URL.replace("http://", ""),
             prefix
         )
         self.ns = Namespace(namespace_string)
         self.rdf_type_base = Namespace("{}/{}/".format(self.base_uri, self.get_rdf_type().lower()))
         if namespace_string not in settings.RDF_SUPPORTED_NAMESPACES:
             namespace_manager.bind(prefix, self.ns)
     self.ns_dict = dict(namespace_manager.namespaces())
     self.graph = None
Beispiel #11
0
    def process_narthex_file(self, spec, store=None, acceptance=False, path=None, console=False):
        """Load the processed Narthex file of a dataset into the index.

        Lines are collected until ``self.is_line_marker`` reports a marker
        line; the collected lines are then parsed as one RDF/XML record.
        Index actions are sent with ``self.bulk_index`` in batches and, when
        ``settings.RDF_STORE_TRIPLES`` is set, SPARQL updates are sent to the
        store in batches of 50. Finally orphans are removed through
        ``RDFRecord.remove_orphans`` using the start time of this run.

        :param spec: spec of the dataset being loaded.
        :param store: RDF store; ``rdfstore.get_rdfstore()`` when omitted.
        :param acceptance: passed on to ``create_sparql_update_query``.
        :param path: file to read; defaults to
            ``self.get_narthex_processed_fname()``.
        :param console: print progress and record problems instead of only
            logging them.
        :return: tuple ``(lines, records)`` with the number of lines read and
            records encountered.
        """
        start = datetime.now()

        if not store:
            store = rdfstore.get_rdfstore()

        processed_fname = path if path else self.get_narthex_processed_fname()
        print("started processing {} for dataset {}".format(processed_fname, spec))

        rdf_record = []
        lines = 0
        records = 0
        stored = 0  # never incremented here; kept for the summary log line
        new = 0
        sparql_update_queries = []
        es_actions = []

        with open(processed_fname, 'r') as f:
            for line in f:
                lines += 1
                exists, named_graph, _content_hash = self.is_line_marker(line)
                if not exists:
                    rdf_record.append(line)
                    continue

                new += 1
                records += 1
                triples = " ".join(rdf_record)
                rdf_record[:] = []
                record = ElasticSearchRDFRecord(rdf_string=triples, spec=spec)
                try:
                    record.from_rdf_string(named_graph=named_graph, rdf_string=triples, input_format="xml")
                    es_actions.append(record.create_es_action(doc_type="void_edmrecord", record_type="mdr", context=True))
                except Exception as ex:
                    # best effort: one broken record must not abort the whole load
                    problem = "problem with {} for spec {} caused by {}".format(triples, spec, ex)
                    if console:
                        print(problem)
                    else:
                        logger.error(problem)
                else:
                    # only queue a SPARQL update for records that were processed
                    if settings.RDF_STORE_TRIPLES:
                        sparql_update_queries.append(
                            record.create_sparql_update_query(acceptance=acceptance)
                        )
                if settings.RDF_STORE_TRIPLES and len(sparql_update_queries) >= 50:
                    store.update("\n".join(sparql_update_queries))
                    sparql_update_queries[:] = []
                if records % 100 == 0 and records > 0:
                    progress = "processed {} records of {} at {}".format(records, spec, ctime())
                    logger.info(progress)
                    if console:
                        print(progress)
                    # ">=" so a full batch of 100 is flushed right away
                    if len(es_actions) >= 100:
                        self.bulk_index(es_actions, spec)
                        es_actions[:] = []

        # store the remaining bulk items
        self.bulk_index(es_actions, spec)
        if settings.RDF_STORE_TRIPLES and len(sparql_update_queries) > 0:
            store.update("\n".join(sparql_update_queries))
        logger.info(
            "Dataset {}: records inserted {}, records same content hash {}, lines parsed {}, total records processed {}".format(
                spec, new, stored, lines, records)
        )
        print("Finished loading {spec} with {lines} and {records} in {seconds}\n".format(
            spec=spec,
            lines=lines,
            records=records,
            seconds=datetime.now() - start
        ))

        RDFRecord.remove_orphans(spec, start.isoformat())
        return lines, records
Beispiel #12
0
 def save(self, *args, **kwargs):
     """Normalise ``source_uri`` and then save the instance.

     ``/data/`` and ``/page/`` segments are rewritten to ``/resource/`` and
     the result is passed through ``RDFRecord.get_internal_rdf_base_uri``
     before the parent ``save`` is called.
     """
     resource_uri = self.source_uri
     # point to resource and not page or data
     for view_segment in ('/data/', '/page/'):
         resource_uri = resource_uri.replace(view_segment, '/resource/')
     # rewrite to base url
     self.source_uri = RDFRecord.get_internal_rdf_base_uri(resource_uri)
     super(UserGeneratedContent, self).save(*args, **kwargs)
Beispiel #13
0
    def get_context_data(self, **kwargs):
        """Build the template context for the HTML page of a resource.

        The request URI is mapped from ``/page/`` to ``/resource/`` (query
        string, two-letter path prefix and trailing ``/graph`` removed) and the
        graph is loaded from, in order of precedence: the local record (when
        ``settings.RDF_USE_LOCAL_GRAPH`` is set), the cached resources (for
        ``/resource/cache/`` URIs), or ``self.store``.

        When no graph can be found the context is returned early with
        ``unknown_graph`` set to ``True``. Otherwise the context carries the
        graph, its bindings and derived values (``items``, ``rdf_type``,
        ``content_template``, ``graph_stats``, ``points``, deep-zoom URLs and
        more-like-this data).
        """
        # todo later add acceptance mode
        target_uri = self.request.build_absolute_uri().replace('/page/', '/resource/')
        if "?" in target_uri:
            target_uri = re.sub(r"\?.*$", '', target_uri)
        # target_uri = target_uri.split('?')[:-1]
        if not self.request.path.startswith("/page"):
            target_uri = re.sub('/[a-z]{2}/resource/', '/resource/', target_uri, count=1)
        if target_uri.endswith('graph'):
            target_uri = re.sub("/graph$", "", target_uri)
        context = super(LoDHTMLView, self).get_context_data(**kwargs)

        # default and test mode
        # NOTE(review): ``acceptance`` is not consumed below yet (see the todo
        # at the top); HttpRequest.REQUEST only exists on older Django
        # releases - confirm the targeted version.
        mode = self.request.REQUEST.get('mode', 'default')
        acceptance = mode == 'acceptance'
        if not acceptance:
            acceptance = self.request.COOKIES.get('NAVE_ACCEPTANCE_MODE', False)

        object_local_cache = None

        cached = False

        context['about'] = target_uri
        context['ugc'] = None

        if "/resource/cache/" in target_uri:
            # lookup solution # rdfstore.get_rdfstore().get_cached_source_uri(target_uri)
            cached = True
            target_uri = target_uri.split('/resource/cache/')[-1]
            if target_uri.endswith("about.rdf"):
                target_uri = re.sub(r'about\.rdf$', '', target_uri)
        else:
            target_uri = target_uri.rstrip('/')
            resolved_uri = RDFRecord.get_internal_rdf_base_uri(target_uri)
            user_content = UserGeneratedContent.objects.filter(source_uri=resolved_uri)
            if user_content.exists():
                context['ugc'] = user_content
            if settings.RDF_USE_LOCAL_GRAPH:
                object_local_cache = ElasticSearchRDFRecord(source_uri=resolved_uri)
                object_local_cache.get_graph_by_source_uri(uri=resolved_uri)
                if not object_local_cache.exists():
                    context['source_uri'] = target_uri
                    context['unknown_graph'] = True
                    return context
                target_uri = resolved_uri
            elif self.store.ask(uri=resolved_uri):
                target_uri = resolved_uri

        context['source_uri'] = target_uri
        uri_segments = target_uri.split('/')
        context['about_label'] = uri_segments[-1]
        context['about_spec'] = uri_segments[-2]

        context['cached'] = cached

        # special query for skos
        def is_skos():
            return self.store.ask(
                query="where {{<{subject}> <{predicate}> <{object}>}}".format(
                    subject=target_uri, predicate=RDF.type, object=SKOS.Concept))

        if object_local_cache:
            # todo: add code to retrieve proxyresources
            # (with_mappings=True, include_mapping_target=True, acceptance=acceptance)
            graph = object_local_cache.get_context_graph(with_mappings=True, include_mapping_target=True)
            nr_levels = 4
        elif cached:
            cache_object = CacheResource.objects.filter(document_uri=target_uri).first()
            if cache_object is None:
                context['unknown_graph'] = True
                return context
            graph = cache_object.get_graph()
            nr_levels = 3
        elif is_skos():
            graph, nr_levels = RDFModel.get_skos_context_graph(store=self.store, target_uri=target_uri)
            # nav_tree = RDFModel.get_nav_tree(target_uri=target_uri, store=self.store)
            # todo finish the nav tree implementation
            if 'skos_nav' in self.request.GET:
                return context
        elif '/resource/aggregation' in target_uri:
            target_named_graph = "{}/graph".format(target_uri.rstrip('/'))
            graph, nr_levels = RDFModel.get_context_graph(store=self.store, named_graph=target_named_graph)
        else:
            graph, nr_levels = RDFModel.get_context_graph(target_uri=target_uri, store=self.store)
        graph_contains_target = graph.query("""ASK {{ <{}> ?p ?o }} """.format(target_uri)).askAnswer

        if not graph_contains_target or len(graph) == 0:
            context['unknown_graph'] = True
            return context

        context['about'] = context['about'].rstrip('/')

        context['graph'] = graph
        context['nr_levels'] = nr_levels
        context['namespaces'] = [(prefix, uri) for prefix, uri in graph.namespaces()]
        graph_bindings = GraphBindings(target_uri, graph, excluded_properties=settings.RDF_EXCLUDED_PROPERTIES)
        context['skos_links'], context['skos_filter'] = graph_bindings.get_all_skos_links()
        context['resources'] = graph_bindings
        resource = graph_bindings.get_about_resource()
        context['items'] = resource.get_items(as_tuples=True)
        rdf_type = graph_bindings.get_about_resource().get_type()
        context['rdf_type'] = rdf_type
        context['content_template'] = self.get_content_type_template(rdf_type.search_label)
        context['graph_stats'] = RDFModel.get_graph_statistics(graph)
        context['alt'] = ""
        context['points'] = RDFModel.get_geo_points(graph)
        # DEEPZOOM VALUE(S)
        zooms = graph_bindings.get_list('nave_deepZoomUrl')
        if zooms:
            context['deepzoom_count'] = len(zooms)
            context['deepzoom_urls'] = [zoom.value for zoom in zooms]
        # EXPERT MODE
        expert_mode = self.request.COOKIES.get('NAVE_DETAIL_EXPERT_MODE', False)
        if expert_mode:
            # do expert mode stuff like more like this
            context['expert_mode'] = True
            if settings.MLT_DETAIL_ENABLE and object_local_cache:
                context['data'] = {'items': object_local_cache.get_more_like_this()}
        if settings.MLT_BANNERS and isinstance(settings.MLT_BANNERS, dict) and object_local_cache:
            from collections import OrderedDict
            mlt_banners = OrderedDict()
            for name, config in settings.MLT_BANNERS.items():
                mlt_fields = config.get("fields", None)
                if mlt_fields and any(".raw" in field for field in mlt_fields):
                    # .raw fields don't work with MORE LIKE THIS queries so are
                    # queried directly.
                    mlt_banners[name] = object_local_cache.get_raw_related(
                        query_fields=mlt_fields,
                        filter_query=config.get("filter_query", None),
                        graph_bindings=graph_bindings
                    )
                else:
                    mlt_banners[name] = object_local_cache.get_more_like_this(
                        mlt_count=10,
                        mlt_fields=mlt_fields,
                        filter_query=config.get("filter_query", None)
                    )
            # Extend the data dict instead of replacing it, so the 'items'
            # entry added by expert mode above is kept.
            data = context.get('data')
            if not isinstance(data, dict):
                data = {}
            data['mlt_banners'] = mlt_banners
            context['data'] = data
        view_modes = {
            'properties': "rdf/_rdf_properties.html"
        }
        display_mode = self.request.GET.get('display')
        if display_mode:
            self.template_name = view_modes.get(display_mode, self.template_name)

        return context
Beispiel #14
0
def get_external_uri(context, absolute_uri):
    """Delegate to ``RDFRecord.get_external_rdf_url`` with the context's request."""
    current_request = context['request']
    return RDFRecord.get_external_rdf_url(absolute_uri, current_request)
Beispiel #15
0
 def get_hub_id(self):
     """Return ``(hub_id, spec)`` derived from ``self.about_uri``.

     The last two ``/``-separated segments of ``about_uri`` are taken as the
     spec and the local id; the hub id is ``<org_id>_<spec>_<local_id>`` with
     the local id passed through ``RDFRecord.clean_local_id``.
     """
     spec, raw_local_id = self.about_uri.split('/')[-2:]
     cleaned_local_id = RDFRecord.clean_local_id(raw_local_id)
     hub_id = "{}_{}_{}".format(self.org_id, spec, cleaned_local_id)
     return hub_id, spec