Code example #1
def determine_version_filter(version=None,
                             resource_ids=None,
                             resource_ids_and_versions=None):
    '''
    Determine and return the elasticsearch-dsl filter for the version extracted from the given
    parameters.

    :param version: the version to filter on across all resources
    :param resource_ids: the resource ids to search
    :param resource_ids_and_versions: a dict of resource ids -> versions providing resource specific
                                      versions for search
    :return: an elasticsearch-dsl object
    '''
    if not resource_ids_and_versions:
        # default the version to now if necessary
        if version is None:
            version = to_timestamp(datetime.now())
        # just use a single version filter if we don't have any resource specific versions
        return create_version_query(version)
    else:
        # run through the resource specific versions provided and ensure they're rounded down
        indexes_and_versions = {}
        for resource_id in resource_ids:
            target_version = resource_ids_and_versions[resource_id]
            if target_version is None:
                raise toolkit.ValidationError(
                    u"Valid version not given for {}".format(resource_id))
            index = prefix_resource(resource_id)
            rounded_version = common.SEARCH_HELPER.get_rounded_versions(
                [index], target_version)[index]
            indexes_and_versions[index] = rounded_version

        return create_index_specific_version_filter(indexes_and_versions)
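
A rough usage sketch follows (the resource ids, the millisecond timestamps and the surrounding Search object are placeholders for illustration, not values from the extension):

from elasticsearch_dsl import Search

# two hypothetical resources pinned at different millisecond-precision versions
resource_ids = [u'res-aaa', u'res-bbb']
resource_ids_and_versions = {
    u'res-aaa': 1531440000000,
    u'res-bbb': 1546300800000,
}

# because resource specific versions are supplied, the version parameter is
# ignored and an index specific filter is returned
version_filter = determine_version_filter(
    resource_ids=resource_ids,
    resource_ids_and_versions=resource_ids_and_versions)

# the returned elasticsearch-dsl object can be applied like any other filter
search = Search().filter(version_filter)
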
Code example #2
def datastore_delete(resource_id, context, version=None):
    '''
    Deletes the resource from the datastore. In reality the resource data is maintained in its index
    but the latest version of all records is set to an empty record. This means that the old data is
    still accessible to ensure searches using versions before the deletion still work but searches
    at the latest version or later will return no records. The deletion work is done by an rq
    background job and therefore this is an async action.

    :param resource_id: the id of the resource to delete
    :param context: the context dict from the action call
    :param version: the version to mark the deletion at
    :return: a dict containing info about the background job that is doing the delete
    '''
    # if the requested deletion version is missing, default to now
    if version is None:
        version = to_timestamp(datetime.now())

    if is_resource_read_only(resource_id):
        raise toolkit.ValidationError(
            u'This resource has been marked as read only')

    # queue the job
    resource = toolkit.get_action(u'resource_show')(context, {
        u'id': resource_id
    })
    job = queue_deletion(resource, version)
    return {
        u'queued_at': job.enqueued_at.isoformat(),
        u'job_id': job.id,
    }
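
As a hedged illustration, the action might be called directly like this (the resource id and context are made up; the version defaults to now if omitted):

from datetime import datetime

context = {u'user': u'admin', u'ignore_auth': True}

# queue a deletion marked at an explicit version (milliseconds since the epoch)
result = datastore_delete(u'some-resource-id', context,
                          version=to_timestamp(datetime(2018, 7, 13)))
print(result[u'job_id'], result[u'queued_at'])
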
Code example #3
def test_to_timestamp():
    # create a UTC timezone class so that we don't have to use any external libs just for this test
    class UTC(tzinfo):
        def utcoffset(self, dt):
            return timedelta(0)

        def tzname(self, dt):
            return u'UTC'

        def dst(self, dt):
            return timedelta(0)

    utc = UTC()

    # check that dates are treated as utc
    assert to_timestamp(
        datetime.strptime(u'19700101', u'%Y%m%d').replace(tzinfo=utc)) == 0
    # check a later date too
    assert to_timestamp(
        datetime.strptime(u'20180713',
                          u'%Y%m%d').replace(tzinfo=utc)) == 1531440000000
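
For reference, a plausible implementation of to_timestamp that is consistent with this test, shown only as a sketch and not necessarily the extension's actual code: it returns milliseconds since the UNIX epoch, interpreting the datetime as UTC.

import calendar

def to_timestamp_sketch(moment):
    # timegm treats the time tuple as UTC, which matches the assertions above
    return calendar.timegm(moment.utctimetuple()) * 1000 + moment.microsecond // 1000
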
Code example #4
def datastore_count(resource_ids=None, version=None):
    if version is None:
        version = to_timestamp(datetime.now())
    if resource_ids is None:
        resource_ids = [u'*']

    indexes = [
        get_public_alias_name(resource_id) for resource_id in resource_ids
    ]
    search = Search(using=common.ES_CLIENT,
                    index=indexes).filter(create_version_query(version))

    return search.count()
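
A hedged usage sketch (the resource ids and the version value are placeholders):

# count everything available in the datastore at the current version
total = datastore_count()

# count the records in two specific resources at a fixed historical version
total_at_version = datastore_count(resource_ids=[u'res-aaa', u'res-bbb'],
                                   version=1531440000000)
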
Code example #5
    def notify(self, entity, operation):
        '''
        Respond to changes to model objects. We use this hook to ensure any new data is imported
        into the versioned datastore and to make sure the privacy settings on the data are up to
        date. We're only interested in:

            - resource deletions
            - new resources
            - resources that have had changes to their URL
            - packages that have changed

        :param entity: the entity that has changed
        :param operation: the operation undertaken on the object. This will be one of the options
                          from the DomainObjectOperation enum.
        '''
        if isinstance(
                entity,
                model.Package) and operation == DomainObjectOperation.changed:
            # if a package is the target entity and it's been changed ensure the privacy is applied
            # correctly to its resource indexes
            update_resources_privacy(entity)
        elif isinstance(entity, model.Resource):
            context = {u'model': model, u'ignore_auth': True}
            data_dict = {u'resource_id': entity.id}

            if operation == DomainObjectOperation.deleted:
                toolkit.get_action(u'datastore_delete')(context, data_dict)
            else:
                do_upsert = False

                if operation == DomainObjectOperation.new:
                    # datastore_create returns True when the resource looks like it's ingestible
                    do_upsert = toolkit.get_action(u'datastore_create')(
                        context, data_dict)
                elif operation == DomainObjectOperation.changed:
                    # always try the upsert if the resource has changed
                    do_upsert = True

                if do_upsert:
                    # use the revision version as the version
                    data_dict[u'version'] = to_timestamp(
                        entity.revision.timestamp)
                    # use replace to overwrite the existing data (this is what users would expect)
                    data_dict[u'replace'] = True
                    try:
                        toolkit.get_action(u'datastore_upsert')(context,
                                                                data_dict)
                    except (ReadOnlyResourceException,
                            InvalidVersionException):
                        # this is fine, just swallow
                        pass
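
For context, notify is the IDomainObjectModification hook, so a minimal sketch of how it might be wired into a plugin looks roughly like this (the class name is illustrative and not the extension's actual plugin declaration):

import ckan.plugins as plugins

class ExampleVersionedDatastorePlugin(plugins.SingletonPlugin):
    # register for model change notifications so CKAN calls notify()
    plugins.implements(plugins.IDomainObjectModification, inherit=True)

    def notify(self, entity, operation):
        # ... body as shown above ...
        pass
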
Code example #6
def datastore_upsert(resource_id,
                     replace,
                     context,
                     original_data_dict,
                     version=None):
    '''
    Main data ingestion function for the datastore. The URL on the given resource will be used to
    retrieve and then ingest data or, if provided, records will be ingested directly from the
    request. Data is ingested using an rq background job and therefore this is an async action.

    :param resource_id: the resource to ingest the data into
    :param replace: whether to replace the data already in the resource or append to it
    :param context: the context dict from the action call
    :param original_data_dict: the data_dict before it was validated
    :param version: the version of the new data, can be None (default) but if not must be newer
                    than the latest version of the resource
    :return: information about the background job that is handling the ingestion
    '''
    # this comes through as junk if it's not removed before validating. This happens because the
    # data dict is flattened during validation, but why this happens is unclear.
    records = original_data_dict.get(u'records', None)

    if is_resource_read_only(resource_id):
        raise ReadOnlyResourceException(
            u'This resource has been marked as read only')

    if version is None:
        version = to_timestamp(datetime.now())

    # check that the version is valid
    if not check_version_is_valid(resource_id, version):
        raise InvalidVersionException(
            u'The new version must be newer than current version')

    # get the current user
    user = toolkit.get_action(u'user_show')(context, {u'id': context[u'user']})

    # queue the resource import job
    resource = toolkit.get_action(u'resource_show')(context, {
        u'id': resource_id
    })
    job = queue_import(resource, version, replace, records, user[u'apikey'])

    return {
        u'queued_at': job.enqueued_at.isoformat(),
        u'job_id': job.id,
    }
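
A hedged sketch of a direct call that ingests a couple of inline records, replacing the existing data (the resource id, records and context are placeholders):

context = {u'user': u'admin'}
original_data_dict = {
    u'records': [
        {u'scientificName': u'Aurelia aurita', u'count': 3},
        {u'scientificName': u'Quercus robur', u'count': 1},
    ]
}

# version defaults to now; it must be newer than the resource's latest version
job_info = datastore_upsert(u'some-resource-id',
                            replace=True,
                            context=context,
                            original_data_dict=original_data_dict)
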
Code example #7
def datastore_queue_download(email_address,
                             context,
                             query=None,
                             query_version=None,
                             version=None,
                             resource_ids=None,
                             resource_ids_and_versions=None,
                             separate_files=True,
                             format=u'csv',
                             ignore_empty_fields=True):
    '''
    Starts a download of the data found by the given query parameters. This download is created
    asynchronously using the rq background job queue and a link to the results is emailed to the
    given email address when complete.

    :param email_address: the email address to send the download link to
    :param context: the context dict from the action call
    :param query: the query dict. If None (default) then an empty query is used
    :param query_version: the version of the query schema the query is using. If None (default) then
                          the latest query schema version is used
    :param version: the version to search the data at. If None (default) the current time is used
    :param resource_ids: the list of resource ids to search. If None (default) then all the
                         resources the user has access to are queried. If a list of resources is
                         passed then any resources not accessible to the user will be removed
                         before querying
    :param resource_ids_and_versions: a dict of resources and versions to search each of them at.
                                      This allows precise searching of each resource at a specific
                                      version. If None (default) then the resource_ids parameter
                                      is used together with the version parameter. If this parameter
                                      is provided though, it takes priority over the resource_ids
                                      and version parameters.
    :param separate_files: whether to split the results into a file per resource or just put all
                           results in one file. The default is True - split results into a file per
                           resource.
    :param format: the format to download the data in. The default is csv.
    :param ignore_empty_fields: whether to ignore fields with no data in them in the result set
                                and not write them into the download file(s). Default: True.
    :return: a dict containing info about the background job that is doing the downloading and the
             download id
    '''
    if resource_ids_and_versions is None:
        resource_ids_and_versions = {}
    else:
        # prefer the resource_ids_and_versions dict over the resource_ids and version params
        resource_ids = list(resource_ids_and_versions.keys())

    # figure out which resources should be searched
    resource_ids = get_available_datastore_resources(context, resource_ids)
    if not resource_ids:
        raise toolkit.ValidationError(
            u"The requested resources aren't accessible to this user")

    rounded_resource_ids_and_versions = {}
    # see if a version was provided; we'll use it if a resource id we're searching doesn't have a
    # directly assigned version (i.e. it was absent from the resource_ids_and_versions dict, or
    # that parameter wasn't provided)
    if version is None:
        version = to_timestamp(datetime.now())
    for resource_id in resource_ids:
        # try to get the target version from the passed resource_ids_and_versions dict, but if
        # it's not in there, default to the version variable
        target_version = resource_ids_and_versions.get(resource_id, version)
        index = prefix_resource(resource_id)
        # round the version down to the nearest version that actually exists in the index
        rounded_version = common.SEARCH_HELPER.get_rounded_versions(
            [index], target_version)[index]
        if rounded_version is not None:
            # resource ids without a rounded version are skipped
            rounded_resource_ids_and_versions[resource_id] = rounded_version

    # setup the query
    if query is None:
        query = {}
    if query_version is None:
        query_version = get_latest_query_version()
    validate_query(query, query_version)
    search = translate_query(query, query_version)
    query_hash = hash_query(query, query_version)

    options = {
        u'separate_files': separate_files,
        u'format': format,
        u'ignore_empty_fields': ignore_empty_fields
    }
    download = DatastoreDownload(
        query_hash=query_hash,
        query=query,
        query_version=query_version,
        resource_ids_and_versions=rounded_resource_ids_and_versions,
        state=u'queued',
        options=options)
    download.save()

    job = queue_download(email_address,
                         download.id, query_hash, query, query_version,
                         search.to_dict(), rounded_resource_ids_and_versions,
                         separate_files, format, ignore_empty_fields)

    return {
        u'queued_at': job.enqueued_at.isoformat(),
        u'job_id': job.id,
        u'download_id': download.id,
    }
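
A hedged example of queuing a single-file CSV download of one resource at the current version (the email address and resource id are invented; the query defaults are used):

result = datastore_queue_download(u'someone@example.com',
                                  context={u'user': u'admin'},
                                  resource_ids=[u'some-resource-id'],
                                  separate_files=False,
                                  format=u'csv')

# both the rq job id and the DatastoreDownload id are returned
print(result[u'download_id'], result[u'job_id'])
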
Code example #8
def datastore_guess_fields(context, query=None, query_version=None, version=None, resource_ids=None,
                           resource_ids_and_versions=None, size=10, ignore_groups=None):
    '''
    Guesses the fields that are most relevant to show with the given query.

    If only one resource is included in the search then the requested number of fields from the
    resource at the required version is returned, in ingest order, if the details are available.

    If multiple resources are queried, the most common fields across the resources under search are
    returned. The fields are grouped together in an attempt to match the same field name in
    different cases across different resources. The most common {size} groups are returned.

    The groups returned are ordered first by the number of resources they appear in (descending),
    then, in the case of ties, by the number of records each group matches (also descending).

    :param context: the context dict from the action call
    :param query: the query
    :param query_version: the query schema version
    :param version: the version to search at
    :param resource_ids: the resource ids to search in
    :param resource_ids_and_versions: a dict of resource ids -> versions to search at
    :param size: the number of groups to return
    :param ignore_groups: a list of groups to ignore from the results (default: None)
    :return: a list of groups
    '''
    # provide some more complex defaults for some parameters if necessary
    if query is None:
        query = {}
    if query_version is None:
        query_version = get_latest_query_version()
    ignore_groups = set(g.lower() for g in ignore_groups) if ignore_groups is not None else set()

    try:
        # validate and translate the query into an elasticsearch-dsl Search object
        validate_query(query, query_version)
        search = translate_query(query, query_version)
    except (jsonschema.ValidationError, InvalidQuerySchemaVersionError) as e:
        raise toolkit.ValidationError(e.message)

    # figure out which resources we're searching
    resource_ids, skipped_resource_ids = determine_resources_to_search(context, resource_ids,
                                                                       resource_ids_and_versions)
    if not resource_ids:
        raise toolkit.ValidationError(u"The requested resources aren't accessible to this user")

    if version is None:
        version = to_timestamp(datetime.now())
    # add the version filter necessary given the parameters and the resources we're searching
    version_filter = determine_version_filter(version, resource_ids, resource_ids_and_versions)
    search = search.filter(version_filter)

    # add the size parameter, we don't want any records back
    search = search.extra(size=0)

    resource_ids = find_searched_resources(search, resource_ids)

    all_fields = get_all_fields(resource_ids)
    for group in ignore_groups:
        all_fields.ignore(group)

    # allow plugins to modify the fields object
    for plugin in PluginImplementations(IVersionedDatastore):
        all_fields = plugin.datastore_modify_guess_fields(resource_ids, all_fields)

    if len(resource_ids) == 1:
        resource_id = resource_ids[0]
        if resource_ids_and_versions is None:
            up_to_version = version
        else:
            up_to_version = resource_ids_and_versions[resource_id]
        return get_single_resource_fields(all_fields, resource_id, up_to_version, search, size)
    else:
        size = max(0, min(size, 25))
        return select_fields(all_fields, search, size)
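
A hedged usage sketch asking for the five most relevant field groups across two resources (the resource ids and the ignored group name are placeholders):

groups = datastore_guess_fields(context={u'user': u'admin'},
                                resource_ids=[u'res-aaa', u'res-bbb'],
                                size=5,
                                ignore_groups=[u'occurrenceID'])

# the return value is a list of field groups, ordered as described above
for group in groups:
    print(group)
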
Code example #9
def datastore_search(context, data_dict, original_data_dict):
    '''
    Searches the datastore using a query schema similar to the standard CKAN datastore query schema,
    but with versioning.

    :param context: the context dict from the action call
    :param data_dict: the data_dict from the action call
    :param original_data_dict: the data_dict before it was validated
    :return: a dict including the search results amongst other things
    '''
    original_data_dict, data_dict, version, search = create_search(
        context, data_dict, original_data_dict)
    resource_id = data_dict[u'resource_id']
    index_name = prefix_resource(resource_id)

    # if the version is None, default it to the current timestamp
    if version is None:
        version = to_timestamp(datetime.now())

    # add the version filter to the query
    search = search.filter(create_version_query(version))

    # if the run query option is false (it defaults to true if not present) then just return the
    # query we would have run against elasticsearch instead of actually running it. This is useful
    # for running the query outside of ckan, for example on a tile server.
    if not data_dict.get(u'run_query', True):
        return {
            u'indexes': [index_name],
            u'search': search.to_dict(),
        }
    else:
        result = run_search(search, [index_name])

        # allow other extensions implementing our interface to modify the result
        for plugin in PluginImplementations(IVersionedDatastore):
            result = plugin.datastore_modify_result(context,
                                                    original_data_dict,
                                                    data_dict, result)

        # add the actual result object to the context in case the caller is an extension and they
        # have used one of the interface hooks to alter the search object and include, for example,
        # an aggregation
        context[u'versioned_datastore_query_result'] = result

        # get the fields
        mapping, fields = get_fields(resource_id, version)
        # allow other extensions implementing our interface to modify the field definitions
        for plugin in PluginImplementations(IVersionedDatastore):
            fields = plugin.datastore_modify_fields(resource_id, mapping,
                                                    fields)

        query_for_logging = {}
        for key in _query_log_keys:
            if data_dict.get(key, None):
                query_for_logging[key] = data_dict[key]
        log_query(query_for_logging, u'basicsearch')

        # return a dictionary containing the results and other details
        return {
            u'total': result.hits.total,
            u'records': [hit.data.to_dict() for hit in result],
            u'facets': format_facets(result.aggs.to_dict()),
            u'fields': fields,
            u'raw_fields': mapping[u'mappings'][DOC_TYPE][u'properties'][u'data'][
                u'properties'],
            u'after': get_last_after(result.hits),
            u'_backend': u'versioned-datastore',
        }
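
A hedged example of the kind of data_dict a caller might pass; with run_query set to False the function returns the index and the elasticsearch query it would have executed rather than running it. The resource id and search terms are placeholders, and the q/limit keys are assumed to follow the standard CKAN datastore search schema.

data_dict = {
    u'resource_id': u'some-resource-id',
    u'q': u'banana',
    u'limit': 10,
    u'run_query': False,
}
preview = datastore_search({u'user': u'admin'}, data_dict, dict(data_dict))
# preview[u'indexes'] names the elasticsearch index, preview[u'search'] is the query dict
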
Code example #10
def datastore_search_raw(resource_id,
                         context,
                         data_dict,
                         original_data_dict,
                         search=None,
                         version=None,
                         raw_result=False,
                         include_version=True):
    '''
    Searches the datastore using a raw elasticsearch query.

    :param resource_id: the id of the resource to search
    :param context: the context dict from the action call
    :param data_dict: the data_dict from the action call
    :param original_data_dict: the data_dict before it was validated
    :param search: the elasticsearch query to run
    :param version: the version of the data to query against
    :param raw_result: whether to return the result as a raw elasticsearch result, or format it in
                       the same way as a normal datastore_search call would
    :param include_version: whether to include the version in the search or not
    :return: a dict containing the results of the search
    '''
    if search is None:
        search = {}
    if version is None:
        version = to_timestamp(datetime.now())
    index_name = prefix_resource(resource_id)
    search = Search.from_dict(search)

    try:
        # the user has asked for a raw result and for the version filter to be left out
        if raw_result and not include_version:
            version = None

        # run the query passing the version which will either be the requested version, the current
        # timestamp or None if no version filter should be included in the search
        result = run_search(search, index_name, version)

        if raw_result:
            return result.to_dict()

        # allow other extensions implementing our interface to modify the result object
        for plugin in PluginImplementations(IVersionedDatastore):
            result = plugin.datastore_modify_result(context,
                                                    original_data_dict,
                                                    data_dict, result)

        # add the actual result object to the context in case the caller is an extension and
        # they have used one of the interface hooks to alter the search object and include, for
        # example, an aggregation
        context[u'versioned_datastore_query_result'] = result

        # get the fields
        mapping, fields = get_fields(resource_id, version)
        # allow other extensions implementing our interface to modify the field definitions
        for plugin in PluginImplementations(IVersionedDatastore):
            fields = plugin.datastore_modify_fields(resource_id, mapping,
                                                    fields)

        # return a dictionary containing the results and other details
        return {
            u'total': result.hits.total,
            u'records': [hit.data.to_dict() for hit in result],
            u'facets': format_facets(result.aggs.to_dict()),
            u'fields': fields,
            u'raw_fields': mapping[u'mappings'][DOC_TYPE][u'properties'][u'data'][
                u'properties'],
            u'after': get_last_after(result.hits),
            u'_backend': u'versioned-datastore',
        }
    except RequestError as e:
        raise toolkit.ValidationError(str(e))
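
A hedged example running a raw elasticsearch query against a single resource and asking for the unformatted elasticsearch response back (the resource id and the field used in the term query are placeholders):

raw_query = {
    u'query': {u'term': {u'data.genus': u'quercus'}},
    u'size': 5,
}
result = datastore_search_raw(u'some-resource-id',
                              context={u'user': u'admin'},
                              data_dict={},
                              original_data_dict={},
                              search=raw_query,
                              raw_result=True)
# result is the plain elasticsearch response dict, e.g. result[u'hits']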