def teardown(self):
    '''Nose runs this method after each test method in our test class.'''
    # Rebuild CKAN's database after each test method, so that each test
    # method runs with a clean slate.
    model.repo.rebuild_db()
    search.index_for('Package').clear()
    search.rebuild()
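For symmetry, the per-test setup in these Nose test classes typically performs the same reset before each test; a minimal sketch, reusing the model and search calls shown above:

def setup(self):
    '''Nose runs this method before each test method in our test class.'''
    # Start every test from a clean database and a freshly built search index.
    model.repo.rebuild_db()
    search.index_for('Package').clear()
    search.rebuild()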
Example 2
def send_task(name, args, **opts):
    # Test stub standing in for Celery's send_task: it applies the expected
    # extraction side effects synchronously and re-indexes the package.
    res_dict = args[1]
    metadata = get_metadata(res_dict)
    metadata.last_format = res_dict['format']
    metadata.last_url = res_dict['url']
    metadata.last_extracted = datetime.datetime.now()
    metadata.task_id = None
    metadata.meta.update(METADATA)
    metadata.save()
    pkg_dict = helpers.call_action('package_show', id=res_dict['package_id'])
    index_for('package').update_dict(pkg_dict)
Example 4
def test_index_clear(self):
    pkg_dict = {
        'id': u'penguin-id',
        'title': u'penguin',
        'state': u'active'
    }
    search.dispatch_by_operation('Package', pkg_dict, 'new')
    response = self.solr.query('title:penguin', fq=self.fq)
    assert len(response) == 1, len(response)
    search.index_for('Package').clear()
    response = self.solr.query('title:penguin', fq=self.fq)
    assert len(response) == 0

def setup_class(cls):
    '''Nose runs this method once to set up our test class.'''
    # Test code should use CKAN's plugins.load() function to load plugins
    # to be tested.
    ckan.plugins.load('oaipmh_repository')

    model.repo.rebuild_db()
    search.index_for('Package').clear()
    search.rebuild()

    Converters().converters_dict = {}
    Converters().set_converter(TestOAIDCConverter())

def test_index_clear(self):
    pkg_dict = {
        'id': u'penguin-id',
        'title': u'penguin',
        'state': u'active',
        'metadata_created': datetime.now().isoformat(),
        'metadata_modified': datetime.now().isoformat(),
    }
    search.dispatch_by_operation('Package', pkg_dict, 'new')
    response = self.solr.query('title:penguin', fq=self.fq)
    assert len(response) == 1, len(response)
    search.index_for('Package').clear()
    response = self.solr.query('title:penguin', fq=self.fq)
    assert len(response) == 0
Example 7
def extract(ini_path, res_dict):
    """
    Download resource, extract and store metadata.

    The extracted metadata is stored in the database.

    Note that this task does not check whether the resource exists in the
    database, whether the resource's format is indexed or whether there
    is an existing task working on the resource's metadata. This is the
    responsibility of the caller.

    The task does check which metadata fields are configured to be
    indexed and only stores those in the database.

    Any previously stored metadata for the resource is cleared.
    """
    load_config(ini_path)
    try:
        metadata = ResourceMetadata.one(resource_id=res_dict['id'])
    except NoResultFound:
        metadata = ResourceMetadata.create(resource_id=res_dict['id'])
    try:
        metadata.last_url = res_dict['url']
        metadata.last_format = res_dict['format']
        metadata.last_extracted = datetime.datetime.now()
        metadata.meta.clear()
        extracted = download_and_extract(res_dict['url'])
        for plugin in PluginImplementations(IExtractorPostprocessor):
            plugin.extractor_after_extract(res_dict, extracted)
        for key, value in extracted.iteritems():
            if is_field_indexed(key):
                metadata.meta[key] = value
    finally:
        metadata.task_id = None
        metadata.save()

    for plugin in PluginImplementations(IExtractorPostprocessor):
        plugin.extractor_after_save(res_dict, metadata.as_dict())

    # We need to update the search index for the package here. Note that
    # we cannot rely on the automatic update that happens when a resource
    # is changed, since our extraction task runs asynchronously and may
    # be finished only when the automatic index update has already run.
    pkg_dict = toolkit.get_action('package_show')({}, {
        'id': res_dict['package_id']
    })
    index_for('package').update_dict(pkg_dict)

    for plugin in PluginImplementations(IExtractorPostprocessor):
        plugin.extractor_after_index(res_dict, metadata.as_dict())
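The extractor_after_extract / extractor_after_save / extractor_after_index calls above outline the whole postprocessor surface. A minimal sketch of a plugin implementing it; the import path for IExtractorPostprocessor is an assumption, as the snippet does not show where the interface lives:

import ckan.plugins as plugins
from ckanext.extractor.interfaces import IExtractorPostprocessor  # assumed path

class NoopPostprocessor(plugins.SingletonPlugin):
    plugins.implements(IExtractorPostprocessor)

    def extractor_after_extract(self, res_dict, extracted):
        # Runs right after download_and_extract(); may mutate `extracted`
        # before the indexed fields are filtered and stored.
        pass

    def extractor_after_save(self, res_dict, metadata_dict):
        # Runs once the metadata row has been saved.
        pass

    def extractor_after_index(self, res_dict, metadata_dict):
        # Runs after the package's search index entry has been updated.
        pass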
Example 8
def enqueue_job(name, args, **opts):
    # Test stub standing in for a background-job enqueue call: it does the
    # work inline and returns a mock job object instead of queuing a task.
    res_dict = args[1]
    try:
        metadata = get_metadata(res_dict)
    except NoResultFound:
        metadata = ResourceMetadata.create(resource_id=res_dict['id'])
    metadata.last_format = res_dict['format']
    metadata.last_url = res_dict['url']
    metadata.last_extracted = datetime.datetime.now()
    metadata.task_id = None
    metadata.meta.update(METADATA)
    metadata.save()
    pkg_dict = helpers.call_action('package_show', id=res_dict['package_id'])
    index_for('package').update_dict(pkg_dict)
    return mock.Mock(id=None)
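A sketch of wiring such a fake into a test with mock.patch; both the patch target and the action name are assumptions, since the snippet does not show where the real enqueue function is looked up:

import mock

with mock.patch('ckanext.extractor.logic.action.enqueue_job',  # hypothetical target
                side_effect=enqueue_job):
    # hypothetical action that would normally enqueue the background job
    helpers.call_action('extractor_extract', id=res_dict['id'])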
Example 10
    def test_search_geographies(self, app):

        # clear and rebuild the index
        package_index = search.index_for(model.Package)
        package_index.clear()
        search.rebuild()

        expected = []
        for key, geo in self.geogs.items():
            expected.extend([geo.gis_name, geo.pcode])

        data_dicts = [{'q': term} for term in expected]
        context = {'ignore_auth': True}
        for data_dict in data_dicts:
            packages = toolkit.get_action('package_search')(context, data_dict)

            # Check responses
            from_gis2 = [
                self.unrelated['20DEU010004'].pcode,
                self.unrelated['20DEU010004'].gis_name
            ]
            if data_dict['q'] in from_gis2:
                should_be = self.gis_dataset2['id']
            else:
                should_be = self.gis_dataset1['id']

            assert should_be in [result['id'] for result in packages['results']]
Example 11
def update(ini_path, resource_dict):
    try:
        package_dict = toolkit.get_action('package_show')(
            {
                'validate': False
            }, {
                'id': resource_dict['package_id']
            })
    except toolkit.NotAuthorized:
        log.debug(('Not indexing resource {} since it belongs to the ' +
                   'private dataset {}.').format(resource_dict['id'],
                                                 resource_dict['package_id']))
        return

    for resource in package_dict.get('resources', []):
        if resource['id'] == resource_dict['id']:
            resource_dict = resource
            break

    try:
        index_info = ResourceIndexInfo.one(resource_id=resource_dict['id'])
    except NoResultFound:
        index_info = ResourceIndexInfo.create(resource_id=resource_dict['id'])

    try:
        solr = Solr('record')
        solr.delete(resource_dict['id'])

        index_info.indexed = datetime.datetime.now()
        base_record = create_base_record(package_dict, resource_dict)

        if resource_dict['format'] == 'GeoJSON':
            records = geojson_to_records(resource_dict['url'], base_record)
        else:
            records = table_to_records(resource_dict['url'], base_record)

        solr.store(records)

    except RequestException as e:
        log.warn('Failed to download resource data from "{}": {}'.format(
            resource_dict['url'], e.message))
    finally:
        index_info.task_id = None
        index_info.save()

    index_for('package').update_dict(package_dict)
Example 12
def package_create(context, data_dict):
    pkg_dict1 = ckan.logic.action.create.package_create(context, data_dict)
    context = {'model': model, 'ignore_auth': True, 'validate': False,
               'extras_as_string': False}
    pkg_dict = ckan.logic.action.get.package_show(context, pkg_dict1)
    index = index_for('package')
    index.index_package(pkg_dict)
    return pkg_dict1
Example 13
def package_update(context, data_dict):
    '''
    Updates the dataset.

    Extends CKAN's package_update to immediately re-index the dataset in
    Solr; otherwise the changes would only become visible after the next
    rebuild of the search index.

    :type context: dict
    :param context: context
    :type data_dict: dict
    :param data_dict: dataset as dictionary

    :rtype: dictionary
    '''
    # Get all resources here since we get only 'dataset' resources from WUI.
    package_context = {'model': model, 'ignore_auth': True, 'validate': True,
                       'extras_as_string': True}

    user = model.User.get(context['user'])
    if user.name != "harvest":
        _remove_extras_from_data_dict(data_dict)

    package_data = package_show(package_context, data_dict)

    if 'resources' not in data_dict:
        # When this is reached, we are updating a dataset, not creating a new resource
        old_resources = package_data.get('resources', [])
        data_dict['resources'] = old_resources
        data_dict = utils.dataset_to_resource(data_dict)
    else:
        data_dict['accept-terms'] = 'yes'  # This is not needed when adding a resource

    _handle_pids(data_dict)

    _add_ida_download_url(data_dict)

    if asbool(data_dict.get('private')) and not data_dict.get('persist_schema'):
        context['schema'] = Schemas.private_package_schema()

    data_dict.pop('persist_schema', False)

    if package_data.get('type') == 'harvest':
        context['schema'] = Schemas.harvest_source_update_package_schema()

    pkg_dict1 = ckan.logic.action.update.package_update(context, data_dict)

    # Logging for production use
    _log_action('Package', 'update', context['user'], data_dict['id'])

    context = {'model': model, 'ignore_auth': True, 'validate': False,
               'extras_as_string': True}
    pkg_dict = ckan.logic.action.get.package_show(context, pkg_dict1)
    index = index_for('package')
    # update_dict calls index_package, so it would basically be the same
    index.update_dict(pkg_dict)

    return pkg_dict1
Example 14
def test_index_clear(self):
    pkg_dict = {
        "id": u"penguin-id",
        "title": u"penguin",
        "state": u"active",
        "private": False,
        "owner_org": None,
        "metadata_created": datetime.now().isoformat(),
        "metadata_modified": datetime.now().isoformat(),
    }
    search.dispatch_by_operation("Package", pkg_dict, "new")
    response = self.solr.query("title:penguin", fq=self.fq)
    assert len(response) == 1, len(response)
    search.index_for("Package").clear()
    response = self.solr.query("title:penguin", fq=self.fq)
    assert len(response) == 0
    # clear whilst empty
    search.index_for("Package").clear()
    response = self.solr.query("title:penguin", fq=self.fq)
    assert len(response) == 0
Example 15
def extract(ini_path, res_dict):
    """
    Download resource, extract and store metadata.

    The extracted metadata is stored in the database.

    Note that this task does not check whether the resource exists in the
    database, whether the resource's format is indexed or whether there
    is an existing task working on the resource's metadata. This is the
    responsibility of the caller.

    The task does check which metadata fields are configured to be
    indexed and only stores those in the database.

    Any previously stored metadata for the resource is cleared.
    """
    load_config(ini_path)
    try:
        metadata = ResourceMetadata.one(resource_id=res_dict['id'])
    except NoResultFound:
        metadata = ResourceMetadata.create(resource_id=res_dict['id'])
    try:
        metadata.last_url = res_dict['url']
        metadata.last_format = res_dict['format']
        metadata.last_extracted = datetime.datetime.now()
        metadata.meta.clear()
        extracted = download_and_extract(res_dict['url'])
        for key, value in extracted.iteritems():
            if is_field_indexed(key):
                metadata.meta[key] = value
    finally:
        metadata.task_id = None
        metadata.save()

    # We need to update the search index for the package here. Note that
    # we cannot rely on the automatic update that happens when a resource
    # is changed, since our extraction task runs asynchronously and may
    # be finished only when the automatic index update has already run.
    pkg_dict = toolkit.get_action('package_show')(
            {}, {'id': res_dict['package_id']})
    index_for('package').update_dict(pkg_dict)
Example 16
    def setup_class(cls):
        '''
        Set up test class
        '''
        super(TestSearchDataset, cls).setup_class()

        package_index = search.index_for(model.Package)
        package_index.clear()

        data_dict = copy.deepcopy(cls.TEST_DATADICT)  # Create public dataset

        # Create a dataset for this test class
        output = cls.api_user_sysadmin.call_action('package_create', data_dict=data_dict)

        cls.package_id = output.get('id')
Example 18
def package_create(context, data_dict):
    """
    Creates a new dataset.

    Extends CKAN's package_create to immediately re-index the new package
    in Solr, so that it appears in search results right away instead of
    after the next scheduled reindexing.

    :param context: context
    :param data_dict: data dictionary (package data)

    :rtype: dictionary
    """
    user = model.User.get(context['user'])
    if data_dict.get('type') == 'harvest' and not user.sysadmin:
        ckan.lib.base.abort(401, _('Unauthorized to add a harvest source'))

    if user.name != "harvest":
        _remove_extras_from_data_dict(data_dict)

    data_dict = utils.dataset_to_resource(data_dict)

    if user.name != 'harvest':
        _handle_package_id_on_create(data_dict)
    _handle_pids(data_dict)

    _add_ida_download_url(data_dict)
    
    if asbool(data_dict.get('private')) and not data_dict.get('persist_schema'):
        context['schema'] = Schemas.private_package_schema()

    data_dict.pop('persist_schema', False)

    if data_dict.get('type') == 'harvest':
        context['schema'] = Schemas.harvest_source_create_package_schema()

    pkg_dict1 = ckan.logic.action.create.package_create(context, data_dict)

    # Logging for production use
    _log_action('Package', 'create', context['user'], pkg_dict1['id'])

    context = {'model': model, 'ignore_auth': True, 'validate': False,
               'extras_as_string': False}
    pkg_dict = ckan.logic.action.get.package_show(context, pkg_dict1)
    index = index_for('package')
    index.index_package(pkg_dict)
    return pkg_dict1
Example 19
def package_create(context, data_dict):
    """
    Creates a new dataset.

    Extends CKAN's package_create to immediately re-index the new package
    in Solr, so that it appears in search results right away instead of
    after the next scheduled reindexing.

    :param context: context
    :param data_dict: data dictionary (package data)

    :rtype: dictionary
    """
    user = model.User.get(context['user'])
    try:
        if data_dict['type'] == 'harvest' and not user.sysadmin:
            ckan.lib.base.abort(401, _('Unauthorized to add a harvest source'))

    except KeyError:
        log.debug("Tried to check the package type, but it wasn't present!")
        # TODO: JUHO: Dubious to let pass without checking user.sysadmin
        pass

    data_dict = utils.dataset_to_resource(data_dict)

    _handle_pids(context, data_dict)

    _add_ida_download_url(context, data_dict)
    if data_dict.get('type') == 'harvest':
        context['schema'] = Schemas.harvest_source_create_package_schema()

    pkg_dict1 = ckan.logic.action.create.package_create(context, data_dict)

    # Logging for production use
    _log_action('Package', 'create', context['user'], pkg_dict1['id'])

    context = {'model': model, 'ignore_auth': True, 'validate': False,
               'extras_as_string': False}
    pkg_dict = ckan.logic.action.get.package_show(context, pkg_dict1)
    index = index_for('package')
    index.index_package(pkg_dict)
    return pkg_dict1
Example 20
def package_create(context, data_dict):
    """
    Creates a new dataset.

    Extends CKAN's package_create to immediately re-index the new package
    in Solr, so that it appears in search results right away instead of
    after the next scheduled reindexing.

    :param context: context
    :param data_dict: data dictionary (package data)

    :rtype: dictionary
    """
    user = model.User.get(context['user'])
    if data_dict.get('type') == 'harvest' and not user.sysadmin:
        ckan.lib.base.abort(401, _('Unauthorized to add a harvest source'))

    data_dict = utils.dataset_to_resource(data_dict)

    _handle_pids(context, data_dict)

    _add_ida_download_url(context, data_dict)
    
    if asbool(data_dict.get('private')) and not data_dict.get('persist_schema'):
        context['schema'] = Schemas.private_package_schema()

    data_dict.pop('persist_schema', False)

    if data_dict.get('type') == 'harvest':
        context['schema'] = Schemas.harvest_source_create_package_schema()

    pkg_dict1 = ckan.logic.action.create.package_create(context, data_dict)

    # Logging for production use
    _log_action('Package', 'create', context['user'], pkg_dict1['id'])

    context = {'model': model, 'ignore_auth': True, 'validate': False,
               'extras_as_string': False}
    pkg_dict = ckan.logic.action.get.package_show(context, pkg_dict1)
    index = index_for('package')
    index.index_package(pkg_dict)
    return pkg_dict1
Example 21
    def test_search_index_rebuild_sysadmin(self, app):
        user = core_factories.Sysadmin()
        data_dict = {'q': '*:*', 'rows': 0}
        context = {'ignore_auth': True}

        # create a dataset
        factories.Dataset()
        package_index = search.index_for(model.Package)
        # clear the index
        package_index.clear()
        # package_search tells us there are 0 datasets
        packages = toolkit.get_action('package_search')(context, data_dict)
        assert 0 == packages['count']

        # invoke a search_index_rebuild
        env = {'REMOTE_USER': user['name'].encode('ascii')}
        app.post('/ckan-admin/search_index/rebuild', extra_environ=env, status=200)

        # now package_search will tell us there is 1 dataset
        packages = toolkit.get_action('package_search')(context, data_dict)
        assert 1 == packages['count']
Example 22
def package_delete(context, data_dict):
    '''
    Deletes a package

    Extends CKAN's package_delete to immediately remove the dataset from
    the Solr index; otherwise the change would only become visible after
    the next rebuild of the search index.

    :param context: context
    :type context: dictionary
    :param data_dict: package data
    :type data_dict: dictionary

    '''
    # Logging for production use
    _log_action('Package', 'delete', context['user'], data_dict['id'])

    ret = ckan.logic.action.delete.package_delete(context, data_dict)
    index = index_for('package')
    index.remove_dict(data_dict)
    return ret
Example 24
    def before_commit(self, session):
        if not hasattr(session, '_object_cache'):
            return

        changed = session._object_cache["changed"]
        context = {"model": ckan_model}
        package_index = index_for(ckan_model.Package)

        for model_obj in set(changed):
            if not isinstance(model_obj, PackageMarsavin):
                continue
            log.debug(
                "Changed Object: {the_object}".format(the_object=model_obj))
            package_id = model_obj.package_id
            pkg_dict = toolkit.get_action('package_show')(context, {
                'id': package_id
            })
            # since we have an update on our secondary table, we want to send
            # this updated data to the search index
            log.info('Indexing just package %r...', pkg_dict['name'])
            package_index.remove_dict(pkg_dict)
            package_index.insert_dict(pkg_dict)
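The remove_dict/insert_dict pair above is a manual refresh of one package's index entry; judging by the comment in Example 13 ("update_dict calls index_package"), the same effect could presumably be had with a single call:

package_index.update_dict(pkg_dict)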
Example 25
def search_index_update(context, data_dict):
    '''
    Tells CKAN to update its search index for a given package.

    This is needed because the QA value (and archiver is_broken) is added to
    the search index by other extensions (like ckanext-dgu).  TODO: Probably
    better to create a notification that another extension (like ckanext-dgu)
    can trigger it itself.
    '''
    model = context['model']
    #session = context['session']
    #user = context.get('user')
    p.toolkit.check_access('search_index_update', context, data_dict)

    pkg_dict = p.toolkit.get_action('package_show')(
        {'model': model, 'ignore_auth': True, 'validate': False,
         'use_cache': False},
        data_dict)

    indexer = index_for('package')
    indexer.update_dict(pkg_dict)

    log.info('Search index updated for: %s', pkg_dict['name'])
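A short usage sketch for this action, assuming it is registered under the name used above; the dataset id is a placeholder:

import ckan.plugins.toolkit as toolkit

context = {'ignore_auth': True}
toolkit.get_action('search_index_update')(context, {'id': 'some-dataset-id'})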
Example 26
def setup_class(cls):
    cls.search = SearchIndexCommand('search-index')
    cls.index = index_for(model.Package)
    cls.query = query_for(model.Package)
    CreateTestData.create()
Example 27
def teardown_class(cls):
    model.repo.rebuild_db()
    search.index_for('Package').clear()
Example 29
def teardown(self):
    # clear the search index after every test
    search.index_for('Package').clear()
Example 30
def teardown_class(cls):
    model.repo.rebuild_db()
    cls.solr.close()
    search.index_for('Package').clear()
Example 32
def initial_data(self, clean_db):
    self.search = SearchIndexCommand("search-index")
    self.index = index_for(model.Package)
    self.query = query_for(model.Package)
    CreateTestData.create()
Example 33
def package_update(context, data_dict):
    '''
    Updates the dataset.

    Extends CKAN's package_update to immediately re-index the dataset in
    Solr; otherwise the changes would only become visible after the next
    rebuild of the search index.

    :type context: dict
    :param context: context
    :type data_dict: dict
    :param data_dict: dataset as dictionary

    :rtype: dictionary
    '''
    # Get all resources here since we get only 'dataset' resources from WUI.
    package_context = {'model': model, 'ignore_auth': True, 'validate': True,
                       'extras_as_string': True}
    package_data = package_show(package_context, data_dict)
    # package_data = ckan.logic.action.get.package_show(package_context, data_dict)

    old_resources = package_data.get('resources', [])

    if 'resources' not in data_dict:
        # When this is reached, we are updating a dataset, not creating a new resource
        data_dict['resources'] = old_resources
        data_dict = utils.dataset_to_resource(data_dict)
    else:
        data_dict['accept-terms'] = 'yes'  # This is not needed when adding a resource

    _handle_pids(context, data_dict)

    _add_ida_download_url(context, data_dict)

    # # Check if data version has changed and if so, generate a new version_PID
    # if not data_dict['version'] == temp_pkg_dict['version']:
    #     data_dict['pids'].append(
    #         {
    #             u'provider': u'kata',
    #             u'id': utils.generate_pid(),
    #             u'type': u'version',
    #         })

    if asbool(data_dict.get('private')) and not data_dict.get('persist_schema'):
        context['schema'] = Schemas.private_package_schema()

    data_dict.pop('persist_schema', False)

    if package_data.get('type') == 'harvest':
        context['schema'] = Schemas.harvest_source_update_package_schema()

    pkg_dict1 = ckan.logic.action.update.package_update(context, data_dict)

    # Logging for production use
    _log_action('Package', 'update', context['user'], data_dict['id'])

    context = {'model': model, 'ignore_auth': True, 'validate': False,
               'extras_as_string': True}
    pkg_dict = ckan.logic.action.get.package_show(context, pkg_dict1)
    index = index_for('package')
    # update_dict calls index_package, so it would basically be the same
    index.update_dict(pkg_dict)

    return pkg_dict1
Example 34
def teardown_class(cls):
    model.repo.rebuild_db()
    cls.solr.close()
    search.index_for("Package").clear()
Example 35
def extract(ini_path, res_dict):
    """
    Download resource, extract and store metadata.

    The extracted metadata is stored in the database.

    Note that this task does not check whether the resource exists in the
    database, whether the resource's format is indexed or whether there
    is an existing task working on the resource's metadata. This is the
    responsibility of the caller.

    The task does check which metadata fields are configured to be
    indexed and only stores those in the database.

    Any previously stored metadata for the resource is cleared.
    """
    load_config(ini_path)

    # Get package data before doing any hard work so that we can fail
    # early if the package is private.
    try:
        pkg_dict = toolkit.get_action('package_show')(
            {
                'validate': False
            }, {
                'id': res_dict['package_id']
            })
    except toolkit.NotAuthorized:
        log.debug(('Not extracting resource {} since it belongs to the ' +
                   'private dataset {}.').format(res_dict['id'],
                                                 res_dict['package_id']))
        return

    try:
        metadata = ResourceMetadata.one(resource_id=res_dict['id'])
    except NoResultFound:
        metadata = ResourceMetadata.create(resource_id=res_dict['id'])
    try:
        metadata.last_url = res_dict['url']
        metadata.last_format = res_dict['format']
        metadata.last_extracted = datetime.datetime.now()
        metadata.meta.clear()
        extracted = download_and_extract(res_dict['url'])
        for plugin in PluginImplementations(IExtractorPostprocessor):
            plugin.extractor_after_extract(res_dict, extracted)
        for key, value in extracted.iteritems():
            if is_field_indexed(key):
                metadata.meta[key] = value
    except RequestException as e:
        log.warn('Failed to download resource data from "{}": {}'.format(
            res_dict['url'], e.message))
    finally:
        metadata.task_id = None
        metadata.save()

    for plugin in PluginImplementations(IExtractorPostprocessor):
        plugin.extractor_after_save(res_dict, metadata.as_dict())

    # We need to update the search index for the package here. Note that
    # we cannot rely on the automatic update that happens when a resource
    # is changed, since our extraction task runs asynchronously and may
    # be finished only when the automatic index update has already run.
    index_for('package').update_dict(pkg_dict)

    for plugin in PluginImplementations(IExtractorPostprocessor):
        plugin.extractor_after_index(res_dict, metadata.as_dict())
Example 36
def package_update(context, data_dict):
    '''
    Updates the dataset.

    Extends CKAN's package_update to immediately re-index the dataset in
    Solr; otherwise the changes would only become visible after the next
    rebuild of the search index.

    :type context: dict
    :param context: context
    :type data_dict: dict
    :param data_dict: dataset as dictionary

    :rtype: dictionary
    '''
    # Get all resources here since we get only 'dataset' resources from WUI.
    package_context = {'model': model, 'ignore_auth': True, 'validate': True,
                       'extras_as_string': True}
    package_data = package_show(package_context, data_dict)
    # package_data = ckan.logic.action.get.package_show(package_context, data_dict)

    old_resources = package_data.get('resources', [])

    if 'resources' not in data_dict:
        # When this is reached, we are updating a dataset, not creating a new resource
        data_dict['resources'] = old_resources
        data_dict = utils.dataset_to_resource(data_dict)

    _handle_pids(context, data_dict)

    _add_ida_download_url(context, data_dict)

    # # Check if data version has changed and if so, generate a new version_PID
    # if not data_dict['version'] == temp_pkg_dict['version']:
    #     data_dict['pids'].append(
    #         {
    #             u'provider': u'kata',
    #             u'id': utils.generate_pid(),
    #             u'type': u'version',
    #         })

    # This fixes extras fields being cleared when adding a resource. This is
    # because the extras are not properly cleared in show_package_schema
    # conversions: some fields stay in extras and cause all other fields to be
    # dropped in package_update(). When updating a dataset via the UI or API,
    # the conversion to extras occurs in package_update(), so popping extras
    # here should have no effect.

    data_dict.pop('extras', None)
    # TODO: MIKKO: Get rid of popping extras here and instead pop the
    # additional extras in the converters, so that we could remove this
    # popping and the above "context['allow_partial_update'] = True", which
    # causes the extras to be processed in a way that nothing gets added to
    # extras from the converters and everything not initially present in
    # extras gets removed.

    # TODO: JUHO: Apply correct schema depending on dataset
    # This is quick resolution. More robust way would be to check through
    # model.Package to which harvest source the dataset belongs and then get the
    # type of the harvester (eg. DDI)
    # if data_dict['name'].startswith('FSD'):
    #     context['schema'] = schemas.update_package_schema_ddi()

    if package_data.get('type') == 'harvest':
        context['schema'] = Schemas.harvest_source_update_package_schema()

    pkg_dict1 = ckan.logic.action.update.package_update(context, data_dict)

    # Logging for production use
    _log_action('Package', 'update', context['user'], data_dict['id'])

    context = {'model': model, 'ignore_auth': True, 'validate': False,
               'extras_as_string': True}
    pkg_dict = ckan.logic.action.get.package_show(context, pkg_dict1)
    index = index_for('package')
    # update_dict calls index_package, so it would basically be the same
    index.update_dict(pkg_dict)
    return pkg_dict1