def teardown(self):
    '''Called by nose after every test method in this class.'''
    # Start every test from a clean slate: fresh database, empty and
    # freshly rebuilt search index.
    model.repo.rebuild_db()
    package_index = search.index_for('Package')
    package_index.clear()
    search.rebuild()
def send_task(name, args, **opts):
    """Stand-in for task dispatch: perform the bookkeeping inline."""
    resource = args[1]
    metadata = get_metadata(resource)
    metadata.last_url = resource['url']
    metadata.last_format = resource['format']
    metadata.last_extracted = datetime.datetime.now()
    metadata.task_id = None
    metadata.meta.update(METADATA)
    metadata.save()
    # Refresh the package's search-index entry so the new metadata is visible.
    pkg_dict = helpers.call_action('package_show',
                                   id=resource['package_id'])
    index_for('package').update_dict(pkg_dict)
def test_index_clear(self):
    # Index a single package, then verify that clearing the index
    # removes it from search results.
    dataset = {
        'id': u'penguin-id',
        'title': u'penguin',
        'state': u'active'
    }
    search.dispatch_by_operation('Package', dataset, 'new')
    hits = self.solr.query('title:penguin', fq=self.fq)
    assert len(hits) == 1, len(hits)
    search.index_for('Package').clear()
    hits = self.solr.query('title:penguin', fq=self.fq)
    assert len(hits) == 0
def setup_class(cls):
    '''Run once by nose before any test in this class.'''
    # Load the plugin under test through CKAN's plugin machinery.
    ckan.plugins.load('oaipmh_repository')
    # Start from a clean database and an empty, rebuilt search index.
    model.repo.rebuild_db()
    package_index = search.index_for('Package')
    package_index.clear()
    search.rebuild()
    # Reset the converter registry and install the test converter.
    Converters().converters_dict = {}
    Converters().set_converter(TestOAIDCConverter())
def test_index_clear(self):
    # Index a single package, then verify that clearing the index
    # removes it from search results.
    dataset = {
        'id': u'penguin-id',
        'title': u'penguin',
        'state': u'active',
        'metadata_created': datetime.now().isoformat(),
        'metadata_modified': datetime.now().isoformat(),
    }
    search.dispatch_by_operation('Package', dataset, 'new')
    hits = self.solr.query('title:penguin', fq=self.fq)
    assert len(hits) == 1, len(hits)
    search.index_for('Package').clear()
    hits = self.solr.query('title:penguin', fq=self.fq)
    assert len(hits) == 0
def extract(ini_path, res_dict):
    """
    Download resource, extract and store metadata.

    The extracted metadata is stored in the database.

    Note that this task does not check whether the resource exists in
    the database, whether the resource's format is indexed or whether
    there is an existing task working on the resource's metadata. This
    is the responsibility of the caller.

    The task does check which metadata fields are configured to be
    indexed and only stores those in the database.

    Any previously stored metadata for the resource is cleared.
    """
    load_config(ini_path)
    # Get or create the per-resource metadata row.
    try:
        metadata = ResourceMetadata.one(resource_id=res_dict['id'])
    except NoResultFound:
        metadata = ResourceMetadata.create(resource_id=res_dict['id'])
    try:
        metadata.last_url = res_dict['url']
        metadata.last_format = res_dict['format']
        metadata.last_extracted = datetime.datetime.now()
        # Clear old metadata before extraction so stale fields never survive.
        metadata.meta.clear()
        extracted = download_and_extract(res_dict['url'])
        # Let postprocessor plugins see/modify the raw extraction result.
        for plugin in PluginImplementations(IExtractorPostprocessor):
            plugin.extractor_after_extract(res_dict, extracted)
        # Only store fields that are configured to be indexed.
        # NOTE(review): iteritems() is Python 2 only — confirm target runtime.
        for key, value in extracted.iteritems():
            if is_field_indexed(key):
                metadata.meta[key] = value
    finally:
        # Always release the task slot and persist whatever we have,
        # even if download/extraction failed.
        metadata.task_id = None
        metadata.save()
    for plugin in PluginImplementations(IExtractorPostprocessor):
        plugin.extractor_after_save(res_dict, metadata.as_dict())
    # We need to update the search index for the package here. Note that
    # we cannot rely on the automatic update that happens when a resource
    # is changed, since our extraction task runs asynchronously and may
    # be finished only when the automatic index update has already run.
    pkg_dict = toolkit.get_action('package_show')({}, {
        'id': res_dict['package_id']
    })
    index_for('package').update_dict(pkg_dict)
    for plugin in PluginImplementations(IExtractorPostprocessor):
        plugin.extractor_after_index(res_dict, metadata.as_dict())
def enqueue_job(name, args, **opts):
    """Stand-in for background job enqueueing: run the bookkeeping inline."""
    resource = args[1]
    try:
        metadata = get_metadata(resource)
    except NoResultFound:
        metadata = ResourceMetadata.create(resource_id=resource['id'])
    metadata.last_url = resource['url']
    metadata.last_format = resource['format']
    metadata.last_extracted = datetime.datetime.now()
    metadata.task_id = None
    metadata.meta.update(METADATA)
    metadata.save()
    # Keep the package's search-index entry in sync.
    pkg_dict = helpers.call_action('package_show',
                                   id=resource['package_id'])
    index_for('package').update_dict(pkg_dict)
    # Callers expect a job-like object exposing an ``id`` attribute.
    return mock.Mock(id=None)
def test_search_geographies(self, app):
    # Start with an empty, freshly rebuilt index.
    search.index_for(model.Package).clear()
    search.rebuild()
    # Every geography should be findable by both its GIS name and pcode.
    terms = [value
             for geo in self.geogs.values()
             for value in (geo.gis_name, geo.pcode)]
    data_dicts = [{'q': term} for term in terms]
    context = {'ignore_auth': True}
    # Terms belonging to the second GIS geography should resolve to the
    # second dataset; everything else resolves to the first.
    from_gis2 = [
        self.unrelated['20DEU010004'].pcode,
        self.unrelated['20DEU010004'].gis_name
    ]
    for data_dict in data_dicts:
        packages = toolkit.get_action('package_search')(context, data_dict)
        if data_dict['q'] in from_gis2:
            should_be = self.gis_dataset2['id']
        else:
            should_be = self.gis_dataset1['id']
        assert should_be in [result['id'] for result in packages['results']]
def update(ini_path, resource_dict):
    """
    (Re)index a single resource's records in Solr.

    Skips private datasets, replaces any previously indexed records for
    the resource, and finally refreshes the package's search-index entry.

    NOTE(review): ``ini_path`` is not used in this body — presumably the
    config is loaded elsewhere; confirm before removing the parameter.
    """
    # Fail early (and silently) if the package is private: package_show
    # raises NotAuthorized for private datasets under this context.
    try:
        package_dict = toolkit.get_action('package_show')(
            {
                'validate': False
            },
            {
                'id': resource_dict['package_id']
            })
    except toolkit.NotAuthorized:
        log.debug(('Not indexing resource {} since it belongs to the ' +
                   'private dataset {}.').format(resource_dict['id'],
                                                 resource_dict['package_id']))
        return
    # Prefer the package's own copy of the resource dict (it may be
    # fresher than the one passed in by the caller).
    for resource in package_dict.get('resources', []):
        if resource['id'] == resource_dict['id']:
            resource_dict = resource
            break
    # Get or create the per-resource indexing bookkeeping row.
    try:
        index_info = ResourceIndexInfo.one(resource_id=resource_dict['id'])
    except NoResultFound:
        index_info = ResourceIndexInfo.create(resource_id=resource_dict['id'])
    try:
        solr = Solr('record')
        # Drop previously indexed records for this resource before re-adding.
        solr.delete(resource_dict['id'])
        index_info.indexed = datetime.datetime.now()
        base_record = create_base_record(package_dict, resource_dict)
        if resource_dict['format'] == 'GeoJSON':
            records = geojson_to_records(resource_dict['url'], base_record)
        else:
            records = table_to_records(resource_dict['url'], base_record)
        solr.store(records)
    except RequestException as e:
        # Download failures are logged, not raised: indexing is best-effort.
        # NOTE(review): ``e.message`` is Python 2 only — confirm runtime.
        log.warn('Failed to download resource data from "{}": {}'.format(
            resource_dict['url'], e.message))
    finally:
        # Always release the task slot and persist the bookkeeping row.
        index_info.task_id = None
        index_info.save()
    index_for('package').update_dict(package_dict)
def package_create(context, data_dict):
    """Create a dataset and index it in Solr immediately."""
    created = ckan.logic.action.create.package_create(context, data_dict)
    # Re-read the full package without validation so the indexed document
    # contains everything, then push it into the search index.
    show_context = {'model': model, 'ignore_auth': True,
                    'validate': False, 'extras_as_string': False}
    full_pkg = ckan.logic.action.get.package_show(show_context, created)
    index = index_for('package')
    index.index_package(full_pkg)
    return created
def package_update(context, data_dict): ''' Updates the dataset. Extends ckan's similar method to instantly re-index the SOLR index. Otherwise the changes would only be added during a re-index (a rebuild of search index, to be specific). :type context: dict :param context: context :type data_dict: dict :param data_dict: dataset as dictionary :rtype: dictionary ''' # Get all resources here since we get only 'dataset' resources from WUI. package_context = {'model': model, 'ignore_auth': True, 'validate': True, 'extras_as_string': True} user = model.User.get(context['user']) if not user.name == "harvest": _remove_extras_from_data_dict(data_dict) package_data = package_show(package_context, data_dict) if not 'resources' in data_dict: # When this is reached, we are updating a dataset, not creating a new resource old_resources = package_data.get('resources', []) data_dict['resources'] = old_resources data_dict = utils.dataset_to_resource(data_dict) else: data_dict['accept-terms'] = 'yes' # This is not needed when adding a resource _handle_pids(data_dict) _add_ida_download_url(data_dict) if asbool(data_dict.get('private')) and not data_dict.get('persist_schema'): context['schema'] = Schemas.private_package_schema() data_dict.pop('persist_schema', False) if package_data.get('type') == 'harvest': context['schema'] = Schemas.harvest_source_update_package_schema() pkg_dict1 = ckan.logic.action.update.package_update(context, data_dict) # Logging for production use _log_action('Package', 'update', context['user'], data_dict['id']) context = {'model': model, 'ignore_auth': True, 'validate': False, 'extras_as_string': True} pkg_dict = ckan.logic.action.get.package_show(context, pkg_dict1) index = index_for('package') # update_dict calls index_package, so it would basically be the same index.update_dict(pkg_dict) return pkg_dict1
def test_index_clear(self):
    # Index one dataset, clear the index, and verify it is gone.
    # Clearing an already-empty index must also succeed.
    dataset = {
        "id": u"penguin-id",
        "title": u"penguin",
        "state": u"active",
        "private": False,
        "owner_org": None,
        "metadata_created": datetime.now().isoformat(),
        "metadata_modified": datetime.now().isoformat(),
    }
    search.dispatch_by_operation("Package", dataset, "new")
    hits = self.solr.query("title:penguin", fq=self.fq)
    assert len(hits) == 1, len(hits)
    search.index_for("Package").clear()
    hits = self.solr.query("title:penguin", fq=self.fq)
    assert len(hits) == 0
    # clear whilst empty
    search.index_for("Package").clear()
    hits = self.solr.query("title:penguin", fq=self.fq)
    assert len(hits) == 0
def extract(ini_path, res_dict):
    """
    Download resource, extract and store metadata.

    The extracted metadata is stored in the database.

    Note that this task does not check whether the resource exists in
    the database, whether the resource's format is indexed or whether
    there is an existing task working on the resource's metadata. This
    is the responsibility of the caller.

    The task does check which metadata fields are configured to be
    indexed and only stores those in the database.

    Any previously stored metadata for the resource is cleared.
    """
    load_config(ini_path)
    # Get or create the per-resource metadata row.
    try:
        metadata = ResourceMetadata.one(resource_id=res_dict['id'])
    except NoResultFound:
        metadata = ResourceMetadata.create(resource_id=res_dict['id'])
    try:
        metadata.last_url = res_dict['url']
        metadata.last_format = res_dict['format']
        metadata.last_extracted = datetime.datetime.now()
        # Clear old metadata before extraction so stale fields never survive.
        metadata.meta.clear()
        extracted = download_and_extract(res_dict['url'])
        # Only store fields that are configured to be indexed.
        # NOTE(review): iteritems() is Python 2 only — confirm target runtime.
        for key, value in extracted.iteritems():
            if is_field_indexed(key):
                metadata.meta[key] = value
    finally:
        # Always release the task slot and persist whatever we have,
        # even if download/extraction failed.
        metadata.task_id = None
        metadata.save()
    # We need to update the search index for the package here. Note that
    # we cannot rely on the automatic update that happens when a resource
    # is changed, since our extraction task runs asynchronously and may
    # be finished only when the automatic index update has already run.
    pkg_dict = toolkit.get_action('package_show')(
        {}, {'id': res_dict['package_id']})
    index_for('package').update_dict(pkg_dict)
def setup_class(cls):
    ''' Set up test class '''
    super(TestSearchDataset, cls).setup_class()
    # Start with an empty package index.
    search.index_for(model.Package).clear()
    # Create one public dataset shared by all tests in this class.
    data = copy.deepcopy(cls.TEST_DATADICT)
    created = cls.api_user_sysadmin.call_action('package_create',
                                                data_dict=data)
    cls.package_id = created.get('id')
def package_create(context, data_dict):
    """
    Creates a new dataset.

    Extends ckan's similar method to instantly reindex the SOLR index,
    so that this newly added package emerges in search results instantly
    instead of during the next timed reindexing.

    :param context: context
    :param data_dict: data dictionary (package data)

    :rtype: dictionary
    """
    user = model.User.get(context['user'])
    # Only sysadmins may create harvest sources.
    if data_dict.get('type') == 'harvest' and not user.sysadmin:
        ckan.lib.base.abort(401, _('Unauthorized to add a harvest source'))
    # The harvest user's extras are kept as-is; everyone else's are stripped.
    if not user.name == "harvest":
        _remove_extras_from_data_dict(data_dict)
    data_dict = utils.dataset_to_resource(data_dict)
    if not user.name == 'harvest':
        _handle_package_id_on_create(data_dict)
    _handle_pids(data_dict)
    _add_ida_download_url(data_dict)
    # Private datasets use a relaxed schema unless told to keep the current one.
    if asbool(data_dict.get('private')) and not data_dict.get('persist_schema'):
        context['schema'] = Schemas.private_package_schema()
    data_dict.pop('persist_schema', False)
    if data_dict.get('type') == 'harvest':
        context['schema'] = Schemas.harvest_source_create_package_schema()
    pkg_dict1 = ckan.logic.action.create.package_create(context, data_dict)
    # Logging for production use
    _log_action('Package', 'create', context['user'], pkg_dict1['id'])
    # Re-read the full package without validation and push it into Solr.
    context = {'model': model, 'ignore_auth': True, 'validate': False,
               'extras_as_string': False}
    pkg_dict = ckan.logic.action.get.package_show(context, pkg_dict1)
    index = index_for('package')
    index.index_package(pkg_dict)
    return pkg_dict1
def package_create(context, data_dict):
    """
    Creates a new dataset.

    Extends ckan's similar method to instantly reindex the SOLR index,
    so that this newly added package emerges in search results instantly
    instead of during the next timed reindexing.

    :param context: context
    :param data_dict: data dictionary (package data)

    :rtype: dictionary
    """
    user = model.User.get(context['user'])
    # Only sysadmins may create harvest sources. Use .get() rather than
    # data_dict['type'] so a missing 'type' key is simply "not a harvest
    # source" instead of a swallowed KeyError that silently skipped the
    # authorization check. This matches the sibling package_create
    # implementations in this file.
    if data_dict.get('type') == 'harvest' and not user.sysadmin:
        ckan.lib.base.abort(401, _('Unauthorized to add a harvest source'))
    data_dict = utils.dataset_to_resource(data_dict)
    _handle_pids(context, data_dict)
    _add_ida_download_url(context, data_dict)
    if data_dict.get('type') == 'harvest':
        context['schema'] = Schemas.harvest_source_create_package_schema()
    pkg_dict1 = ckan.logic.action.create.package_create(context, data_dict)
    # Logging for production use
    _log_action('Package', 'create', context['user'], pkg_dict1['id'])
    # Re-read the full package without validation and push it into Solr.
    context = {'model': model, 'ignore_auth': True, 'validate': False,
               'extras_as_string': False}
    pkg_dict = ckan.logic.action.get.package_show(context, pkg_dict1)
    index = index_for('package')
    index.index_package(pkg_dict)
    return pkg_dict1
def package_create(context, data_dict):
    """
    Creates a new dataset.

    Extends ckan's similar method to instantly reindex the SOLR index,
    so that this newly added package emerges in search results instantly
    instead of during the next timed reindexing.

    :param context: context
    :param data_dict: data dictionary (package data)

    :rtype: dictionary
    """
    user = model.User.get(context['user'])
    # Only sysadmins may create harvest sources.
    if data_dict.get('type') == 'harvest' and not user.sysadmin:
        ckan.lib.base.abort(401, _('Unauthorized to add a harvest source'))
    data_dict = utils.dataset_to_resource(data_dict)
    _handle_pids(context, data_dict)
    _add_ida_download_url(context, data_dict)
    # Private datasets use a relaxed schema unless told to keep the current one.
    if asbool(data_dict.get('private')) and not data_dict.get('persist_schema'):
        context['schema'] = Schemas.private_package_schema()
    data_dict.pop('persist_schema', False)
    if data_dict.get('type') == 'harvest':
        context['schema'] = Schemas.harvest_source_create_package_schema()
    pkg_dict1 = ckan.logic.action.create.package_create(context, data_dict)
    # Logging for production use
    _log_action('Package', 'create', context['user'], pkg_dict1['id'])
    # Re-read the full package without validation and push it into Solr.
    context = {'model': model, 'ignore_auth': True, 'validate': False,
               'extras_as_string': False}
    pkg_dict = ckan.logic.action.get.package_show(context, pkg_dict1)
    index = index_for('package')
    index.index_package(pkg_dict)
    return pkg_dict1
def test_search_index_rebuild_sysadmin(self, app):
    # A sysadmin can repopulate a cleared search index via the admin page.
    user = core_factories.Sysadmin()
    query = {'q': '*:*', 'rows': 0}
    context = {'ignore_auth': True}
    # Create a dataset, then wipe the whole package index.
    factories.Dataset()
    search.index_for(model.Package).clear()
    # With the index empty, package_search reports zero datasets.
    results = toolkit.get_action('package_search')(context, query)
    assert results['count'] == 0
    # Trigger a rebuild through the admin endpoint.
    env = {'REMOTE_USER': user['name'].encode('ascii')}
    app.post('/ckan-admin/search_index/rebuild', extra_environ=env,
             status=200)
    # The dataset is searchable again.
    results = toolkit.get_action('package_search')(context, query)
    assert results['count'] == 1
def package_delete(context, data_dict):
    '''
    Delete a package and drop it from the Solr index right away.

    Extends ckan's similar method so the removal is visible immediately
    instead of only after the next full index rebuild.

    :param context: context
    :type context: dictionary
    :param data_dict: package data
    :type data_dict: dictionary
    '''
    # Logging for production use
    _log_action('Package', 'delete', context['user'], data_dict['id'])
    result = ckan.logic.action.delete.package_delete(context, data_dict)
    index_for('package').remove_dict(data_dict)
    return result
def before_commit(self, session):
    """Reindex packages whose PackageMarsavin rows changed in this session."""
    if not hasattr(session, '_object_cache'):
        return
    changed = session._object_cache["changed"]
    context = {"model": ckan_model}
    show_package = toolkit.get_action('package_show')
    package_index = index_for(ckan_model.Package)
    for obj in set(changed):
        if not isinstance(obj, PackageMarsavin):
            continue
        log.debug(
            "Changed Object: {the_object}".format(the_object=obj))
        pkg_dict = show_package(context, {
            'id': obj.package_id
        })
        # since we have an update on our secondary table, we want to send
        # this updated data to the search index
        log.info('Indexing just package %r...', pkg_dict['name'])
        package_index.remove_dict(pkg_dict)
        package_index.insert_dict(pkg_dict)
def search_index_update(context, data_dict):
    '''
    Tells CKAN to update its search index for a given package.

    This is needed because the QA value (and archiver is_broken) is
    added to the search index by other extensions (like ckanext-dgu).

    TODO: Probably better to create a notification that another
    extension (like ckanext-dgu) can trigger it itself.
    '''
    model = context['model']
    p.toolkit.check_access('search_index_update', context, data_dict)
    show_context = {'model': model, 'ignore_auth': True,
                    'validate': False, 'use_cache': False}
    pkg_dict = p.toolkit.get_action('package_show')(show_context, data_dict)
    indexer = index_for('package')
    indexer.update_dict(pkg_dict)
    log.info('Search index updated for: %s', pkg_dict['name'])
def setup_class(cls):
    # Shared fixtures for the search-index command tests.
    cls.search = SearchIndexCommand('search-index')
    cls.query = query_for(model.Package)
    cls.index = index_for(model.Package)
    CreateTestData.create()
def teardown_class(cls):
    # Leave a clean database and an empty package index behind.
    model.repo.rebuild_db()
    package_index = search.index_for('Package')
    package_index.clear()
def teardown(self):
    # Wipe the package index after every test so tests stay independent.
    package_index = search.index_for('Package')
    package_index.clear()
def teardown_class(cls):
    # Rebuild the database, release the Solr connection and leave an
    # empty package index behind.
    model.repo.rebuild_db()
    cls.solr.close()
    package_index = search.index_for('Package')
    package_index.clear()
def initial_data(self, clean_db):
    # Shared fixtures for the search-index command tests.
    self.search = SearchIndexCommand("search-index")
    self.query = query_for(model.Package)
    self.index = index_for(model.Package)
    CreateTestData.create()
def package_update(context, data_dict):
    '''
    Updates the dataset.

    Extends ckan's similar method to instantly re-index the SOLR index.
    Otherwise the changes would only be added during a re-index (a
    rebuild of search index, to be specific).

    :type context: dict
    :param context: context
    :type data_dict: dict
    :param data_dict: dataset as dictionary

    :rtype: dictionary
    '''
    # Get all resources here since we get only 'dataset' resources from WUI.
    package_context = {'model': model, 'ignore_auth': True, 'validate': True,
                       'extras_as_string': True}
    package_data = package_show(package_context, data_dict)
    # package_data = ckan.logic.action.get.package_show(package_context, data_dict)
    old_resources = package_data.get('resources', [])
    if not 'resources' in data_dict:
        # When this is reached, we are updating a dataset, not creating a new resource
        data_dict['resources'] = old_resources
        data_dict = utils.dataset_to_resource(data_dict)
    else:
        data_dict['accept-terms'] = 'yes'  # This is not needed when adding a resource
    _handle_pids(context, data_dict)
    _add_ida_download_url(context, data_dict)
    # # Check if data version has changed and if so, generate a new version_PID
    # if not data_dict['version'] == temp_pkg_dict['version']:
    #     data_dict['pids'].append(
    #         {
    #             u'provider': u'kata',
    #             u'id': utils.generate_pid(),
    #             u'type': u'version',
    #         })
    # Private datasets use a relaxed schema unless told to keep the current one.
    if asbool(data_dict.get('private')) and not data_dict.get('persist_schema'):
        context['schema'] = Schemas.private_package_schema()
    data_dict.pop('persist_schema', False)
    if package_data.get('type') == 'harvest':
        context['schema'] = Schemas.harvest_source_update_package_schema()
    pkg_dict1 = ckan.logic.action.update.package_update(context, data_dict)
    # Logging for production use
    _log_action('Package', 'update', context['user'], data_dict['id'])
    # Re-read the full package without validation and push it into Solr.
    context = {'model': model, 'ignore_auth': True, 'validate': False,
               'extras_as_string': True}
    pkg_dict = ckan.logic.action.get.package_show(context, pkg_dict1)
    index = index_for('package')
    # update_dict calls index_package, so it would basically be the same
    index.update_dict(pkg_dict)
    return pkg_dict1
def teardown_class(cls):
    # Rebuild the database, release the Solr connection and leave an
    # empty package index behind.
    model.repo.rebuild_db()
    cls.solr.close()
    package_index = search.index_for("Package")
    package_index.clear()
def extract(ini_path, res_dict):
    """
    Download resource, extract and store metadata.

    The extracted metadata is stored in the database.

    Note that this task does not check whether the resource exists in
    the database, whether the resource's format is indexed or whether
    there is an existing task working on the resource's metadata. This
    is the responsibility of the caller.

    The task does check which metadata fields are configured to be
    indexed and only stores those in the database.

    Any previously stored metadata for the resource is cleared.
    """
    load_config(ini_path)
    # Get package data before doing any hard work so that we can fail
    # early if the package is private.
    try:
        pkg_dict = toolkit.get_action('package_show')(
            {
                'validate': False
            },
            {
                'id': res_dict['package_id']
            })
    except toolkit.NotAuthorized:
        log.debug(('Not extracting resource {} since it belongs to the ' +
                   'private dataset {}.').format(res_dict['id'],
                                                 res_dict['package_id']))
        return
    # Get or create the per-resource metadata row.
    try:
        metadata = ResourceMetadata.one(resource_id=res_dict['id'])
    except NoResultFound:
        metadata = ResourceMetadata.create(resource_id=res_dict['id'])
    try:
        metadata.last_url = res_dict['url']
        metadata.last_format = res_dict['format']
        metadata.last_extracted = datetime.datetime.now()
        # Clear old metadata before extraction so stale fields never survive.
        metadata.meta.clear()
        extracted = download_and_extract(res_dict['url'])
        # Let postprocessor plugins see/modify the raw extraction result.
        for plugin in PluginImplementations(IExtractorPostprocessor):
            plugin.extractor_after_extract(res_dict, extracted)
        # Only store fields that are configured to be indexed.
        # NOTE(review): iteritems() is Python 2 only — confirm target runtime.
        for key, value in extracted.iteritems():
            if is_field_indexed(key):
                metadata.meta[key] = value
    except RequestException as e:
        # Download failures are logged, not raised: extraction is best-effort.
        # NOTE(review): ``e.message`` is Python 2 only — confirm runtime.
        log.warn('Failed to download resource data from "{}": {}'.format(
            res_dict['url'], e.message))
    finally:
        # Always release the task slot and persist whatever we have.
        metadata.task_id = None
        metadata.save()
    for plugin in PluginImplementations(IExtractorPostprocessor):
        plugin.extractor_after_save(res_dict, metadata.as_dict())
    # We need to update the search index for the package here. Note that
    # we cannot rely on the automatic update that happens when a resource
    # is changed, since our extraction task runs asynchronously and may
    # be finished only when the automatic index update has already run.
    index_for('package').update_dict(pkg_dict)
    for plugin in PluginImplementations(IExtractorPostprocessor):
        plugin.extractor_after_index(res_dict, metadata.as_dict())
def package_update(context, data_dict): ''' Updates the dataset. Extends ckan's similar method to instantly re-index the SOLR index. Otherwise the changes would only be added during a re-index (a rebuild of search index, to be specific). :type context: dict :param context: context :type data_dict: dict :param data_dict: dataset as dictionary :rtype: dictionary ''' # Get all resources here since we get only 'dataset' resources from WUI. package_context = {'model': model, 'ignore_auth': True, 'validate': True, 'extras_as_string': True} package_data = package_show(package_context, data_dict) # package_data = ckan.logic.action.get.package_show(package_context, data_dict) old_resources = package_data.get('resources', []) if not 'resources' in data_dict: # When this is reached, we are updating a dataset, not creating a new resource data_dict['resources'] = old_resources data_dict = utils.dataset_to_resource(data_dict) _handle_pids(context, data_dict) _add_ida_download_url(context, data_dict) # # Check if data version has changed and if so, generate a new version_PID # if not data_dict['version'] == temp_pkg_dict['version']: # data_dict['pids'].append( # { # u'provider': u'kata', # u'id': utils.generate_pid(), # u'type': u'version', # }) # This fixes extras fields being cleared when adding a resource. This is be because the extras are not properly # cleared in show_package_schema conversions. Some fields stay in extras and they cause all other fields to be # dropped in package_update(). When updating a dataset via UI or API, the conversion to extras occur in # package_update() and popping extras here should have no effect. 
data_dict.pop('extras', None) # TODO: MIKKO: Get rid of popping extras here and rather pop the additional extras in converters so we could remove the # popping and the above "context['allow_partial_update'] = True" which causes the extras to be processed in a way # that nothing gets added to extras from the converters and everything not initially present in extras gets removed. # TODO: JUHO: Apply correct schema depending on dataset # This is quick resolution. More robust way would be to check through # model.Package to which harvest source the dataset belongs and then get the # type of the harvester (eg. DDI) # if data_dict['name'].startswith('FSD'): # context['schema'] = schemas.update_package_schema_ddi() if package_data.get('type') == 'harvest': context['schema'] = Schemas.harvest_source_update_package_schema() pkg_dict1 = ckan.logic.action.update.package_update(context, data_dict) # Logging for production use _log_action('Package', 'update', context['user'], data_dict['id']) context = {'model': model, 'ignore_auth': True, 'validate': False, 'extras_as_string': True} pkg_dict = ckan.logic.action.get.package_show(context, pkg_dict1) index = index_for('package') # update_dict calls index_package, so it would basically be the same index.update_dict(pkg_dict) return pkg_dict1