def _bulk_update_dataset(context, data_dict, update_dict):
    '''Bulk update shared code for organizations.

    Applies ``update_dict`` to every package whose id is listed in
    ``data_dict['datasets']`` and whose ``owner_org`` equals
    ``data_dict['org_id']``, commits the session, then patches the matching
    documents stored in Solr and re-indexes them in batches.

    :param context: action context; ``context['model']`` is used for
        database access
    :param data_dict: may contain ``datasets`` (package ids, defaults to an
        empty list) and ``org_id``
    :param update_dict: column values to set on the matching packages; the
        same keys are merged into each re-indexed package dict
    '''
    datasets = data_dict.get('datasets', [])
    org_id = data_dict.get('org_id')

    model = context['model']
    # A single UPDATE against the package table; only rows that belong to
    # the organization are touched.
    model.Session.query(model.package_table) \
        .filter(model.Package.id.in_(datasets)) \
        .filter(model.Package.owner_org == org_id) \
        .update(update_dict, synchronize_session=False)
    model.Session.commit()

    # solr update here
    psi = search.PackageSearchIndex()

    # update the solr index in batches
    BATCH_SIZE = 50

    def process_solr(solr_q):
        # update the solr index for the query
        query = search.PackageSearchQuery()
        params = {
            'q': solr_q,
            'fl': 'data_dict',
            'wt': 'json',
            'fq': 'site_id:"%s"' % config.get('ckan.site_id'),
            'rows': BATCH_SIZE
        }

        for result in query.run(params)['results']:
            pkg_dict = json.loads(result['data_dict'])
            # the index may hold packages of other organizations for the
            # requested ids; leave those untouched
            if pkg_dict['owner_org'] == org_id:
                pkg_dict.update(update_dict)
                # commit once at the end instead of once per package
                psi.index_package(pkg_dict, defer_commit=True)

    clauses = []
    for dataset_id in datasets:
        clauses.append('id:"%s"' % (dataset_id))
        if len(clauses) == BATCH_SIZE:
            process_solr(' OR '.join(clauses))
            clauses = []
    if clauses:
        process_solr(' OR '.join(clauses))

    # finally commit the changes
    psi.commit()
def set_resource_metadata(update_dict):
    '''
    Set appropriate datastore_active flag on CKAN resource.

    Called after creation or deletion of DataStore table.

    ``update_dict`` is modified in place: ``datastore_active`` and
    ``datastore_contains_all_records_of_source_file`` are each defaulted to
    ``True`` when absent. Every key of ``update_dict`` (``resource_id``
    included) is then merged into the resource's ``extras`` in the database
    and into the matching resource of the package dict stored in Solr,
    which is re-indexed.

    :param update_dict: must contain ``resource_id``; the remaining keys
        are the metadata values to store
    :raises KeyError: if ``resource_id`` is missing from ``update_dict``
    '''
    from ckan import model
    # We're modifying the resource extra directly here to avoid a
    # race condition, see issue #3245 for details and plan for a
    # better fix
    update_dict.update({
        'datastore_active': update_dict.get('datastore_active', True),
        'datastore_contains_all_records_of_source_file':
            update_dict.get(
                'datastore_contains_all_records_of_source_file', True)
    })

    # get extras(for entity update) and package_id(for search index update)
    res_query = model.Session.query(
        model.resource_table.c.extras,
        model.resource_table.c.package_id
    ).filter(
        model.Resource.id == update_dict['resource_id']
    )
    # .one() raises unless exactly one resource row matches
    extras, package_id = res_query.one()

    # update extras in database for record and its revision
    extras.update(update_dict)
    res_query.update({'extras': extras}, synchronize_session=False)
    # the revision table is only present on some model versions
    if hasattr(model, 'resource_revision_table'):
        model.Session.query(model.resource_revision_table).filter(
            model.ResourceRevision.id == update_dict['resource_id'],
            # Build a SQL "IS true" test. A Python ``is True`` comparison
            # would be evaluated immediately to the constant False and
            # never reach the database as a column condition.
            model.ResourceRevision.current.is_(True)
        ).update({'extras': extras}, synchronize_session=False)
    model.Session.commit()

    # get package with updated resource from solr
    # find changed resource, patch it and reindex package
    psi = search.PackageSearchIndex()
    solr_query = search.PackageSearchQuery()
    q = {
        'q': 'id:"{0}"'.format(package_id),
        'fl': 'data_dict',
        'wt': 'json',
        'fq': 'site_id:"%s"' % config.get('ckan.site_id'),
        'rows': 1
    }
    for record in solr_query.run(q)['results']:
        solr_data_dict = json.loads(record['data_dict'])
        for resource in solr_data_dict['resources']:
            if resource['id'] == update_dict['resource_id']:
                resource.update(update_dict)
                psi.index_package(solr_data_dict)
                break
def setup_class(cls):
    """Attach the shared Solr fixtures to the test class.

    Sets ``solr_client``, the site-id filter query ``fq``, a
    ``PackageSearchIndex`` as ``package_index`` and a minimal
    ``base_package_dict``.
    """
    connection = search.make_connection()
    cls.solr_client = connection

    site_id = config["ckan.site_id"]
    cls.fq = ' +site_id:"%s" ' % site_id

    cls.package_index = search.PackageSearchIndex()

    # Two separate clock reads, so the timestamps may differ slightly.
    created = datetime.datetime.now().isoformat()
    modified = datetime.datetime.now().isoformat()
    package = {
        "id": "test-index",
        "name": "monkey",
        "title": "Monkey",
        "state": "active",
        "private": False,
        "type": "dataset",
        "owner_org": None,
    }
    package["metadata_created"] = created
    package["metadata_modified"] = modified
    cls.base_package_dict = package
def update_solr_package_indexes(package_dict):
    '''Re-index one package in Solr from the ``data_dict`` stored there.

    Looks the package up in Solr by ``package_dict['id']`` and re-indexes
    each returned document whose ``owner_org`` equals
    ``package_dict['owner_org']``, then commits the index. When
    ``package_dict`` is falsy a warning is logged and nothing is indexed.

    :param package_dict: package dict providing ``id`` and ``owner_org``
    '''
    if not package_dict:
        log.warning(
            "::: package_dict is None: SOLR INDEX CANNOT BE UPDATED! :::")
        return

    log.debug("::: UPDATING SOLR INDEX :::")

    # solr update here
    psi = search.PackageSearchIndex()

    # upper bound on the number of documents requested from Solr
    MAX_ROWS = 50

    query = search.PackageSearchQuery()
    params = {
        'q': 'id:"%s"' % (package_dict.get('id')),
        'fl': 'data_dict',
        'wt': 'json',
        'fq': 'site_id:"%s"' % config.get('ckan.site_id'),
        'rows': MAX_ROWS
    }

    owner_org = package_dict.get('owner_org')
    for result in query.run(params)['results']:
        data_dict = json.loads(result['data_dict'])
        if data_dict['owner_org'] == owner_org:
            # commit once after the loop instead of once per document
            psi.index_package(data_dict, defer_commit=True)

    # finally commit the changes
    psi.commit()
def set_datastore_active_flag(model, data_dict, flag):
    '''
    Set appropriate datastore_active flag on CKAN resource.

    Called after creation or deletion of DataStore table.

    The flag is written straight into the resource's ``extras`` (and those
    of its current revision), the session is committed, and the package
    dict stored in Solr is patched and re-indexed.

    :param model: object exposing ``Session``, ``resource_table``,
        ``resource_revision_table``, ``Resource`` and ``ResourceRevision``
    :param data_dict: must contain ``resource_id``
    :param flag: value to store under ``datastore_active``
    :raises KeyError: if ``resource_id`` is missing from ``data_dict``
    '''
    update_dict = {'datastore_active': flag}

    # get extras(for entity update) and package_id(for search index update)
    res_query = model.Session.query(
        model.resource_table.c.extras,
        model.resource_table.c.package_id
    ).filter(
        model.Resource.id == data_dict['resource_id']
    )
    # .one() raises unless exactly one resource row matches
    extras, package_id = res_query.one()

    # update extras in database for record and its revision
    extras.update(update_dict)
    res_query.update({'extras': extras}, synchronize_session=False)
    model.Session.query(model.resource_revision_table).filter(
        model.ResourceRevision.id == data_dict['resource_id'],
        # Build a SQL "IS true" test. A Python ``is True`` comparison
        # would be evaluated immediately to the constant False and never
        # reach the database as a column condition.
        model.ResourceRevision.current.is_(True)
    ).update({'extras': extras}, synchronize_session=False)

    model.Session.commit()

    # get package with updated resource from solr
    # find changed resource, patch it and reindex package
    psi = search.PackageSearchIndex()
    solr_query = search.PackageSearchQuery()
    q = {
        'q': 'id:"{0}"'.format(package_id),
        'fl': 'data_dict',
        'wt': 'json',
        'fq': 'site_id:"%s"' % config.get('ckan.site_id'),
        'rows': 1
    }
    for record in solr_query.run(q)['results']:
        solr_data_dict = json.loads(record['data_dict'])
        for resource in solr_data_dict['resources']:
            if resource['id'] == data_dict['resource_id']:
                resource.update(update_dict)
                psi.index_package(solr_data_dict)
                break
def set_datastore_active_flag(
        context: Context, data_dict: dict[str, Any], flag: bool):
    '''
    Store the ``datastore_active`` flag on a CKAN resource.

    Called after creation or deletion of DataStore table. The flag is
    merged into the resource's ``extras`` in the database, the session is
    committed, and the owning package (read through ``package_show``) is
    re-indexed with the patched resource. ``NotAuthorized`` / ``NotFound``
    raised while doing so are logged rather than propagated.
    '''
    # The resource extra is written directly to avoid a race condition;
    # see issue #3245 for details and the plan for a better fix.
    model = context['model']
    flag_patch = {'datastore_active': flag}

    # extras are needed for the entity update, package_id for the
    # search index update
    resource_table = model.resource_table
    res_query = model.Session.query(
        resource_table.c.extras, resource_table.c.package_id)
    res_query = res_query.filter(
        model.Resource.id == data_dict['resource_id'])
    extras, package_id = res_query.one()

    # persist the patched extras for the record
    extras.update(flag_patch)
    res_query.update({'extras': extras}, synchronize_session=False)
    model.Session.commit()

    # fetch the package via package_show, patch the changed resource
    # and reindex the package
    psi = search.PackageSearchIndex()
    try:
        package_show = p.toolkit.get_action('package_show')
        pkg_dict = package_show(context, {'id': package_id})
        target = next(
            (res for res in pkg_dict['resources']
             if res['id'] == data_dict['resource_id']),
            None)
        if target is not None:
            target.update(flag_patch)
            psi.index_package(pkg_dict)
    except (logic.NotAuthorized, logic.NotFound) as e:
        log.error(e.message)
def setup_class(cls):
    '''Attach the shared Solr fixtures to the test class.

    Skips the tests when ``search.is_available()`` reports that Solr
    cannot be reached. Otherwise sets ``solr_client``, the site-id filter
    query ``fq``, a ``PackageSearchIndex`` as ``package_index`` and a
    minimal ``base_package_dict``.
    '''
    solr_up = search.is_available()
    if not solr_up:
        raise nose.SkipTest('Solr not reachable')

    connection = search.make_connection()
    cls.solr_client = connection

    site_id = config['ckan.site_id']
    cls.fq = " +site_id:\"%s\" " % site_id

    cls.package_index = search.PackageSearchIndex()

    # Two separate clock reads, so the timestamps may differ slightly.
    created = datetime.datetime.now().isoformat()
    modified = datetime.datetime.now().isoformat()
    package = {
        'id': 'test-index',
        'name': 'monkey',
        'title': 'Monkey',
        'state': 'active',
        'private': False,
        'type': 'dataset',
        'owner_org': None,
    }
    package['metadata_created'] = created
    package['metadata_modified'] = modified
    cls.base_package_dict = package
if existing_tag: TagMultilang.persist( { 'id': existing_tag.id, 'name': tag_name, 'text': tag_localized_name }, tag_lang) log.info( '::::::::: OBJECT TAG PERSISTED SUCCESSFULLY :::::::::' ) # Updating Solr Index if package_dict: log.info("::: UPDATING SOLR INDEX :::") # solr update here psi = search.PackageSearchIndex() # update the solr index in batches BATCH_SIZE = 50 def process_solr(q): # update the solr index for the query query = search.PackageSearchQuery() q = { 'q': q, 'fl': 'data_dict', 'wt': 'json', 'fq': 'site_id:"%s"' % config.get('ckan.site_id'), 'rows': BATCH_SIZE }
def datastore_create(context, data_dict):
    '''Adds a new table to the DataStore.

    The datastore_create action allows you to post JSON data to be
    stored against a resource. This endpoint also supports altering tables,
    aliases and indexes and bulk insertion. This endpoint can be called
    multiple times to initially insert more data, add fields, change the
    aliases or indexes as well as the primary keys.

    To create an empty datastore resource and a CKAN resource at the same
    time, provide ``resource`` with a valid ``package_id`` and omit the
    ``resource_id``.

    If you want to create a datastore resource from the content of a file,
    provide ``resource`` with a valid ``url``.

    See :ref:`fields` and :ref:`records` for details on how to lay out
    records.

    :param resource_id: resource id that the data is going to be stored
        against.
    :type resource_id: string
    :param force: set to True to edit a read-only resource
    :type force: bool (optional, default: False)
    :param resource: resource dictionary that is passed to
        :meth:`~ckan.logic.action.create.resource_create`.
        Use instead of ``resource_id`` (optional)
    :type resource: dictionary
    :param aliases: names for read only aliases of the resource. (optional)
    :type aliases: list or comma separated string
    :param fields: fields/columns and their extra metadata. (optional)
    :type fields: list of dictionaries
    :param records: the data, eg:
        [{"dob": "2005", "some_stuff": ["a", "b"]}] (optional)
    :type records: list of dictionaries
    :param primary_key: fields that represent a unique key (optional)
    :type primary_key: list or comma separated string
    :param indexes: indexes on table (optional)
    :type indexes: list or comma separated string

    Please note that setting the ``aliases``, ``indexes`` or
    ``primary_key`` replaces the existing aliases or constraints. Setting
    ``records`` appends the provided records to the resource.

    If ``resource`` carries a ``retention`` value it must be an integer
    from 1 to 100.

    **Results:**

    :returns: The newly created data object, or ``None`` when the resource
        was created from a ``url`` and handed over to the datapusher.
    :rtype: dictionary
    :raises ValidationError: on schema errors, when both or neither of
        ``resource`` and ``resource_id`` are given, on an invalid
        ``retention`` or alias name, when the datapusher is required but
        not loaded, or when the datastore backend rejects the data

    See :ref:`fields` and :ref:`records` for details on how to lay out
    records.

    '''
    schema = context.get('schema', dsschema.datastore_create_schema())
    # records and resource are taken out before validation and put back
    # afterwards
    records = data_dict.pop('records', None)
    resource = data_dict.pop('resource', None)
    data_dict, errors = _validate(data_dict, schema, context)
    resource_dict = None
    if records:
        data_dict['records'] = records
    if resource:
        data_dict['resource'] = resource
    if errors:
        raise p.toolkit.ValidationError(errors)

    p.toolkit.check_access('datastore_ts_create', context, data_dict)

    if 'resource' in data_dict and 'resource_id' in data_dict:
        raise p.toolkit.ValidationError({
            'resource': ['resource cannot be used with resource_id']
        })

    if 'resource' not in data_dict and 'resource_id' not in data_dict:
        raise p.toolkit.ValidationError({
            'resource_id': ['resource_id or resource required']
        })

    if 'resource' in data_dict:
        has_url = 'url' in data_dict['resource']
        if 'retention' in data_dict['resource']:
            # Only conversion failures are treated as "not an integer";
            # unrelated exceptions are no longer swallowed.
            try:
                retention = int(data_dict['resource']['retention'])
            except (TypeError, ValueError, OverflowError):
                retention = None
            if retention is None or not 1 <= retention <= 100:
                raise p.toolkit.ValidationError({'resource': [
                    'Retention must be an integer from 1-100']})
        # A datastore only resource does not have a url in the db
        data_dict['resource'].setdefault('url', '_datastore_only_resource')
        resource_dict = p.toolkit.get_action('resource_create')(
            context, data_dict['resource'])
        data_dict['resource_id'] = resource_dict['id']

        # create resource from file
        if has_url:
            if not p.plugin_loaded('datapusher'):
                raise p.toolkit.ValidationError({'resource': [
                    'The datapusher has to be enabled.']})
            p.toolkit.get_action('datapusher_submit')(context, {
                'resource_id': resource_dict['id'],
                'set_url_type': True
            })
            # since we'll overwrite the datastore resource anyway, we
            # don't need to create it here
            return

        # create empty resource
        else:
            # no need to set the full url because it will be set in
            # before_show
            resource_dict['url_type'] = 'datastore'
            p.toolkit.get_action('resource_update')(context, resource_dict)
    else:
        if not data_dict.pop('force', False):
            resource_id = data_dict['resource_id']
            _check_read_only(context, resource_id)

    data_dict['connection_url'] = config['ckan.datastore.write_url']

    # validate aliases
    aliases = datastore_helpers.get_list(data_dict.get('aliases', []))
    for alias in aliases:
        if not db._is_valid_table_name(alias):
            raise p.toolkit.ValidationError({
                'alias': [u'"{0}" is not a valid alias name'.format(alias)]
            })

    # create a private datastore resource, if necessary
    model = _get_or_bust(context, 'model')
    resource = model.Resource.get(data_dict['resource_id'])
    legacy_mode = 'ckan.datastore.read_url' not in config
    if not legacy_mode and resource.package.private:
        data_dict['private'] = True

    try:
        result = db.create(context, data_dict)
    except db.InvalidDataError as err:
        raise p.toolkit.ValidationError(unicode(err))

    # Set the datastore_active flag on the resource if necessary
    if resource.extras.get('datastore_active') is not True:
        log.debug(
            'Setting datastore_active=True on resource {0}'.format(
                resource.id)
        )
        # issue #3245: race condition
        update_dict = {'datastore_active': True}

        # get extras(for entity update) and package_id(for search index
        # update)
        res_query = model.Session.query(
            model.resource_table.c.extras,
            model.resource_table.c.package_id
        ).filter(
            model.Resource.id == data_dict['resource_id']
        )
        extras, package_id = res_query.one()

        # update extras in database for record and its revision
        extras.update(update_dict)
        res_query.update({'extras': extras}, synchronize_session=False)
        model.Session.query(model.resource_revision_table).filter(
            model.ResourceRevision.id == data_dict['resource_id'],
            # Build a SQL "IS true" test. A Python ``is True`` comparison
            # would be evaluated immediately to the constant False and
            # never reach the database as a column condition.
            model.ResourceRevision.current.is_(True)
        ).update({'extras': extras}, synchronize_session=False)

        model.Session.commit()

        # get package with updated resource from solr
        # find changed resource, patch it and reindex package
        psi = search.PackageSearchIndex()
        solr_query = search.PackageSearchQuery()
        q = {
            'q': 'id:"{0}"'.format(package_id),
            'fl': 'data_dict',
            'wt': 'json',
            'fq': 'site_id:"%s"' % config.get('ckan.site_id'),
            'rows': 1
        }
        for record in solr_query.run(q)['results']:
            solr_data_dict = json.loads(record['data_dict'])
            for indexed_resource in solr_data_dict['resources']:
                if indexed_resource['id'] == data_dict['resource_id']:
                    indexed_resource.update(update_dict)
                    psi.index_package(solr_data_dict)
                    break

    # strip internal keys before handing the result back to the caller
    result.pop('id', None)
    result.pop('private', None)
    result.pop('connection_url')
    datastore_helpers.remove_autogen(result)
    return result
def _bulk_update_dataset(context: Context, data_dict: DataDict,
                         update_dict: dict[str, Any]):
    '''Shared implementation of the organization bulk-update actions.

    Applies ``update_dict`` to the packages listed in
    ``data_dict['datasets']`` that belong to ``data_dict['org_id']``,
    records a 'changed' activity for every listed dataset, commits, and
    then patches and re-indexes the matching Solr documents in batches.
    '''
    datasets = data_dict.get('datasets', [])
    org_id = data_dict.get('org_id')

    model = context['model']
    package_query = model.Session.query(model.package_table)
    package_query = package_query.filter(
        # type_ignore_reason: incomplete SQLAlchemy types
        model.Package.id.in_(datasets)  # type: ignore
    ).filter(model.Package.owner_org == org_id)
    package_query.update(update_dict, synchronize_session=False)

    # Handle Activity Stream for Bulk Operations
    user_obj = model.User.by_name(context['user'])
    user_id = user_obj.id if user_obj else 'not logged in'
    for dataset in datasets:
        entity = model.Package.get(dataset)
        assert entity
        model.Session.add(entity.activity_stream_item('changed', user_id))

    model.Session.commit()

    # solr update here
    psi = search.PackageSearchIndex()

    # update the solr index in batches
    BATCH_SIZE = 50

    def process_solr(q: str):
        # update the solr index for the query
        solr_params = {
            'q': q,
            'fl': 'data_dict',
            'wt': 'json',
            'fq': 'site_id:"%s"' % config.get_value('ckan.site_id'),
            'rows': BATCH_SIZE
        }
        results = search.PackageSearchQuery().run(solr_params)['results']
        for hit in results:
            indexed_pkg = json.loads(hit['data_dict'])
            if indexed_pkg['owner_org'] != org_id:
                continue
            indexed_pkg.update(update_dict)
            psi.index_package(indexed_pkg, defer_commit=True)

    pending = []
    for position, dataset_id in enumerate(datasets, start=1):
        pending.append('id:"%s"' % (dataset_id))
        if position % BATCH_SIZE == 0:
            process_solr(' OR '.join(pending))
            pending = []
    if pending:
        process_solr(' OR '.join(pending))

    # finally commit the changes
    psi.commit()
def __init__(self, name=None):
    """Set ``self.name`` to 'privatedatasets' and create the indexer.

    ``name`` is accepted but not used; ``self.indexer`` is a new
    ``search.PackageSearchIndex``.
    """
    self.name = 'privatedatasets'
    package_indexer = search.PackageSearchIndex()
    self.indexer = package_indexer
def __init__(self, name=None):
    """Create the package search indexer.

    ``name`` is accepted but not used; ``self.indexer`` is a new
    ``search.PackageSearchIndex``.
    """
    package_indexer = search.PackageSearchIndex()
    self.indexer = package_indexer
def __init__(self, name=None):
    """Set ``self.name`` to 'dcatAmsterdam' and create the indexer.

    ``name`` is accepted but not used; ``self.indexer`` is a new
    ``search.PackageSearchIndex``.
    """
    self.name = 'dcatAmsterdam'
    package_indexer = search.PackageSearchIndex()
    self.indexer = package_indexer
def update_product_geo(context, data_dict):
    # noinspection PyUnresolvedReferences
    """
    Update the geodescriptors (dguids) of a product and set its
    ``geolevel_codes`` accordingly.

    The product is looked up with ``package_search`` on ``product_id_new``.
    Its ``geolevel_codes`` are recomputed from the given dguids. For
    product types in ``VALID_DATA_TYPES`` the dguids are stored through the
    ``geo`` helper module; for all other products they are assigned to
    ``geodescriptor_codes`` on the package dict.

    :param productId: ID of the product to update.
    :type productId: str
    :param dguids: Geo-code values; a list, or a single ';'-separated
        string
    :type dguids: list of strings

    :return: updated package, as returned by ``package_show``
    :rtype: dict
    :raises ValidationError: if a value is not a dguid, or if the search
        finds no record or more than one record
    """
    product_id = _get_or_bust(data_dict, 'productId')
    dguids = _get_or_bust(data_dict, 'dguids')

    lc = ckanapi.LocalCKAN(context=context)

    # accept a single ';'-separated string as well as a list
    if isinstance(dguids, basestring):
        dguids = [x.strip() for x in dguids.split(';')]

    # reject the whole request on the first value that is not a dguid
    for dguid in dguids:
        if not stcndm_helpers.is_dguid(dguid):
            _msg = u'Expecting dguid, got {code} instead'.format(code=dguid)
            raise _ValidationError({u'geodescriptor_codes': _msg})

    response = lc.action.package_search(
        q='product_id_new:{product_id}'.format(
            product_id=product_id
        )
    )

    # exactly one matching package is required
    if response['count'] == 0:
        raise _ValidationError(('Record not found.',))
    elif response['count'] > 1:
        raise _ValidationError(
            ('More than one record identified with these values. '
             'Please contact CKAN IT',)
        )

    pkg_dict = response['results'][0]

    # remember the previous levels to detect below whether they changed
    old_geolevel_codes = pkg_dict.get(u'geolevel_codes', [])
    # de-duplicated through a set, so the resulting list order is not
    # guaranteed
    pkg_dict['geolevel_codes'] = list(
        set(stcndm_helpers.get_geolevel(sc) for sc in dguids)
    )

    if pkg_dict['product_type_code'] in VALID_DATA_TYPES:
        # Data product geodescriptors (for which there can be tens of
        # thousands) are stored using the geodescriptor service instead of
        # directly on the package.
        geo.clear_geodescriptors_for_package(pkg_dict['product_id_new'])
        for geo_code in dguids:
            geo.update_relationship(pkg_dict['product_id_new'], geo_code)
    else:
        # Non-data products simply have the geodescriptors assigned to the
        # package.
        pkg_dict['geodescriptor_codes'] = dguids

    # NOTE(review): the nesting of this if/else could not be recovered
    # with certainty from the collapsed source; it is laid out here as a
    # function-level statement -- confirm against the original file.
    # NOTE(review): this is an ordered list comparison, while the new list
    # comes from a set -- presumably equal codes in a different order take
    # the package_update branch; verify that this is intended.
    if old_geolevel_codes == pkg_dict.get(u'geolevel_codes', []):
        # force the re-index of the package so dguids make it into solr
        query = search.PackageSearchQuery()
        q = {
            'q': 'id:{id}'.format(id=pkg_dict['id']),
            'fl': 'data_dict',
            'wt': 'json',
            'fq': 'site_id:"%s"' % config.get('ckan.site_id')
        }
        # re-index the data_dict as currently stored in solr; raises
        # IndexError if the query returns no result
        pkg_to_index = json.loads(query.run(q)['results'][0]['data_dict'])
        psi = search.PackageSearchIndex()
        psi.index_package(pkg_to_index)
    else:
        # update the package geolevels
        lc.action.package_update(**pkg_dict)

    return lc.action.package_show(id=pkg_dict['id'])