def validation_get_schema(dataset_type, resource_type):
    """Return the parsed JSON validation schema configured for a resource
    type of the given dataset type, or None when none is defined.
    """
    dataset_schema = scheming_get_dataset_schema(dataset_type)
    # Only resources whose declared resource_type matches are considered.
    matching = (
        res for res in dataset_schema.get('resources', [])
        if res.get("resource_type", "") == resource_type
    )
    for res in matching:
        for fld in res.get('resource_fields', []):
            if fld['field_name'] != "schema":
                continue
            # The stored field value is a JSON document; parse and return it.
            return validation_load_json_schema(fld['field_value'])
def get_dataset_schema(context, data_dict):
    """
    Return the full schema definition for the dataset `name`.

    :param name: The name of the schema to return.
    :param expanded: Expand schema presets. Defaults to `True`.
    :returns: A complete dataset schema or 404 if not found.
    :rtype: dict
    """
    schema_name = _get_or_bust(data_dict, 'name')
    expanded = data_dict.get('expanded', True)
    # API callers may pass 'expanded' as a string flag; only the literal
    # 'true' enables expansion in that case.
    if isinstance(expanded, basestring):
        expanded = (expanded == 'true')
    schema = scheming_helpers.scheming_get_dataset_schema(
        schema_name,
        expanded=expanded,
    )
    if schema is None:
        raise _NotFound(('no schema by the name {name}'.format(
            name=schema_name
        ),))
    return schema
def resource_copy(self, id, resource_id): context = {'model': model, 'user': toolkit.c.user} # Check access try: toolkit.check_access('package_update', context, {'id': id}) except toolkit.NotAuthorized: message = 'Not authorized to copy resource of dataset "%s"' return toolkit.abort(403, message % id) # Get resource try: resource = toolkit.get_action('resource_show')(context, { 'id': resource_id }) except (toolkit.NotAuthorized, toolkit.ObjectNotFound): message = 'Not found resource "%s" of dataset "%s"' return toolkit.abort(404, message % (resource_id, id)) # Extract data data = {} schema = scheming_get_dataset_schema('dataset') for field in schema['resource_fields']: # We skip url field (current file) if field['field_name'] == 'url': continue # We skip autogenerated fields if field.get('form_snippet', True) is None: continue if field['field_name'] in resource: data[field['field_name']] = resource[field['field_name']] data['name'] = '%s (copy)' % resource.get('name') return self.new_resource(id, data=data)
def _merge_with_schema_default_values(package_type, resource_type, data_dict): """ This function merges the file uploader default resource with the default values specified in the ckanext-schemining schema. It allows us to bulk upload multiple copies ofa particular resource type e.g. multiple spectrum files. """ # If no package_type or resource_type we can't do this. if not (package_type and resource_type): return data_dict schema = scheming_get_dataset_schema(package_type) resource_schemas = schema.get("resource_schemas", {}) resource_schema = resource_schemas.get(resource_type, {}) file_name = data_dict['name'] # Step through each field and merge in the default value if it exits. for field in resource_schema.get('resource_fields', []): if field['field_name'] == 'restricted': # TODO: Would be nice if restricted didn't need special treatment data_dict["restricted_allowed_users"] = field.get( 'default_users', "") data_dict["restricted_allowed_orgs"] = field.get( 'default_organizations', "") value = field.get('default', field.get('field_value')) if value: data_dict[field['field_name']] = value # Multiple resources with the same name is confusing, so merge in filename data_dict['name'] = "{}: {}".format(data_dict.get('name', ""), file_name) return data_dict
def spc_thematic_area_list(context, data_dict):
    """Return the choice list of the 'thematic_area_string' field from the
    default dataset schema. Requires 'spc_thematic_area_list' access.
    """
    tk.check_access('spc_thematic_area_list', context, data_dict)
    dataset_schema = scheming_helpers.scheming_get_dataset_schema('dataset')
    area_field = scheming_helpers.scheming_field_by_name(
        dataset_schema['dataset_fields'],
        'thematic_area_string',
    )
    return scheming_helpers.scheming_field_choices(area_field)
def _get_facets_title_with_translation(self): ''' Get the translated facet title ''' # name of additional facets additional_facets_name = {} # stop if the facet list is empty if not self.additional_facets: return additional_facets_name # get current environment's language language = additional_facets_helpers.lang() # search and get the translated title for facet for facet in self.additional_facets: if self.DATASET_FIELD in facet: # if 'facet_name' and 'dataset_type' exist, wins the 'facet_name' if self.FACET_NAME_FIELD in facet: if type(facet[self.FACET_NAME_FIELD]) is dict: label_array = facet[self.FACET_NAME_FIELD] for key, value in label_array.iteritems(): if key == language and value is not None: additional_facets_name[facet[self.DATASET_FIELD]] = value else: additional_facets_name[facet[self.DATASET_FIELD]] = facet[self.FACET_NAME_FIELD] else: if facet[self.DATASET_TYPE_FIELD]: from ckanext.scheming import helpers as scheming_helpers package_type = self._get_dataset_type_of_facet(facet[self.DATASET_FIELD]) schema = scheming_helpers.scheming_get_dataset_schema(package_type) if schema is None: continue schema_name = facet[self.DATASET_FIELD] #remove prefix in facet name schema_name = schema_name.replace('extras_', '') schema_name = schema_name.replace('res_extras_', '') # switch for dataset or resource if schema_name.startswith( 'res_' ) and 'resource_fields' in schema: fields_from_schema = schema['resource_fields'] elif 'dataset_fields' in schema: fields_from_schema = schema['dataset_fields'] else: continue for field in fields_from_schema: # ckanext-scheming schemas if field['field_name'] == schema_name and 'label' in field: if type(field['label']) is dict: label_array = field['label'] for key, value in label_array.iteritems(): if key == language and value is not None: additional_facets_name[facet[self.DATASET_FIELD]] = value else: additional_facets_name[facet[self.DATASET_FIELD]] = value = field['label'] return additional_facets_name
def before_index(self, pkg_dict):
    """Strip internal notes fields from the Solr document and index the
    human-readable labels of selected choice fields as vocab_* entries.
    """
    # Remove internal non-indexable fields
    # admin_notes
    pkg_dict.pop('admin_notes', None)
    pkg_dict.pop('extras_admin_notes', None)
    # sampling_procedure_notes
    pkg_dict.pop('sampling_procedure_notes', None)
    pkg_dict.pop('extras_sampling_procedure_notes', None)
    # response_rate_notes
    pkg_dict.pop('response_rate_notes', None)
    pkg_dict.pop('extras_response_rate_notes', None)
    # data_collection_notes
    pkg_dict.pop('data_collection_notes', None)
    pkg_dict.pop('extras_data_collection_notes', None)
    # weight_notes
    pkg_dict.pop('weight_notes', None)
    pkg_dict.pop('extras_weight_notes', None)
    # clean_ops_notes
    pkg_dict.pop('clean_ops_notes', None)
    pkg_dict.pop('extras_clean_ops_notes', None)
    # data_accs_notes
    pkg_dict.pop('data_accs_notes', None)
    pkg_dict.pop('extras_data_accs_notes', None)
    # Index labels on selected fields
    schema = scheming_get_dataset_schema('dataset')
    fields = [
        'data_collector', 'keywords', 'sampling_procedure',
        'operational_purpose_of_data', 'data_collection_technique',
        'process_status', 'identifiability'
    ]
    for field in fields:
        if pkg_dict.get(field):
            value = pkg_dict[field]
            # Values may be a JSON list or a bare string; normalize to list.
            try:
                values = json.loads(pkg_dict[field])
            except ValueError:
                values = [value]
            out = []
            for schema_field in schema['dataset_fields']:
                if schema_field['field_name'] == field:
                    for item in values:
                        for choice in schema_field['choices']:
                            if choice['value'] == item:
                                out.append(choice['label'])
            # NOTE(review): assumes every listed field declares 'choices'
            # in the schema — confirm against the schema definition.
            pkg_dict['vocab_' + field] = out
    return pkg_dict
def datawa_scheming_select_options(field_name):
    """Return a {value: label} mapping of the choices declared for
    ``field_name`` in the default dataset schema.

    :param field_name: name of a choice field in 'dataset_fields'
    :raises: whatever the lookup raises (e.g. TypeError/KeyError when the
        field is missing or has no 'choices')
    """
    schema = sh.scheming_get_dataset_schema('dataset')
    # The original wrapped this in `except Exception as e: raise e`, which
    # is a no-op that truncates the traceback on Python 2 — removed.
    access_level_options = sh.scheming_field_by_name(
        schema['dataset_fields'], field_name)['choices']
    return {i['value']: i['label'] for i in access_level_options}
def get_choice_label(name, value, is_resource=False):
    """Translate a stored choice value into its display label using the
    'deposited-dataset' schema.

    :param name: field name to look up
    :param value: stored choice value
    :param is_resource: look in 'resource_fields' instead of 'dataset_fields'
    :returns: the matching choice label, or ``value`` when no match is found
    """
    schema = scheming_get_dataset_schema('deposited-dataset')
    fields = schema['resource_fields'] if is_resource else schema[
        'dataset_fields']
    field = scheming_field_by_name(fields, name)
    # Fix: scheming_field_by_name returns None for unknown fields, which
    # previously crashed with AttributeError on field.get(...).
    if field is None:
        return value
    for choice in field.get('choices', []):
        if choice.get('value') == value:
            return choice.get('label')
    return value
def datawa_scheming_select_options(field_name):
    """Return a {value: label} mapping of the choices declared for
    ``field_name`` in the default dataset schema.

    :param field_name: name of a choice field in "dataset_fields"
    :raises: whatever the lookup raises (e.g. TypeError/KeyError when the
        field is missing or has no "choices")
    """
    schema = sh.scheming_get_dataset_schema("dataset")
    # The original `except Exception as e: raise e` added nothing and
    # truncates the traceback on Python 2 — removed.
    access_level_options = sh.scheming_field_by_name(
        schema["dataset_fields"], field_name)["choices"]
    return {i["value"]: i["label"] for i in access_level_options}
def _get_facet_item_label_with_translation(self, dataset_facet_field, default_facet_label):
    '''
    Translate the default label of facet item. Return the default facet
    label if no translation available.

    :param dataset_facet_field: the name of facet field in the dataset
    :param default_facet_label: the default label of the facet item
    '''
    from ckanext.scheming import helpers as scheming_helpers
    package_type = self._get_dataset_type_of_facet(dataset_facet_field)
    schema = scheming_helpers.scheming_get_dataset_schema(package_type)
    # if a facet has `facet_items` and `dataset_type`, wins `facet_items`
    if self._get_facet_items_of_facet(dataset_facet_field, self.additional_facets) is None:
        # if schema exists
        if schema is not None:
            schema_name = dataset_facet_field
            # remove prefix in facet name
            schema_name = schema_name.replace('extras_', '')
            schema_name = schema_name.replace('res_extras_', '')
            # switch for dataset or resource
            if schema_name.startswith('res_') and 'resource_fields' in schema:
                fields_from_schema = schema['resource_fields']
            elif 'dataset_fields' in schema:
                fields_from_schema = schema['dataset_fields']
            else:
                return self._translate_facet_item_label(dataset_facet_field, default_facet_label)
            for field in fields_from_schema:
                if field['field_name'] == schema_name:
                    # if item key is given - see facet_list.html
                    if default_facet_label is not None:
                        if 'choices' in field:
                            return scheming_helpers.scheming_choices_label(field['choices'], default_facet_label)
                        elif 'choices_helper' in field:
                            # resolve the helper by name from the template helpers
                            from ckantoolkit import h
                            choices_fn = getattr(h, field['choices_helper'])
                            return scheming_helpers.scheming_choices_label(choices_fn(field), default_facet_label)
                        else:
                            return default_facet_label;
                    else:
                        # no item key: translate the field label itself
                        if len(field['label']) > 1 and type(field['label']) is dict:
                            label_array = field['label']
                            language = scheming_helpers.lang()
                            for key, value in label_array.iteritems():
                                if key == language:
                                    if value is not None:
                                        return value
                                    else:
                                        return default_facet_label
                        if field['label'] is not None:
                            return field['label']
                        else:
                            return default_facet_label
    return self._translate_facet_item_label(dataset_facet_field, default_facet_label)
def metadata_download(self, package_id):
    """Stream a dataset's metadata as a two-column (Field, Value) CSV
    attachment named '<package_id>-metadata.csv'.
    """
    context = {
        'model': model,
        'session': model.Session,
        'user': p.toolkit.c.user
    }
    data_dict = {
        'id': package_id,
    }
    try:
        result = get_action('package_show')(context, data_dict)
    except (ObjectNotFound, NotAuthorized):
        abort(404, _('Package not found'))
    dataset_fields = helpers.scheming_get_dataset_schema(
        "dataset")['dataset_fields']
    if hasattr(response, u'headers'):
        response.headers['Content-Type'] = 'text/csv'
        response.headers['Content-disposition'] = \
            'attachment; filename="{name}-metadata.csv"'.format(name=package_id)
    f = StringIO.StringIO()
    # NOTE(review): stdlib csv.writer takes no 'encoding' kwarg — this
    # presumably relies on `csv` being unicodecsv; confirm the import.
    wr = csv.writer(f, encoding='utf-8')
    header = ['Field', 'Value']
    wr.writerow(header)
    for field in dataset_fields:
        if field['field_name'] == 'tag_string':
            # tags are flattened to a single display value
            value = self.get_package_tags(result.get('tags'))
            wr.writerow(
                [helpers.scheming_language_text(field['label']), value])
        elif field['field_name'] == 'owner_org':
            # organization row uses the configurable alias as its label
            org_alias = str(
                config.get('ckan.organization_alias', 'Organization'))
            wr.writerow([org_alias, result['organization']['title']])
        elif field['field_name'] == 'groups':
            group_alias = str(config.get('ckan.group_alias', 'Group')) + 's'
            value = self.get_package_groups(result.get('groups'))
            wr.writerow([group_alias, value])
        elif helpers.scheming_field_choices(field):
            # choice fields: write the label of the stored value
            value = helpers.scheming_choices_label(
                helpers.scheming_field_choices(field),
                result.get(field['field_name']))
            wr.writerow(
                [helpers.scheming_language_text(field['label']), value])
        else:
            wr.writerow([
                helpers.scheming_language_text(field['label']),
                result.get(field['field_name'])
            ])
    return f.getvalue()
def get_field_label(name, is_resource=False):
    """Return the label declared for field *name* in the deposited-dataset
    schema; logs a warning and returns None when the field is unknown.
    """
    schema = scheming_get_dataset_schema('deposited-dataset')
    if is_resource:
        candidates = schema['resource_fields']
    else:
        candidates = schema['dataset_fields']
    field = scheming_field_by_name(candidates, name)
    if not field:
        log.warning(
            'Could not get field {} from deposited-dataset schema'.format(
                name))
        return None
    # Fall back to the field name when no explicit label is declared.
    return field.get('label', name)
def _get_group(result):
    """For a 'subject' result, return the display label matching its
    'subject_display_code'; otherwise (or on no match) return None.
    """
    if result['type'] != 'subject':
        return None
    type_schema = scheming_helpers.scheming_get_dataset_schema(result['type'])
    code = result.get('subject_display_code', '-1')
    for field in type_schema['dataset_fields']:
        if field['field_name'] != 'subject_display_code':
            continue
        for choice in field['choices']:
            if choice['value'] == code:
                return choice['label']
def scheming_dataset_schema_show(context, data_dict):
    '''
    Return the scheming schema for a given dataset type

    :param type: the dataset type
    '''
    dataset_type = get_or_bust(data_dict, 'type')
    schema = scheming_get_dataset_schema(dataset_type)
    if schema is None:
        raise ObjectNotFound()
    return schema
def get_resource_value_label(field_name, resource, dataset_type='dataset'):
    """Render the scheming display snippet for one resource field, or None
    when the field is not in the schema.
    """
    schema = scheming_get_dataset_schema(dataset_type)
    for field in schema['resource_fields']:
        if field['field_name'] != field_name:
            continue
        return toolkit.render_snippet(
            'scheming/snippets/display_field.html',
            data=dict(
                field=field,
                data=resource,
                entity_type='dataset',
                object_type=dataset_type,
            ))
def get_field_choices(dataset_type):
    """Return {field_name: {value: label}} for every choice field of the
    schema; multilingual labels are collapsed to their 'zh_TW' entry.
    """
    from ckanext.scheming import helpers as scheming_helpers
    schema = scheming_helpers.scheming_get_dataset_schema(dataset_type)
    fields = dict()
    for field in schema['dataset_fields']:
        choices = field.get('choices')
        if not choices:
            continue
        mapping = dict()
        for choice in choices:
            label = choice['label']
            if isinstance(label, dict):
                label = label['zh_TW']
            mapping[choice['value']] = label
        fields[field['field_name']] = mapping
    return fields
def _pull_facet_title_from_schema(self, package_type, name, item_name, title):
    """Resolve a translated facet (or facet item) title from the scheming
    schema of *package_type*; falls back to *title* at every step.
    """
    schema = scheming_helpers.scheming_get_dataset_schema(package_type)
    language = scheming_helpers.lang()
    schema_name = name
    schema_name = schema_name.replace('res_extras_', '')
    # switch for dataset or resource
    if schema_name.startswith('res_'):
        fields_from_schema = schema['resource_fields']
    else:
        fields_from_schema = schema['dataset_fields']
    for field in fields_from_schema:
        if field['field_name'] == schema_name:
            # if item key is given - see facet_list.html
            if item_name is not None:
                if 'choices' in field:
                    for entry in field['choices']:
                        if entry['value'] == item_name:
                            if len(entry['label']) > 1 and type(
                                    entry['label']) is dict:
                                label_array = entry['label']
                                for key, value in label_array.iteritems():
                                    if key == language:
                                        if value is not None:
                                            return value
                                        else:
                                            return title
                                # NOTE(review): relies on the leaked loop
                                # variable 'value' after iteration — falls
                                # back to the last language's label.
                                if value is not None:
                                    return value
                                else:
                                    return title
                            else:
                                return title
            else:
                if len(field['label']) > 1 and type(
                        field['label']) is dict:
                    label_array = field['label']
                    for key, value in label_array.iteritems():
                        if key == language:
                            if value is not None:
                                return value
                            else:
                                return title
                    # same leaked-loop-variable fallback as above
                    if value is not None:
                        return value
                    else:
                        return title
                if field['label'] is not None:
                    return field['label']
                else:
                    return title
    return title
def scheming_dataset_schema_show(context, data_dict):
    """
    Return the scheming schema for a given dataset type

    :param type: the dataset type
    :param expanded: True to expand presets (default)
    """
    dataset_type = get_or_bust(data_dict, 'type')
    schema = scheming_get_dataset_schema(
        dataset_type, data_dict.get('expanded', True))
    if schema is None:
        raise ObjectNotFound()
    return schema
def scheming_dataset_schema_show(context, data_dict):
    '''
    Return the scheming schema for a given dataset type

    :param type: the dataset type
    :param expanded: True to expand presets (default)
    '''
    requested_type = get_or_bust(data_dict, 'type')
    expand_presets = data_dict.get('expanded', True)
    result = scheming_get_dataset_schema(requested_type, expand_presets)
    if result is not None:
        return result
    raise ObjectNotFound()
def get_choice_label(name, value, is_resource=False):
    """Translate a stored choice value into its label via the
    deposited-dataset schema; unknown fields log a warning (returns None),
    unknown values fall back to the raw value.
    """
    schema = scheming_get_dataset_schema('deposited-dataset')
    if is_resource:
        fields = schema['resource_fields']
    else:
        fields = schema['dataset_fields']
    field = scheming_field_by_name(fields, name)
    if not field:
        log.warning(
            'Could not get field {} from deposited-dataset schema'.format(
                name))
        return None
    for choice in field.get('choices', []):
        if choice.get('value') == value:
            return choice.get('label')
    return value
def _map_gdl_to_publication(data_dict, obj):
    """Map a harvested GDL record to a CKAN 'publications' package dict.

    :param data_dict: raw record from the GDL source
    :param obj: harvest object (used for the source URL of the file link)
    :returns: package dict ready for package_create/update
    """
    dataset = {
        # deterministic id derived from the source record id
        "id": str(uuid.uuid3(uuid.NAMESPACE_DNS, str(data_dict['id']))),
        "type": "publications",
        "title": data_dict['title'],
        "creator": [a['name'] for a in data_dict['authors']],
        # "subject": data_dict,
        "notes": data_dict['description'],
        "publisher": data_dict.get('relatedOrganisation'),
        # "contributor": [a['name'] for a in data_dict['authors']],
        "date": data_dict.get('created'),
        "metadata_modified": data_dict.get('created'),
        # "publication_type": data_dict,
        # "format": data_dict,
        "identifier": data_dict['identifier'],
        "source": data_dict.get('source'),
        # "language": data_dict,
        # "relation": data_dict,
        # "spatial": data_dict,
        # "rights": data_dict,
        "license_id": 'notspecified',
        "member_countries": 'other',  # relatedCountry, optional
        "harvest_source": 'GDL'
    }
    thematic_area = data_dict.get('thematicArea', {}).get('area')
    if thematic_area:
        dataset["thematic_area_string"] = thematic_area_mapping.get(
            thematic_area)
    related_country = data_dict.get('relatedCountry')
    if related_country:
        # match the source country name against schema choice labels
        schema = sh.scheming_get_dataset_schema('publications')
        choices = sh.scheming_field_by_name(schema['dataset_fields'],
                                            'member_countries')['choices']
        member_country = F.first(
            F.filter(
                F.compose(F.rpartial(contains, related_country),
                          itemgetter('label')), choices))
        if member_country:
            dataset['member_countries'] = member_country['value']
            spatial = get_extent_for_country(member_country['label'])
            if spatial:
                dataset['spatial'] = spatial['value']
    if data_dict['file']:
        # build a single download resource; format comes from the extension
        res_url = _gl_url(obj.source.url, 'download') + '?id=' + str(
            data_dict['id'])
        res = {'name': data_dict['file'], 'url': res_url}
        res['format'] = splitext(res['name'])[1].lstrip('.')
        dataset['resources'] = [res]
    return dataset
def copy(package_type, dataset_id):
    """Show the dataset-create form pre-filled with a copy of an existing
    dataset.

    :param package_type: dataset type for the create view
    :param dataset_id: id or name of the dataset to copy
    :returns: rendered create form, or a 403/404 error page
    """
    context = {'model': model, 'user': toolkit.c.user}
    # Get organizations in which the user may create datasets
    orgs = toolkit.get_action('organization_list_for_user')(
        context.copy(), {
            'permission': 'create_dataset'
        })
    org_ids = [org['id'] for org in orgs]
    # Check access
    if not orgs:
        message = 'Not authorized to copy dataset "%s"'
        return toolkit.abort(403, message % dataset_id)
    # Get dataset
    try:
        dataset = toolkit.get_action('package_show')(context.copy(), {
            'id': dataset_id
        })
    except (toolkit.NotAuthorized, toolkit.ObjectNotFound):
        # Fix: message previously read 'Not found py dataset "%s"' (typo)
        message = 'Not found dataset "%s"'
        return toolkit.abort(404, message % dataset_id)
    # Extract data declared by the scheming schema
    data = {}
    schema = scheming_get_dataset_schema('dataset')
    for field in schema['dataset_fields']:
        # We skip name/title — the copy gets its own
        if field['field_name'] in ['name', 'title']:
            continue
        # We skip autogenerated fields (form_snippet explicitly None)
        if field.get('form_snippet', True) is None:
            continue
        # We skip empty fields
        if field['field_name'] not in dataset:
            continue
        data[field['field_name']] = dataset[field['field_name']]
    data['type'] = 'dataset'
    data['private'] = bool(dataset.get('private'))
    # Keep the owner org only when the user can create datasets in it
    if data.get('owner_org'):
        data['owner_org'] = data['owner_org'] if data[
            'owner_org'] in org_ids else None
    data['original_dataset'] = dataset
    data['tags'] = dataset['tags']
    view = CreateView()
    return view.get(package_type, data=data)
def new(self, ds_id, ds_type):
    """Prepare a payload for creating a child product of dataset *ds_id*
    and delegate to the stock PackageController.new form.
    """
    new_payload = None
    if 'save' not in request.params:
        lc = ckanapi.LocalCKAN()
        pkg = lc.action.package_show(id=ds_id)
        pkg_id = pkg[PRODUCT_ID]
        parent_schema = scheming_get_dataset_schema(pkg['type'])
        new_payload = {
            'type': ds_type,
            # keep the root ancestor id; fall back to the parent itself
            'top_parent_id': pkg.get('top_parent_id', pkg_id) or pkg_id
        }
        id_payload = {
            'parentProductId': pkg['product_id_new'],
            'parentProduct': pkg['product_id_new'],
            'productType': str(
                parent_schema['dataset_type_code']
            ),
            'productTypeCode': str(
                parent_schema['dataset_type_code']
            )
        }
        if ds_type == 'format':
            new_payload['parent_id'] = pkg_id
        elif ds_type == 'article':
            # articles need no generated product id here
            pass
        elif ('non_data_product' in parent_schema and
                parent_schema['non_data_product'] == True):
            # non-data products draw ids from dedicated sequences
            if is_legacy_product(pkg[PRODUCT_ID]):
                new_payload[PRODUCT_ID] = lc.action.GetNextLegacyProductId(
                    **id_payload
                )
            else:
                id_payload['subjectCode'] = pkg['subject_codes'][0]
                new_payload[PRODUCT_ID] = lc.action.GetNextNonDataProductId(
                    **id_payload
                )
        else:
            new_payload[PRODUCT_ID] = lc.action.GetNextProductId(
                **id_payload
            )
    return PackageController().new(new_payload)
def _normalize_dataset_dict(dataset_dict):
    '''
    Adapt the dataset dict returned by the RDF harvester to the one
    expected by the custom Honduras schema.
    '''
    dataset_schema = scheming_get_dataset_schema('dataset')
    # Promote extras to root fields
    for schema_field in dataset_schema['dataset_fields']:
        field_name = schema_field['field_name']
        extra_value = _remove_dataset_dict_extra(dataset_dict, field_name)
        if extra_value:
            dataset_dict[field_name] = extra_value
    return dataset_dict
def get_dataset_fields(self):
    """Return CKAN core package fields extended with the scheming-defined
    dataset field names (utf8-encoded), without duplicates.
    """
    fields = model.Package.get_fields(core_only=True)
    scheming_schema = scheming_get_dataset_schema(
        'dataset')['dataset_fields']
    scheming_fields = [
        field['field_name'].encode('utf8') for field in scheming_schema
    ]
    # Remove duplicate fields, since scheming can contain fields named
    # similarly to CKAN core fields
    for name in scheming_fields:
        if name not in fields:
            fields.append(name)
    log.info(fields)
    return fields
def after_search(self, search_results, search_params):
    """Replace facet item display names with the schema choice labels of
    the first result's dataset type.
    """
    facets = search_results.get('search_facets')
    results = search_results.get('results')
    if not facets or not results:
        return search_results
    # NOTE(review): assumes all results share the first result's type.
    schema = scheming_helpers.scheming_get_dataset_schema(
        results[0]['type'])
    for facet in facets.values():
        for item in facet['items']:
            # facet titles are the indexed field name plus a '_facet' suffix
            field_name = facet['title'].replace('_facet', '')
            field = scheming_helpers.scheming_field_by_name(
                schema['dataset_fields'], field_name)
            if field and (field.get('choices') or
                          field.get('choices_helper')):
                choices = scheming_helpers.scheming_field_choices(field)
                item['display_name'] = scheming_helpers.\
                    scheming_choices_label(choices, item['name'])
    return search_results
def _pull_title_from_schema(self, package_type):
    """Return the translated 'dataset_type_label' for *package_type*,
    falling back to the raw 'dataset_type' value; returns None when the
    schema declares no label at all.
    """
    language = scheming_helpers.lang()
    schema = scheming_helpers.scheming_get_dataset_schema(package_type)
    if 'dataset_type_label' in schema:
        if len(schema['dataset_type_label']) > 1 and type(
                schema['dataset_type_label']) is dict:
            label_array = schema['dataset_type_label']
            for key, value in label_array.iteritems():
                if key == language:
                    if value is not None:
                        return value
                    else:
                        return schema['dataset_type']
            # NOTE(review): relies on the leaked loop variable 'value'
            # after iteration — falls back to the last language's label.
            if value is not None:
                return value
            else:
                return schema['dataset_type']
        else:
            return schema['dataset_type']
def _extract_additional_fields(self, content, package_dict):
    """Fold harvested *content* keys into *package_dict* for the
    'publications' schema, mapping coverage country names to member-country
    values and a spatial polygon.
    """
    package_dict['thematic_area_string'] = self.topic
    if not package_dict.get('license_id'):
        package_dict['license_id'] = 'notspecified'
    skip_keys = {'set_spec', 'description'}
    for key, value in content.items():
        # never overwrite existing package keys; skip bookkeeping keys
        if key in package_dict or key in skip_keys:
            continue
        if key == 'type':
            key = 'publication_type'
        package_dict[key] = value
    package_dict.pop('extras', None)
    package_dict['type'] = 'publications'
    package_dict.pop('maintainer_email', None)
    coverage = package_dict.pop('coverage', None)
    if coverage:
        # translate coverage country labels into schema choice values
        schema = scheming_get_dataset_schema('publications')
        field = scheming_field_by_name(schema['dataset_fields'],
                                       'member_countries')
        choices = scheming_field_choices(field)
        package_dict['member_countries'] = [
            choice['value'] for choice in choices
            if choice['label'] in coverage
        ] or ['other']
        polygons = [
            t['geometry'] for t in eez.collection
            if any(country in t['properties']['GeoName']
                   for country in coverage)
        ]
        # TODO: for now we are taking first polygon from possible
        # list because of SOLR restriction of spatial field
        # size. In future we may add additional logic here
        if polygons:
            package_dict['coverage'] = json.dumps(polygons[0])
    return package_dict
def new(self, ds_id, ds_type):
    """Prepare a payload for creating a child product (format, issue,
    article, or other product types) of dataset *ds_id* and delegate to
    the stock PackageController.new form.
    """
    new_payload = None
    if 'save' not in request.params:
        lc = ckanapi.LocalCKAN()
        pkg = lc.action.package_show(id=ds_id)
        pkg_id = pkg[PRODUCT_ID]
        parent_schema = scheming_get_dataset_schema(pkg['type'])
        new_payload = {
            'type': ds_type,
            # keep the root ancestor id; fall back to the parent itself
            'top_parent_id': pkg.get('top_parent_id', pkg_id) or pkg_id
        }
        id_payload = {
            'parentProductId': pkg['product_id_new'],
            'parentProduct': pkg['product_id_new'],
            'productType': str(
                parent_schema['dataset_type_code']
            ),
            'productTypeCode': str(
                parent_schema['dataset_type_code']
            )
        }
        if ds_type == 'format':
            new_payload['parent_id'] = pkg_id
        elif ds_type == 'issue':
            # issue ids are the parent id plus the next issue number
            issue_number = next_issue_number(pkg_id)
            issue_id = u'{pid}{issue_number}'.format(
                pid=pkg_id,
                issue_number=issue_number
            )
            new_payload['product_type_code'] = pkg.get('product_type_code')
            new_payload['issue_number'] = issue_number
            new_payload['product_id_new'] = issue_id
            new_payload['name'] = u'issue-{issue_id}'.format(
                issue_id=issue_id
            )
            pass
        elif ds_type == 'article':
            # articles are numbered within the parent issue
            article_id = next_article_id(
                pkg.get('top_parent_id', pkg_id) or pkg_id,
                pkg.get('issue_number')
            )
            new_payload['product_type_code'] = pkg.get('product_type_code')
            new_payload['issue_number'] = pkg.get('issue_number')
            new_payload['product_id_new'] = article_id
            new_payload['name'] = u'article-{article_id}'.format(
                article_id=article_id
            )
            pass
        elif ('non_data_product' in parent_schema and
                parent_schema['non_data_product'] == True):
            # non-data products draw ids from dedicated sequences
            if is_legacy_product(pkg[PRODUCT_ID]):
                new_payload[PRODUCT_ID] = lc.action.GetNextLegacyProductId(
                    **id_payload
                )
            else:
                id_payload['subjectCode'] = pkg['subject_codes'][0]
                new_payload[PRODUCT_ID] = lc.action.GetNextNonDataProductId(
                    **id_payload
                )
        else:
            new_payload[PRODUCT_ID] = lc.action.GetNextProductId(
                **id_payload
            )
    return PackageController().new(new_payload)
def _get_dataset_schema():
    """Return the scheming schema for the default 'dataset' type."""
    schema = scheming_get_dataset_schema('dataset')
    return schema
def import_stage(self, harvest_object):
    """Create, update, or delete a package from a harvest object.

    Returns True on success (including deletions) and False on any
    unrecoverable problem with the harvest object or its content.
    """
    log.debug('In PRDREngergyResourcesHarvester import_stage')
    if not harvest_object:
        log.error('No harvest object received')
        return False
    self._set_config(harvest_object.job.source.config)
    # force_import overrides the recorded status and treats it as a change
    if self.force_import:
        status = 'change'
    else:
        status = self._get_object_extra(harvest_object, 'status')
    if status == 'delete':
        context = {
            'model': model,
            'session': model.Session,
            'user': self._get_user_name()
        }
        p.toolkit.get_action('package_delete')(
            context, {
                'id': harvest_object.package_id
            })
        log.info('Deleted package {0} with guid {1}'.format(
            harvest_object.package_id, harvest_object.guid))
        return True
    if harvest_object.content is None:
        self._save_object_error(
            'Empty content for object %s' % harvest_object.id,
            harvest_object, 'Import')
        return False
    # Get the last harvested object (if any)
    previous_object = model.Session.query(HarvestObject) \
        .filter(HarvestObject.guid == harvest_object.guid) \
        .filter(HarvestObject.current == True) \
        .first()
    # Flag previous object as not current anymore
    if previous_object and not self.force_import:
        previous_object.current = False
        previous_object.add()
    package_dict = self._get_package_dict(harvest_object)
    if not package_dict:
        return False
    if not package_dict.get('name'):
        package_dict['name'] = \
            self._get_package_name(harvest_object, package_dict['title'])
    # copy across resource ids from the existing dataset, otherwise they'll
    # be recreated with new ids
    if status == 'change':
        existing_dataset = self._get_existing_dataset(harvest_object.guid)
        if existing_dataset:
            copy_across_resource_ids(existing_dataset, package_dict)
    # Allow custom harvesters to modify the package dict before creating
    # or updating the package
    package_dict = self.modify_package_dict(package_dict, harvest_object)
    # Unless already set by an extension, get the owner organization (if
    # any) from the harvest source dataset
    if not package_dict.get('owner_org'):
        source_dataset = model.Package.get(harvest_object.source.id)
        if source_dataset.owner_org:
            package_dict['owner_org'] = source_dataset.owner_org
    if not package_dict.get('license_id'):
        package_dict['license_id'] = 'notspecified'
    # Flag this object as the current one
    harvest_object.current = True
    harvest_object.add()
    context = {
        'user': self._get_user_name(),
        'return_id_only': True,
        'ignore_auth': True,
    }
    # Map member-country labels to schema choice values
    package_schema = scheming_get_dataset_schema('dataset')
    field = scheming_field_by_name(package_schema['dataset_fields'],
                                   'member_countries')
    choices = scheming_field_choices(field)
    mem_temp_list = [
        x for x in package_dict['member_countries'] if x is not None
    ]
    package_dict['member_countries'] = [
        choice['value'] for choice in choices
        if choice['label'] in mem_temp_list
    ] or ['other']
    polygons = [
        t['geometry'] for t in eez.collection
        if any(country in t['properties']['GeoName']
               for country in mem_temp_list)
    ]
    # TODO: for now we are taking first polygon from possible
    # list because of SOLR restriction of spatial field
    # size. In future we may add additional logic here
    if polygons:
        package_dict['coverage'] = json.dumps(polygons[0])
    if status == 'new':
        # context['schema'] = package_schema
        # We need to explicitly provide a package ID
        package_dict['id'] = unicode(uuid.uuid4())
        # package_schema['id'] = [unicode]
        # Save reference to the package on the object
        harvest_object.package_id = package_dict['id']
        harvest_object.add()
        # Defer constraints and flush so the dataset can be indexed with
        # the harvest object id (on the after_show hook from the harvester
        # plugin)
        model.Session.execute(
            'SET CONSTRAINTS harvest_object_package_id_fkey DEFERRED')
        model.Session.flush()
        package_id = \
            p.toolkit.get_action('package_create')(context, package_dict)
        log.info('Created dataset with id %s', package_id)
    elif status == 'change':
        package_dict['id'] = harvest_object.package_id
        try:
            package_id = \
                p.toolkit.get_action('package_update')(context, package_dict)
            log.info('Updated dataset with id %s', package_id)
        except NotFound:
            # The recorded package vanished; fall back to creating it anew
            log.info(
                'Update returned NotFound, trying to create new Dataset.')
            if not harvest_object.package_id:
                package_dict['id'] = unicode(uuid.uuid4())
                harvest_object.package_id = package_dict['id']
                harvest_object.add()
            else:
                package_dict['id'] = harvest_object.package_id
            package_id = \
                p.toolkit.get_action('package_create')(context, package_dict)
            log.info('Created dataset with id %s', package_id)
    model.Session.commit()
    # Create default resource views for the stored package
    stored_package = p.toolkit.get_action('package_show')(context.copy(), {
        'id': package_id
    })
    for res in stored_package.get('resources', []):
        p.toolkit.get_action('resource_create_default_resource_views')(
            context.copy(), {
                'package': stored_package,
                'resource': res
            })
    return True
def before_index(self, data_dict):
    """
    Customize data sent to Solr before indexing.

    Unpacks fluent (multilingual) fields, resolves coded values to their
    English/French labels via ``lookup_label``, normalizes dates to ISO
    format, and collects author initials.

    :param data_dict: the package dict as prepared by CKAN for indexing;
        must contain a JSON-encoded ``validated_data_dict``.
    :type data_dict: dict
    :raises ValidationError: if no scheming schema exists for the
        dataset's ``type``.
    :returns: the modified dict to be sent to Solr.
    :rtype: dict
    """
    # Sentinel default for dateutil.parse so we can detect inputs that
    # supplied no usable date components at all.
    bogus_date = datetime.datetime(1, 1, 1)
    dataset_schema = scheming_get_dataset_schema(
        data_dict.get('type', 'unknown')
    )

    if dataset_schema is None:
        raise ValidationError(
            'Found no schema for following dataset :\n{dump}'.format(
                dump=json.dumps(data_dict, indent=2)
            )
        )

    # Index the schema's dataset fields by field_name for O(1) lookup.
    field_schema = dict()
    for dataset_field in dataset_schema['dataset_fields']:
        d = dataset_field
        field_schema[d['field_name']] = d

    index_data_dict = {}
    authors = []

    # drop extras fields
    for dict_key in data_dict:
        if not dict_key.startswith('extras_'):
            index_data_dict[dict_key] = data_dict[dict_key]

    # iterate through validated data_dict fields and modify as needed
    validated_data_dict = json.loads(data_dict['validated_data_dict'])

    for item in validated_data_dict.keys():
        value = validated_data_dict[item]
        # Empty values should not be indexed at all.
        if not value and item in index_data_dict:
            index_data_dict.pop(item)
            continue

        fs = field_schema.get(item, None)
        # ignore all fields not currently in the schema
        if not fs:
            continue

        field_type = fs.get('schema_field_type', 'string')
        multivalued = fs.get('schema_multivalued', False)

        if field_type == 'fluent':
            # Fluent fields are {lang_code: text} dicts; flatten each
            # language into its own "<field>_<lang>" Solr field.
            for key in value.keys():
                label = u'{item}_{key}'.format(
                    item=item,
                    key=key
                )
                index_data_dict[label] = value[key]
        # for code type, the en/fr labels need to be looked up
        # and sent to Solr
        elif field_type == 'code':
            lookup_type = fs.get('lookup', '')
            if lookup_type == 'codeset':
                lookup = fs.get('codeset_type', '')
            elif lookup_type == 'preset':
                # Strip the "dc_"-style 4-char prefix from the preset name.
                lookup = fs.get('preset', '')[4:]
            else:
                lookup = fs.get('lookup', '')

            if lookup and value:
                label_en = u'{item}_desc_en'.format(
                    item=item
                )
                label_fr = u'{item}_desc_fr'.format(
                    item=item
                )
                if multivalued:
                    desc_en = []
                    desc_fr = []
                    for v in value:
                        if not v:
                            continue
                        desc = lookup_label(lookup, v, lookup_type)
                        desc_en.append(desc[u'en'])
                        desc_fr.append(desc[u'fr'])

                    index_data_dict[str(item)] = value
                    index_data_dict[label_en] = desc_en
                    index_data_dict[label_fr] = desc_fr
                else:
                    # NOTE(review): single-valued codes index only the
                    # labels, not the raw code itself — confirm intended.
                    desc = lookup_label(lookup, value, lookup_type)
                    index_data_dict[label_en] = desc[u'en']
                    index_data_dict[label_fr] = desc[u'fr']
        elif field_type == 'date':
            if value:
                try:
                    date = parse(value, default=bogus_date)
                    if date != bogus_date:
                        index_data_dict[item] = date.isoformat() + 'Z'
                except ValueError:
                    continue
        elif item.endswith('_authors'):
            index_data_dict[str(item)] = value
            authors.extend(value)
        else:
            # All other field types are passed through unchanged.
            # (Was a redundant `if multivalued:` whose two branches were
            # identical — collapsed to a single assignment.)
            index_data_dict[str(item)] = value

    if authors:
        index_data_dict['authors'] = authors
        index_data_dict['authors_initials'] = list(
            set(
                [strip_accents(i[0]).upper() for i in authors]
            )
        )

    return index_data_dict
def before_index(self, data_dict):
    """
    Customize data sent to Solr before indexing.

    Drops ``extras_*`` duplicates, appends DGUID/geodescriptor codes from
    the datastore, flattens fluent (multilingual) fields, resolves coded
    values to per-language description fields, normalizes dates, and
    collects author initials.

    :param data_dict: package dict prepared by CKAN for indexing; must
        contain a JSON-encoded ``validated_data_dict``.
    :type data_dict dict
    :raises ValidationError: if no schema exists for the dataset type, if
        a fluent field is not a dict, or if a code field's lookup cannot
        be determined.
    :returns dict
    """
    dataset_schema = scheming_get_dataset_schema(data_dict.get('type'))

    if dataset_schema is None:
        raise ValidationError((_(
            'Found no schema for following datasets:\n{dump}'.format(
                dump=json.dumps(data_dict, indent=2, sort_keys=True)
            )
        ),))

    # Map field_name -> schema entry for O(1) lookup below.
    field_schema = dict(
        (s['field_name'], s) for s in dataset_schema['dataset_fields']
    )

    index_data_dict = data_dict.copy()

    for k in data_dict:
        if k.startswith(u'extras_'):
            index_data_dict.pop(k, None)

    authors = []
    # Sentinel default for dateutil.parse (fills in missing components).
    default_date = datetime(1, 1, 1, 8, 30, 0, 0)

    validated_data_dict = json.loads(data_dict['validated_data_dict'])
    name = validated_data_dict.get(u'name')

    # append dguids from the datastore
    if validated_data_dict.get(u'product_id_new'):
        index_data_dict[u'dguid_codes'] = []
        for dguid_pkg_id in geo.get_geodescriptors_for_package(
                validated_data_dict[u'product_id_new']):
            index_data_dict[u'dguid_codes'].append(
                helpers.get_dguid_from_pkg_id(dguid_pkg_id))
        # strip the vintages from dguids to get geodescriptors
        index_data_dict[u'geodescriptor_codes'] = \
            [g[4:] if is_dguid(g) else g
             for g in index_data_dict[u'dguid_codes'] if g]

    for item, value in validated_data_dict.iteritems():
        fs = field_schema.get(item)

        # Do not index any field that is not currently in the schema.
        if not fs:
            continue

        field_type = fs.get('schema_field_type', 'string')
        # TODO: we're not using the multivalued schema field. Drop it?
        multivalued = fs.get('schema_multivalued', False)

        # Legacy issues numbers are non-numeric, which is problematic
        # for sorting and external tools. We can't just use a Solr
        # <copyTo> directive, as it'll fail entirely on a bad value.
        # FIX: compare the *field* name (item), not the dataset name;
        # the original `name == 'issue_number'` could never match the
        # issue_number field and would misfire on a dataset so named.
        if item == 'issue_number':
            if value.isdigit():
                index_data_dict['issue_number_int'] = int(value)

        # Fluent (multilingual) fields are really dictionaries, where
        # each key is the ISO language code, and the value the translated
        # text. We need to unpack these into individual solr fields
        # for per-language search.
        if field_type == 'fluent':
            if isinstance(value, dict):
                index_data_dict.update(
                    (u'{0}_{1}'.format(item, k), v)
                    for k, v in value.iteritems()
                )
            else:
                raise ValidationError((_(
                    '{name}: Expecting a fluent dict for {item}, '
                    'instead got {value!r}'.format(
                        name=name,
                        item=item,
                        value=value
                    )
                ), ))
        # Numeric foreign keys that need to be looked up to retrieve
        # their multilingual labels for searching.
        elif field_type == u'code':
            index_data_dict[unicode(item)] = value

            # These codes can refer to a codeset (a dataset of type
            # 'codeset' with a particular key), a preset (a hardcoded
            # value in a Scheming schema), or another dataset (lookup).
            lookup_type = fs.get(u'lookup', '')
            if lookup_type == u'codeset':
                lookup = fs.get(u'codeset_type', '')
            elif lookup_type == u'preset':
                lookup = fs.get(u'preset', '')[4:]
            else:
                lookup = fs.get(u'lookup', '')

            if not lookup:
                raise ValidationError((_(
                    '{name}: unable to determine lookup '
                    'for {item}'.format(
                        name=name,
                        item=item
                    )
                ), ))

            if isinstance(value, list):
                for value_to_lookup in value:
                    if not value_to_lookup:
                        continue
                    desc = lookup_label(
                        lookup,
                        value_to_lookup,
                        lookup_type
                    )
                    for k, v in desc.iteritems():
                        if v and not k == u'found':
                            n = u'{item}_desc_{key}'.format(
                                item=item,
                                key=k
                            )
                            # Accumulate one label list per language.
                            index_data_dict.update(
                                {n: index_data_dict.get(n, []) + [v]}
                            )
            else:
                desc = lookup_label(lookup, value, lookup_type)
                index_data_dict.update((
                    u'{item}_desc_{key}'.format(
                        item=item,
                        key=k
                    ), v)
                    for k, v in desc.iteritems()
                    if v and not k == u'found'
                )

            if item == u'geodescriptor_codes':
                index_data_dict[u'dguid_codes'] = \
                    list(index_data_dict[u'geodescriptor_codes'])
        elif field_type == 'date':
            try:
                date = parse(value, default=default_date)
                index_data_dict[unicode(item)] = unicode(
                    date.isoformat()[:19] + u'Z'
                )
            except ValueError:
                continue
        elif item.endswith('_authors'):
            index_data_dict[unicode(item)] = value
            authors.extend(value)
        else:
            index_data_dict[unicode(item)] = value

    if authors:
        index_data_dict['authors'] = authors
        index_data_dict['authors_initials'] = list(
            set(
                [strip_accents(i[0]).upper() for i in authors]
            )
        )

    return index_data_dict
def qdes_datasets_with_empty_recommended_fields(context, config=None):
    u"""
    List of all datasets that have no values against recommended metadata
    fields.

    :param context: CKAN action context (must be a sysadmin).
    :param config: optional dict; may contain ``org_id`` to restrict the
        report to a single organization. Defaults to no filtering.
        (Was a mutable ``config={}`` default — replaced with a ``None``
        sentinel; behavior for callers is unchanged.)
    :returns: list of report rows for datasets/resources with missing
        recommended field values.
    """
    if config is None:
        config = {}

    # Check access for sysadmin user's only
    check_access('config_option_update', context, None)

    # Get org_id config.
    org_id = config.get('org_id', None)

    # Get list of recommended fields.
    dataset_scheme = scheming_helpers.scheming_get_dataset_schema('dataset')
    dataset_recommended_fields = qdes_logic_helpers \
        .qdes_get_recommended_dataset_fields(dataset_scheme,
                                             'dataset_fields')
    dataset_resource_recommended_fields = qdes_logic_helpers \
        .qdes_get_recommended_dataset_fields(dataset_scheme,
                                             'resource_fields')

    # Build rows, paging through packages `limit` at a time.
    rows = []
    i = 0
    limit = 10
    has_result = True
    point_of_contacts = {}
    while has_result:
        packages = get_action('current_package_list_with_resources')(
            context, {
                'limit': limit,
                'offset': i
            })

        if not packages:
            has_result = False
        else:
            i += limit

        for package in packages:
            if package.get('state') == 'active':
                # Load and cache point of contacts.
                contact_point_pos = package.get('contact_point', None)
                if contact_point_pos not in point_of_contacts:
                    point_of_contacts[contact_point_pos] = qdes_logic_helpers \
                        .get_point_of_contact(context, contact_point_pos) \
                        if contact_point_pos else {}

                # Get package organization.
                pkg_org = package.get('organization')

                # Filter based on org_id or package type.
                if (org_id and pkg_org.get('id') != org_id
                        ) or package.get('type') == 'dataservice':
                    continue

                # Get missing value fields.
                missing_values = qdes_logic_helpers \
                    .qdes_check_recommended_field_value(
                        package, dataset_recommended_fields)

                # Get contact point.
                contact_point = point_of_contacts.get(contact_point_pos)

                # Build row.
                if missing_values:
                    row = qdes_logic_helpers \
                        .qdes_empty_recommended_field_row(
                            package, contact_point, missing_values)
                    rows.append(row)

                # Check dataset resource metadata fields.
                for resource in package.get('resources', []):
                    # Get missing value fields.
                    missing_values = qdes_logic_helpers \
                        .qdes_check_recommended_field_value(
                            resource, dataset_resource_recommended_fields)

                    # Build row.
                    if missing_values:
                        row = qdes_logic_helpers \
                            .qdes_empty_recommended_field_row(
                                package, contact_point,
                                missing_values, resource)
                        rows.append(row)

    return rows
def before_index(self, pkg_dict):
    """
    Prepare a package dict for Solr indexing.

    Strips internal note fields that must not be indexed, translates
    selected choice-field values into their human-readable labels
    (``vocab_<field>``), and, for deposited datasets, resolves curator,
    depositor and destination-container display names.
    """
    # Remove internal non-indexable fields (both the plain key and its
    # 'extras_'-prefixed duplicate).
    internal_notes = (
        'admin_notes',
        'sampling_procedure_notes',
        'response_rate_notes',
        'data_collection_notes',
        'weight_notes',
        'clean_ops_notes',
        'data_accs_notes',
    )
    for note_field in internal_notes:
        pkg_dict.pop(note_field, None)
        pkg_dict.pop('extras_' + note_field, None)

    # Index labels on selected fields
    schema = scheming_get_dataset_schema('dataset')
    fields = [
        'data_collector', 'keywords', 'sampling_procedure',
        'operational_purpose_of_data', 'data_collection_technique',
        'process_status', 'identifiability'
    ]
    for field in fields:
        raw = pkg_dict.get(field)
        if not raw:
            continue
        # Values may be stored as a JSON list; fall back to a single
        # plain value when it isn't valid JSON.
        try:
            values = json.loads(raw)
        except ValueError:
            values = [raw]
        labels = []
        for schema_field in schema['dataset_fields']:
            if schema_field['field_name'] != field:
                continue
            for item in values:
                labels.extend(
                    choice['label']
                    for choice in schema_field['choices']
                    if choice['value'] == item
                )
        pkg_dict['vocab_' + field] = labels

    # Index additional data for deposited dataset
    if pkg_dict.get('type') == 'deposited-dataset':

        def _attach_display_name(action, entity_id, target_key):
            # Resolve an entity's display_name via the given show action;
            # silently skip entities that no longer exist.
            if not entity_id:
                return
            try:
                entity = toolkit.get_action(action)(
                    {
                        'ignore_auth': True
                    }, {
                        'id': entity_id
                    })
                pkg_dict[target_key] = entity.get('display_name')
            except toolkit.ObjectNotFound:
                pass

        # curator
        _attach_display_name(
            'user_show',
            pkg_dict.get('curator_id'),
            'curator_display_name')
        # depositor
        _attach_display_name(
            'user_show',
            pkg_dict.get('creator_user_id'),
            'depositor_display_name')
        # data-container
        _attach_display_name(
            'organization_show',
            pkg_dict.get('owner_org_dest'),
            'owner_org_dest_display_name')

    return pkg_dict