def make_record(pkg_dict, repo=None):
    """Turn a package dict into a pyCSW metadata record via its XML dump.

    The package dict is loaded into a metadata object, serialized to XML and
    that XML is then parsed by pyCSW.

    :param pkg_dict: the package dict; must carry an "id" key and may carry
        a "dataset_type" key
    :param repo: the repository handed to the pyCSW parser; when falsy, the
        one returned by `get_repo()` is used
    :returns: the parsed record (with its identifier forced to the package
        id), or None when the XML dump cannot be parsed
    """
    # Only read here; the declaration is kept from the original code
    global pycsw_context

    repo = repo or get_repo()

    # Build the metadata object described by this package
    pkg_id = pkg_dict["id"]
    metadata_obj = make_metadata(pkg_dict.get("dataset_type"), pkg_dict)

    # Serialize the metadata object to XML
    serializer = xml_serializer_for(metadata_obj)
    serializer.target_namespace = site_url
    xml_dump = serializer.to_xml()

    # Let pyCSW parse the XML dump; only the first parsed record is used
    try:
        parsed = pycsw.metadata.parse_record(pycsw_context, xml_dump, repo)[0]
    except Exception as err:
        log1.error("Cannot extract metadata for %s: %s" % (pkg_id, err))
        return None

    log1.debug("Extracted metadata for dataset %s" % (pkg_id))

    # Note Once #13 is resolved (identifier linked to package.id at the
    # validation phase) the record should already carry this identifier.
    if parsed:
        parsed.identifier = pkg_id
    return parsed
def make_record(pkg_dict, repo=None):
    '''Produce a pyCSW metadata record for a dataset from its XML dump.

    :param pkg_dict: the package dict; its 'id' key is required and its
        'dataset_type' key is optional
    :param repo: the repository passed to the pyCSW parser; `get_repo()` is
        consulted when a falsy value is given
    :returns: None if parsing the XML dump fails, otherwise the parsed
        record whose identifier has been set to the package id
    '''
    # Only read here; the declaration is kept from the original code
    global pycsw_context

    repo = repo or get_repo()

    # Load the package dict into a metadata object
    pkg_id = pkg_dict['id']
    dataset_type = pkg_dict.get('dataset_type')
    md = make_metadata(dataset_type, pkg_dict)

    # Dump the metadata object as XML
    xml_writer = xml_serializer_for(md)
    xml_writer.target_namespace = site_url
    xml_text = xml_writer.to_xml()

    # Parse the XML dump with pyCSW, keeping the first record it yields
    try:
        result = pycsw.metadata.parse_record(pycsw_context, xml_text, repo)[0]
    except Exception as err:
        log1.error('Cannot extract metadata for %s: %s' % (pkg_id, err))
        return None

    log1.debug('Extracted metadata for dataset %s' % (pkg_id))

    # Note This assignment becomes redundant when #13 is resolved and the
    # identifier is linked to package.id at the validation phase.
    if result:
        result.identifier = pkg_id
    return result
def dataset_import(context, data_dict):
    '''Import a dataset from a given XML source.

    The XML is fetched from a URL (or read from a file-like object), parsed
    as metadata of the requested schema, converted to a package dict and
    passed to the `package_create` action.

    This action, depending also on the value of its flags, can raise one of:

      * actions.Invalid: received invalid input
      * actions.IdentifierConflict: a package with the same identifier
        already exists
      * actions.NameConflict: a package with the same name already exists
      * toolkit.ValidationError: validation fails while trying to create a
        package

    :param source: This is either a string representing a (local or external)
        URL or a file-like object. A string that does not start with
        `http://` or `https://` is resolved against `ckan.site_url`.
    :type source: string or file-like

    :param dtype: the dataset-type i.e. the schema of imported metadata
        (defaults to 'inspire')
    :type dtype: string

    :param owner_org: the machine-name for the owner organization
    :type owner_org: string

    :param continue_on_errors: hint on what to do when validation fails
    :type continue_on_errors: boolean

    :param rename_if_conflict: hint on what to do when a name conflict is
        encountered
    :type rename_if_conflict: boolean

    :rtype: basic info for the newly created package
    '''

    # Read parameters

    try:
        source = data_dict['source']
    except KeyError:
        raise Invalid({'source': 'The `source` parameter is required'})

    dtype = data_dict.get('dtype', 'inspire')

    try:
        owner_org = data_dict['owner_org']
    except KeyError:
        raise Invalid({
            'owner_org':
                'The `owner_org` parameter is required.\n'
                'Hint: Use `organization_list_for_user` to retrieve a valid list.'})

    allow_rename = data_dict.get('rename_if_conflict', False)
    allow_validation_errors = data_dict.get('continue_on_errors', False)

    # Fetch raw XML data

    xmldata = None

    if isinstance(source, basestring):
        # Assume source is a URL
        if not source.startswith(('http://', 'https://')):
            # A site-relative path: join it to the site URL with exactly one
            # slash, whether or not the configured URL ends with one.
            base_url = pylons.config['ckan.site_url']
            source = base_url.rstrip('/') + '/' + source.strip('/')
        url = urlparse.urlparse(source).geturl()
        r1 = requests.get(url)
        if not r1.ok:
            raise Invalid({'source': _('Cannot fetch metadata from source URL')})
        # Compare the bare media type only: the header may carry parameters
        # (e.g. "; charset=utf-8"), differ in case, or be missing altogether.
        content_type = r1.headers.get('content-type', '')
        media_type = content_type.split(';', 1)[0].strip().lower()
        if media_type not in ('application/xml', 'text/xml'):
            raise Invalid({'source': _('The source does not contain XML data')})
        xmldata = r1.content
    else:
        # Assume source is a file-like object
        try:
            xmldata = source.read()
        except Exception:
            raise Invalid({'source': _('Cannot read from source')})

    # Parse XML data as metadata of `dtype` schema

    obj = make_metadata(dtype)
    try:
        obj = xml_serializer_for(obj).loads(xmldata)
    except AssertionError:
        # Let failed assertions through untouched (a bare raise keeps the
        # original traceback)
        raise
    except Exception as ex:
        # Map all parse exceptions to Invalid
        log.info('Failed to parse XML metadata: %s', ex)
        raise Invalid({'source': _('The given XML file is malformed: %s') % (ex)})

    # Prepare package dict

    pkg_dict = {'version': '1.0'}
    pkg_dict.update(obj.deduce_fields())
    pkg_dict.update({
        'owner_org': owner_org,
        'type': 'dataset',
        'dataset_type': dtype,
        dtype: obj.to_dict(flat=False),
    })

    # If an identifier is passed, check that this is not already present.
    # Note This is no guarantee that the identifier will be available when
    # `package_create` is actually invoked.

    identifier = pkg_dict.get('id')
    if identifier and _check_package_id_exists(context, identifier):
        raise IdentifierConflict({
            'id': _('A package identified as %s already exists') % (identifier)})

    # Find and assign a machine-name for this package
    # Note We just find the 1st available name. As noted before, this is no
    # guarantee that will be available when `package_create` is invoked.

    basename = pkg_dict['name']
    max_num_probes = 10 if allow_rename else 1
    name = _find_a_package_name(context, basename, max_num_probes)
    if not name:
        raise NameConflict({
            'name': _('The package name %r is not available') % (basename)})

    pkg_dict['name'] = name
    # Carry whatever was appended to the base name over to the title; when
    # the name was kept as-is there is nothing to append (and no stray
    # trailing space should be added).
    suffix = name[len(basename):]
    if suffix:
        pkg_dict['title'] += ' ' + suffix

    # Create/Update package

    schema1, validation_errors, error_message = None, None, None

    if identifier:
        # Must override catalog-wide schema for actions in this context
        schema1 = lookup_package_plugin().create_package_schema()
        schema1['id'] = [unicode]

    ctx = _make_context(context)
    if schema1:
        ctx['schema'] = schema1

    try:
        pkg_dict = _get_action('package_create')(ctx, data_dict=pkg_dict)
    except toolkit.ValidationError as ex:
        if 'name' in ex.error_dict:
            # The name is probably taken, re-raise exception
            raise
        elif allow_validation_errors:
            # Save errors and retry with a different context
            validation_errors = ex.error_dict
            error_message = ex.message or _('The dataset contains invalid metadata')
            ctx = _make_context(context, skip_validation=True)
            if schema1:
                ctx['schema'] = schema1
            pkg_dict = _get_action('package_create')(ctx, data_dict=pkg_dict)
            log.warning('Forced to create an invalid package as %r ', name)
        else:
            raise

    assert name == pkg_dict['name']
    assert (not identifier) or (identifier == pkg_dict['id'])

    return {
        # Provide basic package fields
        'id': pkg_dict['id'],
        'name': name,
        'title': pkg_dict['title'],
        'state': pkg_dict.get('state'),
        # Provide details on validation (meaningful if allow_validation_errors)
        'validation': {
            'message': error_message,
            'errors': validation_errors,
        },
    }
def dataset_import(context, data_dict):
    '''Import a dataset from a given XML source.

    The XML is fetched from a URL (or read from a file-like object), parsed
    as metadata of the requested schema, converted to a package dict and
    passed to the `package_create` action.

    This action, depending also on the value of its flags, can raise one of:

      * actions.Invalid: received invalid input
      * actions.IdentifierConflict: a package with the same identifier
        already exists
      * actions.NameConflict: a package with the same name already exists
      * toolkit.ValidationError: validation fails while trying to create a
        package

    :param source: This is either a string representing a (local or external)
        URL or a file-like object. A string that does not start with
        `http://` or `https://` is resolved against `ckan.site_url`.
    :type source: string or file-like

    :param dtype: the dataset-type i.e. the schema of imported metadata
        (defaults to 'datacite')
    :type dtype: string

    :param owner_org: the machine-name for the owner organization
    :type owner_org: string

    :param continue_on_errors: hint on what to do when validation fails
    :type continue_on_errors: boolean

    :param rename_if_conflict: hint on what to do when a name conflict is
        encountered
    :type rename_if_conflict: boolean

    :rtype: basic info for the newly created package
    '''

    # Read parameters

    try:
        source = data_dict['source']
    except KeyError:
        raise Invalid({'source': 'The `source` parameter is required'})

    dtype = data_dict.get('dtype', 'datacite')

    try:
        owner_org = data_dict['owner_org']
    except KeyError:
        raise Invalid({
            'owner_org':
                'The `owner_org` parameter is required.\n'
                'Hint: Use `organization_list_for_user` to retrieve a valid list.'
        })

    allow_rename = data_dict.get('rename_if_conflict', False)
    allow_validation_errors = data_dict.get('continue_on_errors', False)
    log.debug('dtype: %s, source %s, source type: %s', dtype, source,
              type(source))

    # Fetch raw XML data

    xmldata = None

    if isinstance(source, basestring):
        # Assume source is a URL
        if not source.startswith(('http://', 'https://')):
            # A site-relative path: join it to the site URL with exactly one
            # slash, whether or not the configured URL ends with one.
            base_url = pylons.config['ckan.site_url']
            source = base_url.rstrip('/') + '/' + source.strip('/')
        url = urlparse.urlparse(source).geturl()
        r1 = requests.get(url)
        if not r1.ok:
            raise Invalid(
                {'source': _('Cannot fetch metadata from source URL')})
        # Compare the bare media type only: the header may carry parameters
        # (e.g. "; charset=utf-8"), differ in case, or be missing altogether.
        content_type = r1.headers.get('content-type', '')
        media_type = content_type.split(';', 1)[0].strip().lower()
        if media_type not in ('application/xml', 'text/xml'):
            raise Invalid(
                {'source': _('The source does not contain XML data')})
        xmldata = r1.content
    else:
        # Assume source is a file-like object
        try:
            log.debug('source is %s', source)
            xmldata = source.read()
            log.debug('xmldata is %s', xmldata)
        except Exception:
            raise Invalid({'source': _('Cannot read from source')})

    # Parse XML data as metadata of `dtype` schema

    obj = make_metadata(dtype)
    log.debug('obj is: %s', obj)
    try:
        obj = xml_serializer_for(obj).loads(xmldata)
    except AssertionError:
        # Let failed assertions through untouched (a bare raise keeps the
        # original traceback)
        raise
    except Exception as ex:
        # Map all parse exceptions to Invalid
        log.info('Failed to parse XML metadata: %s', ex)
        raise Invalid(
            {'source': _('The given XML file is malformed: %s') % (ex)})

    # Prepare package dict

    log.debug('updated obj is: %s', obj)
    pkg_dict = {'version': '1.0'}
    pkg_dict.update(obj.deduce_fields())
    pkg_dict.update({
        'owner_org': owner_org,
        'type': 'dataset',
        'dataset_type': dtype,
        dtype: obj.to_dict(flat=False),
    })
    log.debug('pkg_dict: %s', pkg_dict)

    # If an identifier is passed, check that this is not already present.
    # Note This is no guarantee that the identifier will be available when
    # `package_create` is actually invoked.

    identifier = pkg_dict.get('id')
    if identifier and _check_package_id_exists(context, identifier):
        raise IdentifierConflict({
            'id': _('A package identified as %s already exists') % (identifier)
        })

    # Find and assign a machine-name for this package
    # Note We just find the 1st available name. As noted before, this is no
    # guarantee that will be available when `package_create` is invoked.

    basename = pkg_dict['name']
    max_num_probes = 10 if allow_rename else 1
    name = _find_a_package_name(context, basename, max_num_probes)
    if not name:
        raise NameConflict(
            {'name': _('The package name %r is not available') % (basename)})

    pkg_dict['name'] = name
    # Carry whatever was appended to the base name over to the title; when
    # the name was kept as-is there is nothing to append (and no stray
    # trailing space should be added).
    suffix = name[len(basename):]
    if suffix:
        pkg_dict['title'] += ' ' + suffix

    # Add core fields description and subject.
    # The nested metadata is stored under the `dtype` key, so a 'datacite'
    # entry is only present for datacite imports; other schemas are skipped
    # here instead of failing with a KeyError.
    if 'datacite' in pkg_dict:
        pkg_dict['notes'] = pkg_dict['datacite']['abstract']
        pkg_dict['closed_tag'] = pkg_dict['datacite']['subject_closed']

    # Create/Update package

    schema1, validation_errors, error_message = None, None, None

    if identifier:
        # Must override catalog-wide schema for actions in this context
        schema1 = lookup_package_plugin().create_package_schema()
        schema1['id'] = [unicode]

    ctx = _make_context(context)
    if schema1:
        ctx['schema'] = schema1

    try:
        pkg_dict = _get_action('package_create')(ctx, data_dict=pkg_dict)
    except toolkit.ValidationError as ex:
        if 'name' in ex.error_dict:
            # The name is probably taken, re-raise exception
            raise
        elif allow_validation_errors:
            # Save errors and retry with a different context
            validation_errors = ex.error_dict
            error_message = ex.message or _(
                'The dataset contains invalid metadata')
            ctx = _make_context(context, skip_validation=True)
            if schema1:
                ctx['schema'] = schema1
            pkg_dict = _get_action('package_create')(ctx, data_dict=pkg_dict)
            log.warning('Forced to create an invalid package as %r ', name)
        else:
            raise

    assert name == pkg_dict['name']
    assert (not identifier) or (identifier == pkg_dict['id'])

    return {
        # Provide basic package fields
        'id': pkg_dict['id'],
        'name': name,
        'title': pkg_dict['title'],
        'state': pkg_dict.get('state'),
        # Provide details on validation (meaningful if allow_validation_errors)
        'validation': {
            'message': error_message,
            'errors': validation_errors,
        },
    }