def make_record(pkg_dict, repo=None):
    """Build and return a metadata record from a dataset's XML dump.

    The package dict is loaded into a metadata object (via `make_metadata`),
    serialized to XML, and that XML is parsed by
    `pycsw.metadata.parse_record`.

    :param pkg_dict: the package dict; must provide `id` and may provide
        `dataset_type`
    :type pkg_dict: dict

    :param repo: repository object passed through to the pyCSW parser. If
        omitted (or falsy), the result of `get_repo()` is used.

    :returns: None on failure, or the loaded metadata record (with its
        identifier set to the package id) on success.
    """

    if not repo:
        repo = get_repo()

    # Load pkg-dict into a metadata object

    pkg_id = pkg_dict["id"]
    pkg_dtype = pkg_dict.get("dataset_type")
    obj = make_metadata(pkg_dtype, pkg_dict)

    # Generate an XML dump for current pkg-dict

    xser = xml_serializer_for(obj)
    xser.target_namespace = site_url
    xmldata = xser.to_xml()

    # Parse XML dump into a pyCSW metadata record. Any failure (including an
    # IndexError from taking the first parsed item) is logged and reported to
    # the caller as None rather than propagated.

    try:
        record = pycsw.metadata.parse_record(pycsw_context, xmldata, repo)[0]
    except Exception as err:
        log1.error("Cannot extract metadata for %s: %s", pkg_id, err)
        return None

    log1.debug("Extracted metadata for dataset %s", pkg_id)

    # Note The following should always hold true when #13 is resolved, and
    # identifier is linked to package.id at validation phase.
    # assert record.identifier == pkg_id
    if record:
        record.identifier = pkg_id

    return record
Example #2
0
def make_record(pkg_dict, repo=None):
    '''Build and return a metadata record from a dataset's XML dump.

    The package dict is turned into a metadata object by `make_metadata`,
    dumped as XML and finally parsed with `pycsw.metadata.parse_record`.

    :param pkg_dict: the package dict; `id` is required, `dataset_type` is
        optional
    :type pkg_dict: dict

    :param repo: repository object forwarded to the pyCSW parser. When it is
        omitted (or falsy), `get_repo()` supplies one.

    :returns: None on failure, or a loaded metadata record on success. The
        record's identifier is overwritten with the package id.
    '''

    if not repo:
        repo = get_repo()

    # Load pkg-dict into a metadata object

    pkg_id = pkg_dict['id']
    pkg_dtype = pkg_dict.get('dataset_type')
    obj = make_metadata(pkg_dtype, pkg_dict)

    # Generate an XML dump for current pkg-dict

    xser = xml_serializer_for(obj)
    xser.target_namespace = site_url
    xmldata = xser.to_xml()

    # Parse XML dump into a pyCSW metadata record. Failures are logged and
    # leave `record` as None; they are deliberately not propagated.

    record = None
    try:
        record = pycsw.metadata.parse_record(pycsw_context, xmldata, repo)[0]
    except Exception as err:
        # Pass values as logging arguments so that formatting is deferred
        # (and cannot itself fail on an unusual package id)
        log1.error('Cannot extract metadata for %s: %s', pkg_id, err)
    else:
        log1.debug('Extracted metadata for dataset %s', pkg_id)

    # Note The following should always hold true when #13 is resolved, and
    # identifier is linked to package.id at validation phase.
    # assert record.identifier == pkg_id
    if record:
        record.identifier = pkg_id

    return record
def dataset_import(context, data_dict):
    '''Import a dataset from a given XML source.

    The XML is fetched (or read), parsed as metadata of the `dtype` schema,
    converted to a package dict and handed to the `package_create` action.

    This action, depending also on the value of its flags, can raise one of:

      * actions.Invalid: received invalid input
      * actions.IdentifierConflict: a package with the same identifier already exists
      * actions.NameConflict: a package with the same name already exists
      * toolkit.ValidationError: validation fails while trying to create a package

    :param source: This is either a string representing a (local or external) URL
        or a file-like object. A string that is not an absolute http(s) URL is
        resolved against the configured `ckan.site_url`.
    :type source: string or file-like

    :param dtype: the dataset-type i.e. the schema of imported metadata
        (defaults to 'inspire')
    :type dtype: string

    :param owner_org: the machine-name for the owner organization
    :type owner_org: string

    :param continue_on_errors: hint on what to do when validation fails. If
        true, a failed creation (for any field but the name) is retried with
        a context built with `skip_validation=True`.
    :type continue_on_errors: boolean

    :param rename_if_conflict: hint on what to do when a name conflict is
        encountered. If true, 10 name probes are requested instead of 1.
    :type rename_if_conflict: boolean

    :returns: basic info for the newly created package (`id`, `name`,
        `title`, `state`) plus a `validation` entry holding the message and
        errors of a validation that was skipped (both None otherwise)
    :rtype: dict
    '''

    # Read parameters

    try:
        source = data_dict['source']
    except KeyError:
        raise Invalid({'source': 'The `source` parameter is required'})

    dtype = data_dict.get('dtype', 'inspire')

    try:
        owner_org = data_dict['owner_org']
    except KeyError:
        raise Invalid({'owner_org':
            'The `owner_org` parameter is required.\n'
            'Hint: Use `organization_list_for_user` to retrieve a valid list.'})

    allow_rename = data_dict.get('rename_if_conflict', False)
    allow_validation_errors = data_dict.get('continue_on_errors', False)

    # Fetch raw XML data

    xmldata = None

    if isinstance(source, basestring):
        # Assume source is a URL. Anything that is not an absolute HTTP(S)
        # URL is taken as a path below the site URL; exactly one slash is
        # placed between the two parts.
        if not source.startswith(('http://', 'https://')):
            base_url = pylons.config['ckan.site_url']
            source = base_url.rstrip('/') + '/' + source.strip('/')
        source = urlparse.urlparse(source)
        r1 = requests.get(source.geturl())
        if not r1.ok:
            raise Invalid({'source': _('Cannot fetch metadata from source URL')})
        # Compare the bare media type only: the header may carry parameters
        # (e.g. "; charset=utf-8") or be missing altogether.
        content_type = r1.headers.get('content-type') or ''
        media_type = content_type.split(';', 1)[0].strip().lower()
        if media_type not in ('application/xml', 'text/xml'):
            raise Invalid({'source': _('The source does not contain XML data')})
        xmldata = r1.content
    else:
        # Assume source is a file-like object
        try:
            xmldata = source.read()
        except Exception:
            raise Invalid({'source': _('Cannot read from source')})

    # Parse XML data as metadata of `dtype` schema

    obj = make_metadata(dtype)
    try:
        obj = xml_serializer_for(obj).loads(xmldata)
    except AssertionError:
        # Assertion failures are not mapped to Invalid; a bare `raise`
        # re-raises them with the original traceback.
        raise
    except Exception as ex:
        # Map all parse exceptions to Invalid
        log.info('Failed to parse XML metadata: %s', ex)
        raise Invalid({'source': _('The given XML file is malformed: %s') % (ex,)})

    # Prepare package dict

    pkg_dict = {'version': '1.0'}
    pkg_dict.update(obj.deduce_fields())
    pkg_dict.update({
        'owner_org': owner_org,
        'type': 'dataset',
        'dataset_type': dtype,
        dtype: obj.to_dict(flat=False),
    })

    # If an identifier is passed, check that this is not already present.
    # Note This is no guarantee that the identifier will be available when
    # `package_create` is actually invoked.

    identifier = pkg_dict.get('id')
    if identifier and _check_package_id_exists(context, identifier):
        raise IdentifierConflict({
            'id': _('A package identified as %s already exists') % (identifier,)})

    # Find and assign a machine-name for this package
    # Note We just find the 1st available name. As noted before, this is no
    # guarantee that will be available when `package_create` is invoked.

    basename = pkg_dict['name']
    max_num_probes = 10 if allow_rename else 1
    name = _find_a_package_name(context, basename, max_num_probes)
    if not name:
        raise NameConflict({
            'name': _('The package name %r is not available') % (basename,)})

    pkg_dict['name'] = name
    # Carry the part appended to the base name over to the title. Nothing is
    # appended when the base name was used as-is, so the title does not end
    # up with a trailing space.
    name_suffix = name[len(basename):]
    if name_suffix:
        pkg_dict['title'] += ' ' + name_suffix

    # Create/Update package

    schema1, validation_errors, error_message = None, None, None

    if identifier:
        # Must override catalog-wide schema for actions in this context
        schema1 = lookup_package_plugin().create_package_schema()
        schema1['id'] = [unicode]

    ctx = _make_context(context)
    if schema1:
        ctx['schema'] = schema1

    try:
        pkg_dict = _get_action('package_create')(ctx, data_dict=pkg_dict)
    except toolkit.ValidationError as ex:
        if 'name' in ex.error_dict:
            # The name is probably taken, re-raise exception
            raise
        elif allow_validation_errors:
            # Save errors and retry with a different context
            validation_errors = ex.error_dict
            error_message = ex.message or _('The dataset contains invalid metadata')
            ctx = _make_context(context, skip_validation=True)
            if schema1:
                ctx['schema'] = schema1
            pkg_dict = _get_action('package_create')(ctx, data_dict=pkg_dict)
            log.warning('Forced to create an invalid package as %r ', name)
        else:
            raise

    # Sanity checks on what `package_create` handed back
    assert name == pkg_dict['name']
    assert (not identifier) or (identifier == pkg_dict['id'])

    return {
        # Provide basic package fields
        'id': pkg_dict['id'],
        'name': name,
        'title': pkg_dict['title'],
        'state': pkg_dict.get('state'),
        # Provide details on validation (meaningful if allow_validation_errors)
        'validation': {
            'message': error_message,
            'errors': validation_errors,
        },
    }
Example #4
0
def dataset_import(context, data_dict):
    '''Import a dataset from a given XML source.

    The XML is fetched (or read), parsed as metadata of the `dtype` schema,
    converted to a package dict and handed to the `package_create` action.

    This action, depending also on the value of its flags, can raise one of:

      * actions.Invalid: received invalid input
      * actions.IdentifierConflict: a package with the same identifier already exists
      * actions.NameConflict: a package with the same name already exists
      * toolkit.ValidationError: validation fails while trying to create a package

    :param source: This is either a string representing a (local or external) URL
        or a file-like object. A string that is not an absolute http(s) URL is
        resolved against the configured `ckan.site_url`.
    :type source: string or file-like

    :param dtype: the dataset-type i.e. the schema of imported metadata
        (defaults to 'datacite')
    :type dtype: string

    :param owner_org: the machine-name for the owner organization
    :type owner_org: string

    :param continue_on_errors: hint on what to do when validation fails. If
        true, a failed creation (for any field but the name) is retried with
        a context built with `skip_validation=True`.
    :type continue_on_errors: boolean

    :param rename_if_conflict: hint on what to do when a name conflict is
        encountered. If true, 10 name probes are requested instead of 1.
    :type rename_if_conflict: boolean

    :returns: basic info for the newly created package (`id`, `name`,
        `title`, `state`) plus a `validation` entry holding the message and
        errors of a validation that was skipped (both None otherwise)
    :rtype: dict
    '''

    # Read parameters
    try:
        source = data_dict['source']
    except KeyError:
        raise Invalid({'source': 'The `source` parameter is required'})

    dtype = data_dict.get('dtype', 'datacite')

    try:
        owner_org = data_dict['owner_org']
    except KeyError:
        raise Invalid({
            'owner_org':
            'The `owner_org` parameter is required.\n'
            'Hint: Use `organization_list_for_user` to retrieve a valid list.'
        })

    allow_rename = data_dict.get('rename_if_conflict', False)
    allow_validation_errors = data_dict.get('continue_on_errors', False)
    log.debug('dtype: %s, source %s, source type: %s', dtype, source,
              type(source))
    # Fetch raw XML data

    xmldata = None

    if isinstance(source, basestring):
        # Assume source is a URL. Anything that is not an absolute HTTP(S)
        # URL is taken as a path below the site URL; exactly one slash is
        # placed between the two parts.
        if not source.startswith(('http://', 'https://')):
            base_url = pylons.config['ckan.site_url']
            source = base_url.rstrip('/') + '/' + source.strip('/')
        source = urlparse.urlparse(source)
        r1 = requests.get(source.geturl())
        if not r1.ok:
            raise Invalid(
                {'source': _('Cannot fetch metadata from source URL')})
        # Compare the bare media type only: the header may carry parameters
        # (e.g. "; charset=utf-8") or be missing altogether.
        content_type = r1.headers.get('content-type') or ''
        media_type = content_type.split(';', 1)[0].strip().lower()
        if media_type not in ('application/xml', 'text/xml'):
            raise Invalid(
                {'source': _('The source does not contain XML data')})
        xmldata = r1.content
    else:
        # Assume source is a file-like object. Only the read itself is
        # guarded, so an unrelated error is not reported as a read failure.
        log.debug('source is %s', source)
        try:
            xmldata = source.read()
        except Exception:
            raise Invalid({'source': _('Cannot read from source')})
        log.debug('xmldata is %s', xmldata)

    # Parse XML data as metadata of `dtype` schema

    obj = make_metadata(dtype)
    log.debug('obj is: %s', obj)
    try:
        obj = xml_serializer_for(obj).loads(xmldata)
    except AssertionError:
        # Assertion failures are not mapped to Invalid; a bare `raise`
        # re-raises them with the original traceback.
        raise
    except Exception as ex:
        # Map all parse exceptions to Invalid
        log.info('Failed to parse XML metadata: %s', ex)
        raise Invalid(
            {'source': _('The given XML file is malformed: %s') % (ex,)})

    # Prepare package dict
    log.debug('updated obj is: %s', obj)

    pkg_dict = {'version': '1.0'}
    pkg_dict.update(obj.deduce_fields())
    pkg_dict.update({
        'owner_org': owner_org,
        'type': 'dataset',
        'dataset_type': dtype,
        dtype: obj.to_dict(flat=False),
    })
    log.debug('pkg_dict: %s', pkg_dict)
    # If an identifier is passed, check that this is not already present.
    # Note This is no guarantee that the identifier will be available when
    # `package_create` is actually invoked.

    identifier = pkg_dict.get('id')
    if identifier and _check_package_id_exists(context, identifier):
        raise IdentifierConflict({
            'id':
            _('A package identified as %s already exists') % (identifier,)
        })

    # Find and assign a machine-name for this package
    # Note We just find the 1st available name. As noted before, this is no
    # guarantee that will be available when `package_create` is invoked.

    basename = pkg_dict['name']
    max_num_probes = 10 if allow_rename else 1
    name = _find_a_package_name(context, basename, max_num_probes)
    if not name:
        raise NameConflict(
            {'name': _('The package name %r is not available') % (basename,)})

    pkg_dict['name'] = name
    # Carry the part appended to the base name over to the title. Nothing is
    # appended when the base name was used as-is, so the title does not end
    # up with a trailing space.
    name_suffix = name[len(basename):]
    if name_suffix:
        pkg_dict['title'] += ' ' + name_suffix

    # Add core fields description and subject. They are read from the
    # 'datacite' entry of the package dict. The parsed metadata is stored
    # under the `dtype` key, so that entry is normally missing when another
    # schema is imported; in that case the core fields are left untouched
    # instead of failing with a KeyError.
    datacite_md = pkg_dict.get('datacite')
    if datacite_md is not None:
        pkg_dict['notes'] = datacite_md['abstract']
        pkg_dict['closed_tag'] = datacite_md['subject_closed']

    # Create/Update package

    schema1, validation_errors, error_message = None, None, None

    if identifier:
        # Must override catalog-wide schema for actions in this context
        schema1 = lookup_package_plugin().create_package_schema()
        schema1['id'] = [unicode]

    ctx = _make_context(context)
    if schema1:
        ctx['schema'] = schema1

    try:
        pkg_dict = _get_action('package_create')(ctx, data_dict=pkg_dict)
    except toolkit.ValidationError as ex:
        if 'name' in ex.error_dict:
            # The name is probably taken, re-raise exception
            raise
        elif allow_validation_errors:
            # Save errors and retry with a different context
            validation_errors = ex.error_dict
            error_message = ex.message or _(
                'The dataset contains invalid metadata')
            ctx = _make_context(context, skip_validation=True)
            if schema1:
                ctx['schema'] = schema1
            pkg_dict = _get_action('package_create')(ctx, data_dict=pkg_dict)
            log.warning('Forced to create an invalid package as %r ', name)
        else:
            raise

    # Sanity checks on what `package_create` handed back
    assert name == pkg_dict['name']
    assert (not identifier) or (identifier == pkg_dict['id'])

    return {
        # Provide basic package fields
        'id': pkg_dict['id'],
        'name': name,
        'title': pkg_dict['title'],
        'state': pkg_dict.get('state'),
        # Provide details on validation (meaningful if allow_validation_errors)
        'validation': {
            'message': error_message,
            'errors': validation_errors,
        },
    }