def test_basic_query(self):
        """Exercise the bbox search before and after deleting a package.

        A package is created from ``self.package_fixture_data`` and the
        search is queried three times: with the offset built without
        arguments (one hit, the new package), with the bbox
        (-10, 10, -20, 20) (no hits), and once more without arguments
        after the package has been deleted (no hits).
        """
        def search(*bbox):
            # Build the query URL, require an HTTP 200 and decode the body.
            url = self._offset_with_bbox(*bbox)
            return self.data_from_res(self.app.get(url, status=200))

        create_schema = default_create_package_schema()
        context = {
            'model': model,
            'session': Session,
            'user': '******',
            'extras_as_string': True,
            'schema': create_schema,
            'api_version': 2,
        }
        package_create(context, self.package_fixture_data)
        # The id of the new package is read back from the context.
        package_id = context.get('id')

        # Point inside bbox
        inside = search()
        assert inside['count'] == 1
        assert inside['results'][0] == package_id

        # Point outside bbox
        outside = search(-10, 10, -20, 20)
        assert outside['count'] == 0
        assert outside['results'] == []

        # Delete the package and ensure it does not come up on
        # search results
        package_delete(context, {'id': package_id})
        after_delete = search()
        assert after_delete['count'] == 0
        assert after_delete['results'] == []
Example #2
0
def create_package_schema():
    """
    Return the dataset-creation schema with our custom form fields added.

    The default create schema is passed through ``_schema_update`` in
    'create' mode and then returned.
    """
    create_schema = default_create_package_schema()
    _schema_update(create_schema, 'create')
    return create_schema
Example #3
0
 def setup(self):
     """Create the user, organization and project used by the tests.

     Runs the parent ``setup`` first, then creates a factory user, an
     organization named 'organization' and a package of type 'project'
     named 'test' that is owned by that organization. The results are
     stored on ``self.user``, ``self.organization`` and ``self.project``.
     """
     super(TestProjectBase, self).setup()
     self.user = factories.User()
     context = {
         'model': model,
         'session': model.Session,
         'user': self.user['name'],
     }
     # Each action gets its own copy of the base context, carrying the
     # schema prepared for it.
     org_create_context = context.copy()
     org_create_context['schema'] = schema.default_group_schema()
     self.organization = helpers.call_action(
         'organization_create',
         # The bare ``context`` used to be passed here, which left
         # ``org_create_context`` unused and handed the shared base
         # context to the action before it was copied for the project.
         context=org_create_context,
         id='1',
         name='organization'
     )
     project_context = context.copy()
     project_context['schema'] = schema.default_create_package_schema()
     self.project = helpers.call_action(
         'package_create',
         context=project_context,
         type='project',
         id='1',
         name='test',
         title='Test',
         owner_org=self.organization['name'],
     )
Example #4
0
 def setup(self):
     """Create the user, organization and project used by the tests.

     Runs the parent ``setup`` first, then creates a factory user, an
     organization named 'organization' and a package of type 'project'
     named 'test' that is owned by that organization. The results are
     stored on ``self.user``, ``self.organization`` and ``self.project``.
     """
     super(TestProjectBase, self).setup()
     self.user = factories.User()
     context = {
         'model': model,
         'session': model.Session,
         'user': self.user['name'],
     }
     # Each action gets its own copy of the base context, carrying the
     # schema prepared for it.
     org_create_context = context.copy()
     org_create_context['schema'] = schema.default_group_schema()
     # The bare ``context`` used to be passed here, which left
     # ``org_create_context`` unused and handed the shared base context
     # to the action before it was copied for the project.
     self.organization = helpers.call_action('organization_create',
                                             context=org_create_context,
                                             id='1',
                                             name='organization')
     project_context = context.copy()
     project_context['schema'] = schema.default_create_package_schema()
     self.project = helpers.call_action(
         'package_create',
         context=project_context,
         type='project',
         id='1',
         name='test',
         title='Test',
         owner_org=self.organization['name'],
     )
Example #5
0
    def form_to_db_schema_options(self, options=None):
        """Pick the schema used to validate a dataset being saved.

        ``options`` is a dict that may carry the keys 'context', 'api'
        and 'type'. The schema is chosen in this order:

        1. a truthy ``schema`` entry in the context is returned as is;
        2. for API calls (``options['api']`` truthy) the default create
           schema is returned when ``options['type'] == 'create'``,
           otherwise the default update schema;
        3. otherwise the form schema from ``self.form_to_db_schema()``,
           loosened for sysadmins editing a UKLP dataset.

        ``options`` defaults to ``None`` (treated as an empty dict) so
        that no mutable default object is shared between calls.
        """
        if options is None:
            options = {}
        context = options.get('context', {})
        schema = context.get('schema', None)
        if schema:
            return schema

        if options.get('api'):
            if options.get('type') == 'create':
                return default_schema.default_create_package_schema()
            return default_schema.default_update_package_schema()

        schema = self.form_to_db_schema()

        # Sysadmins can save UKLP datasets with looser validation
        # constraints.  This is because UKLP datasets are created using
        # a custom schema passed in from the harvester.  However, when it
        # comes to re-saving the dataset via the dataset form, there are
        # some validation requirements we need to drop.  That's what this
        # section of code does.
        pkg = context.get('package')
        user = context.get('user', '')
        if Authorizer().is_sysadmin(unicode(user)) and \
           pkg and pkg.extras.get('UKLP', 'False') == 'True':
            schema.update(self._uklp_sysadmin_schema_updates)

        return schema
Example #6
0
    def test_1_basic(self):
        """Search with a bbox extra and expect exactly one dataset back.

        Two packages are created from the two fixtures, then
        ``package_search`` is called through the action API with
        ``ext_bbox`` set to '10,10,40,40'. The single result must be
        'test-spatial-dataset-search-point-2'.
        """
        create_schema = default_create_package_schema()
        context = dict(
            model=model,
            session=Session,
            user='******',
            extras_as_string=True,
            schema=create_schema,
            api_version=2,
        )
        package_create(context, self.package_fixture_data_1)
        # Drop the 'package' entry before the context is reused for the
        # second create.
        del context['package']
        package_create(context, self.package_fixture_data_2)

        bbox = '%s,%s,%s,%s' % (10, 10, 40, 40)
        search_params = {
            'q': 'test',
            'facet.field': ('groups', 'tags', 'res_format', 'license'),
            'rows': 20,
            'start': 0,
            'extras': {
                'ext_bbox': bbox
            }
        }
        postparams = '%s=1' % json.dumps(search_params)
        response = self.app.post('/api/action/package_search',
                                 params=postparams)
        body = json.loads(response.body)
        result = body['result']

        # Only one dataset returned
        assert_equal(body['success'], True)
        assert_equal(result['count'], 1)
        first_hit = result['results'][0]
        assert_equal(first_hit['name'],
                     'test-spatial-dataset-search-point-2')
Example #7
0
 def create_package_schema(self):
     """Return the default create schema plus a ``vocab_tags`` field.

     ``vocab_tags`` may be missing; when present it is converted to tags
     of the vocabulary named by ``TEST_VOCAB_NAME``.
     """
     create_schema = default_create_package_schema()
     create_schema['vocab_tags'] = [
         ignore_missing,
         convert_to_tags(TEST_VOCAB_NAME),
     ]
     return create_schema
Example #8
0
    def form_to_db_schema_options(self, options=None):
        """Pick the schema used to validate a dataset being saved.

        ``options`` is a dict that may carry the keys 'context', 'api'
        and 'type'. The schema is chosen in this order:

        1. a truthy ``schema`` entry in the context is returned as is;
        2. for API calls (``options['api']`` truthy) the default create
           schema is returned when ``options['type'] == 'create'``,
           otherwise the default update schema;
        3. otherwise the form schema from ``self.form_to_db_schema()``,
           adjusted for sysadmins editing UKLP or ONSHUB datasets.

        ``options`` defaults to ``None`` (treated as an empty dict) so
        that no mutable default object is shared between calls.
        """
        if options is None:
            options = {}
        context = options.get('context', {})
        schema = context.get('schema', None)
        if schema:
            return schema

        if options.get('api'):
            if options.get('type') == 'create':
                return default_schema.default_create_package_schema()
            return default_schema.default_update_package_schema()

        schema = self.form_to_db_schema()
        # Sysadmins can save UKLP datasets with looser validation
        # constraints.  This is because UKLP datasets are created using
        # a custom schema passed in from the harvester.  However, when it
        # comes to re-saving the dataset via the dataset form, there are
        # some validation requirements we need to drop.  That's what this
        # section of code does.
        pkg = context.get('package')
        user = context.get('user', '')
        # Both adjustments below need the same sysadmin check, so it is
        # looked up once instead of once per condition.
        is_sysadmin = Authorizer().is_sysadmin(unicode(user))
        if is_sysadmin and pkg and \
           pkg.extras.get('UKLP', 'False') == 'True':
            schema.update(self._uklp_sysadmin_schema_updates)
        if is_sysadmin and pkg and \
           pkg.extras.get('external_reference') == 'ONSHUB':
            self._ons_sysadmin_schema_updates(schema)
        return schema
Example #9
0
 def create_package_schema(self):
     """Return the default create schema with both relationship fields.

     ``relationships_as_object`` and ``relationships_as_subject`` each
     get their own default relationship schema.
     """
     create_schema = default_create_package_schema()
     as_object = default_relationship_schema()
     as_subject = default_relationship_schema()
     create_schema['relationships_as_object'] = as_object
     create_schema['relationships_as_subject'] = as_subject
     return create_schema
Example #10
0
def datajson_create(context, data_dict):
    """Create a package from a data.json entry and return the action result.

    The entry is converted with ``create_data_dict``. The package name is
    the slugified title cut to 80 characters; if a package with that name
    already exists, the lower-cased package id is appended after a dash.
    When the owner organization cannot be found it is created first, with
    an ``organization_type`` extra of "Federal Government". The package is
    then created with the default create schema, a ``not_empty`` validator
    on ``id`` and ``return_id_only`` set in the context.
    """
    model = context['model']
    package = create_data_dict(data_dict)
    owner_org = model.Group.get(package['owner_org'])
    org_title = package.pop('owner_name', None)

    slug = _slugify(package['title'])[:80]
    package['name'] = slug
    if model.Package.get(slug):
        # Name already taken: make it unique with the package id.
        package['name'] = slug + '-' + package['id'].lower()

    if not owner_org:
        create_org = p.toolkit.get_action('organization_create')
        org_extras = [{
            'key': 'organization_type',
            'value': "Federal Government"
        }]
        create_org(context, {
            'name': package['owner_org'],
            'title': org_title,
            'extras': org_extras,
        })

    create_schema = context['schema'] = schema.default_create_package_schema()
    create_schema['id'] = [p.toolkit.get_validator('not_empty')]
    context['return_id_only'] = True
    create_package = p.toolkit.get_action('package_create')
    return create_package(context, package)
Example #11
0
def create_package_schema():
    """
    Return the dataset-creation schema with our custom form fields added.

    The default create schema is passed through ``_schema_update`` in
    'create' mode and then returned.
    """
    create_schema = default_create_package_schema()
    _schema_update(create_schema, 'create')
    return create_schema
Example #12
0
 def create_package(cls, **package_dict):
     """Create a package from the keyword fields and return its id.

     The id is read from the context after ``package_create`` has run;
     ``None`` is returned if the context holds no 'id' entry.
     """
     context = dict(
         model=model,
         session=model.Session,
         user='******',
         extras_as_string=True,
         schema=default_create_package_schema(),
         api_version=2,
     )
     package_create(context, package_dict)
     return context.get('id')
 def create_package(cls, **package_dict):
     """Create a package from the keyword fields and return its id.

     The id is read from the context after ``package_create`` has run;
     ``None`` is returned if the context holds no 'id' entry.
     """
     context = dict(
         model=model,
         session=model.Session,
         user='******',
         extras_as_string=True,
         schema=default_create_package_schema(),
         api_version=2,
     )
     package_create(context, package_dict)
     return context.get('id')
def create_package_schema():
    """Return the create schema with the extra name and title validators.

    The default create schema is first passed through ``_modify_schema``.
    Then one validator is appended to ``name`` and two to ``title``, in
    the order listed below.
    """
    create_schema = default_create_package_schema()
    _modify_schema(create_schema)

    create_schema['name'].append(no_pending_dataset_with_same_name)

    title_validators = create_schema['title']
    for validator in (unique_title_within_organization,
                      no_pending_dataset_with_same_title_in_same_org):
        title_validators.append(validator)

    return create_schema
Example #15
0
def project_create_schema():
    """Return the schema used to validate a project on creation.

    Starts from the default create schema, replaces the validators for
    ``id``, ``title``, ``name``, ``ona_api_key`` and ``__after``, and
    finally applies everything from ``project_schema()`` on top.
    """
    create_schema = default_create_package_schema()

    create_schema['id'] = [if_empty_generate_uuid]
    create_schema['title'] = [
        not_missing,
        unicode,
        project_title_blacklist_char_validator,
    ]
    create_schema['name'] = [
        ignore_missing,
        unicode,
        slugify_title_to_name,
        project_name_validator,
    ]
    create_schema['ona_api_key'] = [ignore_missing, unicode]
    create_schema['__after'] = [create_cadasta_project]

    create_schema.update(project_schema())
    return create_schema
Example #16
0
def package_create_validate(context, data_dict):
    """Validate ``data_dict`` for package creation without saving it.

    Uses the schema from the context, or the default create schema when
    the context has none. Access is checked for 'package_create' before
    validating.

    Returns the validated data. Raises ``ValidationError`` (after rolling
    the session back) when validation reports errors.
    """
    model = context['model']
    user = context['user']  # not used further; the key must be present
    schema = context.get('schema') or default_create_package_schema()

    session = model.Session
    session.remove()
    session()._context = context

    check_access('package_create', context, data_dict)

    validated, errors = validate(data_dict, schema, context)
    if not errors:
        return validated

    model.Session.rollback()
    raise ValidationError(errors, package_error_summary(errors))
Example #17
0
def package_create_validate(context, data_dict):
    """Validate ``data_dict`` for package creation without saving it.

    Uses the schema from the context, or the default create schema when
    the context has none. Access is checked for 'package_create' before
    validating.

    Returns the validated data. Raises ``ValidationError`` (after rolling
    the session back) when validation reports errors.
    """
    model = context['model']
    user = context['user']  # not used further; the key must be present
    schema = context.get('schema') or default_create_package_schema()

    session = model.Session
    session.remove()
    session()._context = context

    check_access('package_create', context, data_dict)

    validated, errors = validate(data_dict, schema, context)
    if not errors:
        return validated

    model.Session.rollback()
    raise ValidationError(errors, package_error_summary(errors))
def doi_create(context, data_dict):
    """Import one DOI record as a new package.

    ``data_dict`` is modified in place. Three extras are appended (a SHA-1
    hash of the JSON-dumped input, the metadata source "doi" and an import
    marker). The owner organization is looked up through ``ORG_MAPPING``;
    if it does not exist a failure line is printed and nothing is created.
    Otherwise the package gets a slug name (made unique with the current
    Unix time if already taken), its resources lose their
    ``resource_group_id``, ``revision_id`` and ``id`` keys, a
    ``HarvestObject`` is saved and referenced from the extras, and
    ``package_create`` is called. Always returns ``None``.
    """
    model = context['model']
    package = data_dict
    source_hash = hashlib.sha1(
        json.dumps(data_dict, sort_keys=True)).hexdigest()

    extras = package["extras"]
    extras.append({"key": "source_hash", "value": source_hash})
    extras.append({"key": "metadata-source", "value": "doi"})
    extras.append({"key": "source_doi_import_identifier", "value": True})

    mapped_org = ORG_MAPPING.get(package['organization']['name'])
    owner_org = model.Group.get(mapped_org)
    if not owner_org:
        timestamp = str(datetime.datetime.now())
        print(timestamp + ' Fail to import doi id ' + package['id'] +
              '. Organization ' + package['organization']['name'] +
              ' does not exist.')
        return

    package['owner_org'] = owner_org.name
    # The owner name is removed from the package; its value is not needed.
    package.pop('owner_name', None)

    slug = _slugify(package['title'])[:80]
    package['name'] = slug
    if model.Package.get(slug):
        package['name'] = slug + '-' + str(int(time.time()))

    # Strip the identifiers from every resource.
    cleaned = []
    for resource in package['resources']:
        for field in ('resource_group_id', 'revision_id', 'id'):
            resource.pop(field, None)
        cleaned.append(resource)
    package['resources'] = cleaned

    harvest_object = HarvestObject(guid=uuid.uuid4().hex,
                                   job=context['harvest_job'],
                                   content=context['harvestobj'])
    harvest_object.save()
    package["extras"].append(
        {"key": "harvest_object_id", "value": harvest_object.id})

    create_schema = context['schema'] = schema.default_create_package_schema()
    create_schema['id'] = [p.toolkit.get_validator('not_empty')]
    context['return_id_only'] = True
    p.toolkit.get_action('package_create')(context, package)
    print(str(datetime.datetime.now()) + ' Imported doi id ' + package['id'])
Example #19
0
def package_create(context, data_dict):
    """Validate and create a package, or only preview it.

    The schema comes from the context, falling back to the default create
    schema. After the access checks the data is validated; on errors the
    session is rolled back and ``ValidationError`` is raised.

    Unless ``context['preview']`` is truthy, a new revision is opened
    (authored by the context user, with ``context['message']`` or a
    default message), default roles are set up, every
    ``IPackageController`` plugin is notified and the repo is committed.

    The package object and its id are stored in the context under
    'package' and 'id'. Returns the dictized package, or the validated
    data when previewing.
    """
    model = context['model']
    user = context['user']
    preview = context.get('preview', False)
    schema = context.get('schema') or default_create_package_schema()
    model.Session.remove()
    model.Session()._context = context

    check_access(model.System(), model.Action.PACKAGE_CREATE, context)
    check_group_auth(context, data_dict)

    data, errors = validate(data_dict, schema, context)
    if errors:
        model.Session.rollback()
        raise ValidationError(errors, package_error_summary(errors))

    saving = not preview

    # The revision has to exist before the package is saved.
    if saving:
        revision = model.repo.new_revision()
        revision.author = user
        revision.message = (
            context['message'] if 'message' in context
            else _(u'REST API: Create object %s') % data.get("name")
        )

    pkg = package_dict_save(data, context)
    admins = [model.User.by_name(user.decode('utf8'))] if user else []

    if saving:
        model.setup_default_user_roles(pkg, admins)
        for plugin in PluginImplementations(IPackageController):
            plugin.create(pkg)
        model.repo.commit()

    ## need to let rest api create and preview
    context["package"] = pkg
    ## this is added so that the rest controller can make a new location
    context["id"] = pkg.id
    log.debug('Created object %s' % str(pkg.name))

    if saving:
        return package_dictize(pkg, context)
    return data
Example #20
0
def package_create(context, data_dict):
    """Validate and create a package, or only preview it.

    The schema comes from the context, falling back to the default create
    schema. After the access checks the data is validated; on errors the
    session is rolled back and ``ValidationError`` is raised.

    Unless ``context['preview']`` is truthy, a new revision is opened
    (authored by the context user, with ``context['message']`` or a
    default message), default roles are set up, every
    ``IPackageController`` plugin is notified and the repo is committed.

    The package object and its id are stored in the context under
    'package' and 'id'. Returns the dictized package, or the validated
    data when previewing.
    """
    model = context['model']
    user = context['user']
    preview = context.get('preview', False)
    schema = context.get('schema') or default_create_package_schema()
    model.Session.remove()
    model.Session()._context = context

    check_access(model.System(), model.Action.PACKAGE_CREATE, context)
    check_group_auth(context, data_dict)

    data, errors = validate(data_dict, schema, context)
    if errors:
        model.Session.rollback()
        raise ValidationError(errors, package_error_summary(errors))

    saving = not preview

    # The revision has to exist before the package is saved.
    if saving:
        revision = model.repo.new_revision()
        revision.author = user
        revision.message = (
            context['message'] if 'message' in context
            else _(u'REST API: Create object %s') % data.get("name")
        )

    pkg = package_dict_save(data, context)
    admins = [model.User.by_name(user.decode('utf8'))] if user else []

    if saving:
        model.setup_default_user_roles(pkg, admins)
        for plugin in PluginImplementations(IPackageController):
            plugin.create(pkg)
        model.repo.commit()

    ## need to let rest api create and preview
    context["package"] = pkg
    ## this is added so that the rest controller can make a new location
    context["id"] = pkg.id
    log.debug('Created object %s' % str(pkg.name))

    if saving:
        return package_dictize(pkg, context)
    return data
Example #21
0
    def package_create(self, context, data_dict):
        """Register the package with the remote service, then create it.

        Returns ``None`` straight away for previews. Otherwise the package
        is posted to ``services/package.json`` under ``self.base_url``;
        the ``nid`` from the response is stored in ``context['nid']`` and
        ``create.package_create`` is called. If that call raises, the
        remote entry is removed again with a DELETE request and the
        exception is re-raised.

        Returns the created package dict with ``nid`` and
        ``revision_message`` added. ``data_dict`` gains the keys 'body'
        and 'terms'.
        """
        def fetch(request):
            # Open with a 3 second timeout and always close the handle.
            handle = urllib2.urlopen(request, None, 3)
            try:
                return handle.read()
            finally:
                handle.close()

        is_preview = context.get('preview', False)
        schema = context.get('schema') or default_create_package_schema()
        if is_preview:
            return

        session = context['model'].Session
        create_url = urlparse.urljoin(self.base_url, 'services/package.json')
        data_dict['body'] = data_dict.get('notes', '')

        ## run through validate to make sure tags are in correct place
        validated, errors = validate(data_dict, schema, context)
        data_dict['terms'] = dict(
            (str(index), tag['name'])
            for index, tag in enumerate(validated.get('tags', []))
        )

        payload = json.dumps({'data': data_dict})
        create_request = urllib2.Request(
            create_url, payload, {'Content-type': 'application/json'})
        ##XXX think about error conditions a bit more
        drupal_info = json.loads(fetch(create_request))
        nid = drupal_info['nid']
        context['nid'] = nid

        try:
            created = create.package_create(context, data_dict)
        except:
            # Undo the remote entry before passing the failure on.
            delete_url = urlparse.urljoin(
                self.base_url, 'services/package/%s.json' % (nid))
            delete_request = urllib2.Request(delete_url)
            delete_request.get_method = lambda: 'DELETE'
            fetch(delete_request)
            raise

        created['nid'] = context['nid']
        revision = session.revision
        created['revision_message'] = '%s-%s' % (revision.id,
                                                 revision.message)
        return created
Example #22
0
def package_create_validate(context, data_dict):
    """Validate ``data_dict`` for package creation without saving it.

    Uses the schema from the context, or the default create schema when
    the context has none. The system-level create permission and the
    group authorization are checked before validating.

    Returns the validated data. Raises ``ValidationError`` (after rolling
    the session back) when validation reports errors.
    """
    model = context['model']
    user = context['user']  # not used further; the key must be present
    preview = context.get('preview', False)  # read but not used here
    schema = context.get('schema') or default_create_package_schema()

    session = model.Session
    session.remove()
    session()._context = context

    check_access(model.System(), model.Action.PACKAGE_CREATE, context)
    check_group_auth(context, data_dict)

    validated, errors = validate(data_dict, schema, context)
    if not errors:
        return validated

    model.Session.rollback()
    raise ValidationError(errors, package_error_summary(errors))
Example #23
0
def package_create_validate(context, data_dict):
    """Validate ``data_dict`` for package creation without saving it.

    Uses the schema from the context, or the default create schema when
    the context has none. The system-level create permission and the
    group authorization are checked before validating.

    Returns the validated data. Raises ``ValidationError`` (after rolling
    the session back) when validation reports errors.
    """
    model = context['model']
    user = context['user']  # not used further; the key must be present
    preview = context.get('preview', False)  # read but not used here
    schema = context.get('schema') or default_create_package_schema()

    session = model.Session
    session.remove()
    session()._context = context

    check_access(model.System(), model.Action.PACKAGE_CREATE, context)
    check_group_auth(context, data_dict)

    validated, errors = validate(data_dict, schema, context)
    if not errors:
        return validated

    model.Session.rollback()
    raise ValidationError(errors, package_error_summary(errors))
Example #24
0
 def validate(self, context, data_dict, schema, action):
     """Validate ``data_dict``, choosing the schema for package saves.

     For 'package_update' and 'package_create' the passed ``schema`` is
     replaced unless the context already carries one: API calls (context
     has 'api_version') get the default create or update schema, all
     other calls get the customized form schema. Every other action is
     validated with the ``schema`` argument unchanged.

     Returns whatever ``toolkit.navl_validate`` returns.
     """
     # If the caller specified a schema in the context (e.g. harvesters
     # specify the default schema) then we don't want to override that.
     pick_schema = (action in ('package_update', 'package_create')
                    and not context.get('schema'))
     if pick_schema:
         if 'api_version' not in context:
             # Customized schema for DGU form
             schema = self.form_to_db_schema_options(context)
         elif action == 'package_create':
             # When accessed by the API, just use the default schemas.
             # It's only the forms that are customized to make it easier
             # for humans.
             schema = default_schema.default_create_package_schema()
         else:
             schema = default_schema.default_update_package_schema()
     return toolkit.navl_validate(data_dict, schema, context)
Example #25
0
    def package_create(self, context, data_dict):
        """Register the package with the remote service, then create it.

        Returns ``None`` straight away for previews. Otherwise the package
        is posted to ``services/package.json`` under ``self.base_url``;
        the ``nid`` from the response is stored in ``context['nid']`` and
        ``create.package_create`` is called. If that call raises, the
        remote entry is removed again with a DELETE request and the
        exception is re-raised.

        Returns the created package dict with ``nid`` and
        ``revision_message`` added. ``data_dict`` gains the keys 'body'
        and 'terms'.
        """
        def fetch(request):
            # Open with a 3 second timeout and always close the handle.
            handle = urllib2.urlopen(request, None, 3)
            try:
                return handle.read()
            finally:
                handle.close()

        is_preview = context.get('preview', False)
        schema = context.get('schema') or default_create_package_schema()
        if is_preview:
            return

        session = context['model'].Session
        create_url = urlparse.urljoin(self.base_url, 'services/package.json')
        data_dict['body'] = data_dict.get('notes', '')

        ## run through validate to make sure tags are in correct place
        validated, errors = validate(data_dict, schema, context)
        data_dict['terms'] = dict(
            (str(index), tag['name'])
            for index, tag in enumerate(validated.get('tags', []))
        )

        payload = json.dumps({'data': data_dict})
        create_request = urllib2.Request(
            create_url, payload, {'Content-type': 'application/json'})
        ##XXX think about error conditions a bit more
        drupal_info = json.loads(fetch(create_request))
        nid = drupal_info['nid']
        context['nid'] = nid

        try:
            created = create.package_create(context, data_dict)
        except:
            # Undo the remote entry before passing the failure on.
            delete_url = urlparse.urljoin(
                self.base_url, 'services/package/%s.json' % (nid))
            delete_request = urllib2.Request(delete_url)
            delete_request.get_method = lambda: 'DELETE'
            fetch(delete_request)
            raise

        created['nid'] = context['nid']
        revision = session.revision
        created['revision_message'] = '%s-%s' % (revision.id,
                                                 revision.message)
        return created
Example #26
0
def datajson_create(context, data_dict):
    """Create a package from a data.json entry and return the action result.

    The entry is converted with ``create_data_dict``. The package name is
    the slugified title cut to 80 characters; if a package with that name
    already exists, the lower-cased package id is appended after a dash.
    When the owner organization cannot be found it is created first, with
    an ``organization_type`` extra of "Federal Government". The package is
    then created with the default create schema, a ``not_empty`` validator
    on ``id`` and ``return_id_only`` set in the context.
    """
    model = context['model']
    package = create_data_dict(data_dict)
    owner_org = model.Group.get(package['owner_org'])
    org_title = package.pop('owner_name', None)

    slug = _slugify(package['title'])[:80]
    package['name'] = slug
    if model.Package.get(slug):
        # Name already taken: make it unique with the package id.
        package['name'] = slug + '-' + package['id'].lower()

    if not owner_org:
        create_org = p.toolkit.get_action('organization_create')
        org_extras = [
            {'key': 'organization_type', 'value': "Federal Government"},
        ]
        create_org(context, {
            'name': package['owner_org'],
            'title': org_title,
            'extras': org_extras,
        })

    create_schema = context['schema'] = schema.default_create_package_schema()
    create_schema['id'] = [p.toolkit.get_validator('not_empty')]
    context['return_id_only'] = True
    create_package = p.toolkit.get_action('package_create')
    return create_package(context, package)
Example #27
0
    def test_basic_query(self):
        """Exercise the bbox search before and after deleting a package.

        A package is created from ``self.package_fixture_data`` and the
        search is queried three times: with the offset built without
        arguments (one hit, the new package), with the bbox
        (-10, 10, -20, 20) (no hits), and once more without arguments
        after the package has been deleted (no hits).
        """
        def search(*bbox):
            # Build the query URL, require an HTTP 200 and decode the body.
            url = self._offset_with_bbox(*bbox)
            return self.data_from_res(self.app.get(url, status=200))

        create_schema = default_create_package_schema()
        context = dict(
            model=model,
            session=Session,
            user='******',
            extras_as_string=True,
            schema=create_schema,
            api_version=2,
        )
        package_create(context, self.package_fixture_data)
        # The id of the new package is read back from the context.
        package_id = context.get('id')

        # Point inside bbox
        inside = search()
        assert inside['count'] == 1
        assert inside['results'][0] == package_id

        # Point outside bbox
        outside = search(-10, 10, -20, 20)
        assert outside['count'] == 0
        assert outside['results'] == []

        # Delete the package and ensure it does not come up on
        # search results
        package_delete(context, {'id': package_id})
        after_delete = search()
        assert after_delete['count'] == 0
        assert after_delete['results'] == []
Example #28
0
def package_create_schema():
    """Return the default create schema plus the publishing fields.

    Adds the frequency, retention, delivery unit, service, update/review
    date and coverage date fields. All of them may be missing; all except
    ``frequency`` are converted to extras.
    """
    create_schema = default_create_package_schema()

    # Text fields stored as extras. Every field gets its own list.
    text_extras = (
        'frequency_time_modifier',
        'frequency_update_period',
        'frequency_period',
        'retention_period',
        'delivery_unit',
        'service',
        'next_update',
        'review_date',
        'coverage_start_date',
        'coverage_end_date',
    )
    for field in text_extras:
        create_schema[field] = [ignore_missing, unicode, convert_to_extras]

    create_schema['frequency_count'] = [ignore_missing, convert_to_extras]
    # frequency is constructed from the other frequency_ fields
    create_schema['frequency'] = [ignore_missing]
    create_schema['retention_count'] = [
        ignore_missing,
        is_positive_integer,
        convert_to_extras,
    ]
    return create_schema
Example #29
0
def doi_create(context, data_dict):
    """Import one DOI record as a new package.

    ``data_dict`` is modified in place. Three extras are appended (a SHA-1
    hash of the JSON-dumped input, the metadata source "doi" and an import
    marker). The owner organization is looked up through ``ORG_MAPPING``;
    if it does not exist a failure line is printed and nothing is created.
    Otherwise the package gets a slug name (made unique with the current
    Unix time if already taken), its resources lose their
    ``resource_group_id``, ``revision_id`` and ``id`` keys, a
    ``HarvestObject`` is saved and referenced from the extras, and
    ``package_create`` is called. Always returns ``None``.
    """
    model = context['model']
    package = data_dict
    source_hash = hashlib.sha1(
        json.dumps(data_dict, sort_keys=True)).hexdigest()

    extras = package["extras"]
    extras.append({"key": "source_hash", "value": source_hash})
    extras.append({"key": "metadata-source", "value": "doi"})
    extras.append({"key": "source_doi_import_identifier", "value": True})

    mapped_org = ORG_MAPPING.get(package['organization']['name'])
    owner_org = model.Group.get(mapped_org)
    if not owner_org:
        timestamp = str(datetime.datetime.now())
        print(timestamp + ' Fail to import doi id ' + package['id'] +
              '. Organization ' + package['organization']['name'] +
              ' does not exist.')
        return

    package['owner_org'] = owner_org.name
    # The owner name is removed from the package; its value is not needed.
    package.pop('owner_name', None)

    slug = _slugify(package['title'])[:80]
    package['name'] = slug
    if model.Package.get(slug):
        package['name'] = slug + '-' + str(int(time.time()))

    # Strip the identifiers from every resource.
    cleaned = []
    for resource in package['resources']:
        for field in ('resource_group_id', 'revision_id', 'id'):
            resource.pop(field, None)
        cleaned.append(resource)
    package['resources'] = cleaned

    harvest_object = HarvestObject(
        guid=uuid.uuid4().hex,
        job=context['harvest_job'],
        content=context['harvestobj'])
    harvest_object.save()
    package["extras"].append(
        {"key": "harvest_object_id", "value": harvest_object.id})

    create_schema = context['schema'] = schema.default_create_package_schema()
    create_schema['id'] = [p.toolkit.get_validator('not_empty')]
    context['return_id_only'] = True
    p.toolkit.get_action('package_create')(context, package)
    print(str(datetime.datetime.now()) + ' Imported doi id ' + package['id'])
Example #30
0
def package_create_schema():
    """Return the default create-package schema extended with the
    frequency, retention, delivery and coverage fields.

    Every added field is optional (``ignore_missing``); all except
    ``frequency`` end with ``convert_to_extras``.
    """
    schema = default_create_package_schema()

    # Optional fields coerced with ``unicode`` before convert_to_extras.
    unicode_extra_fields = (
        'frequency_time_modifier',
        'frequency_update_period',
        'frequency_period',
        'retention_period',
        'delivery_unit',
        'service',
        'next_update',
        'review_date',
        'coverage_start_date',
        'coverage_end_date',
    )
    for field_name in unicode_extra_fields:
        # Build a fresh list per field so no two fields share a chain.
        schema[field_name] = [ignore_missing, unicode, convert_to_extras]

    schema['frequency_count'] = [ignore_missing, convert_to_extras]
    # frequency is constructed from the other frequency_ fields
    schema['frequency'] = [ignore_missing]
    schema['retention_count'] = [
        ignore_missing,
        is_positive_integer,
        convert_to_extras,
    ]
    return schema
    def test_1_basic(self):
        """Create both fixture datasets, then check that a 'test' search
        restricted to the 10,10,40,40 bbox returns exactly one of them."""
        create_schema = default_create_package_schema()
        context = {
            'model': model,
            'session': Session,
            'user': '******',
            'extras_as_string': True,
            'schema': create_schema,
            'api_version': 2,
        }
        package_create(context, self.package_fixture_data_1)
        # Drop the 'package' entry before reusing the context for the
        # second create (presumably set by the first call).
        del context['package']
        package_create(context, self.package_fixture_data_2)

        search_request = {
                'q': 'test',
                'facet.field': ('groups', 'tags', 'res_format', 'license'),
                'rows': 20,
                'start': 0,
                'extras': {
                    'ext_bbox': '%s,%s,%s,%s' % (10, 10, 40, 40)
                }
            }
        postparams = '%s=1' % json.dumps(search_request)
        response = self.app.post('/api/action/package_search',
                                 params=postparams)
        payload = json.loads(response.body)
        result = payload['result']

        # Only one dataset returned
        assert_equal(payload['success'], True)
        assert_equal(result['count'], 1)
        first_hit = result['results'][0]
        assert_equal(first_hit['name'], 'test-spatial-dataset-search-point-2')
Example #32
0
    def _create_or_update_package(self, package_dict, harvest_object):
        '''
        Create the package described by package_dict, or update it when a
        package with the same id already exists.

        package_dict should look like the REST API response for a package:

        http://ckan.net/api/rest/package/statistics-catalunya

        It must contain an id (use the remote dataset id), which decides
        between create and update. If the remote server provides the
        modification date of the remote package, add it to
        package_dict['metadata_modified'] so unchanged packages are skipped.

        Returns True once the create/update has been committed. Returns
        None when the package is skipped or fails validation (the
        validation error is logged and saved against the harvest object).

        TODO: Not sure it is worth keeping this function. If useful it should
        use the output of package_show logic function (maybe keeping support
        for rest api based dicts)
        '''

        log.debug('_create_or_update_package')
        try:
            # Default create schema, with 'id' and '__junk' overridden
            schema = default_create_package_schema()
            schema['id'] = [ignore_missing, unicode]
            schema['__junk'] = [ignore]

            # Defaults, overridden by the harvester config when there is one
            api_version = 2
            user_name = u'harvest'
            if self.config:
                try:
                    api_version = int(self.config.get('api_version', 2))
                except ValueError:
                    raise ValueError('api_version must be an integer')

                #TODO: use site user when available
                user_name = self.config.get('user', u'harvest')

            context = {
                'model': model,
                'session': Session,
                'user': user_name,
                'api_version': api_version,
                'schema': schema,
                'ignore_auth': True,
            }

            # Munge the tags and remove duplicates
            unique_tags = set(munge_tag(tag)
                              for tag in package_dict.get('tags', []))
            package_dict['tags'] = list(unique_tags)

            # Look the package up by id; NotFound means it must be created
            lookup = {'id': package_dict['id']}
            try:
                existing_package_dict = get_action('package_show')(context, lookup)

                # Update when there is no remote date, or it is newer
                needs_update = (
                    'metadata_modified' not in package_dict or
                    package_dict['metadata_modified'] >
                    existing_package_dict.get('metadata_modified'))
                if not needs_update:
                    log.info('Package with GUID %s not updated, skipping...' % harvest_object.guid)
                    return

                log.info('Package with GUID %s exists and needs to be updated' % harvest_object.guid)
                context['id'] = package_dict['id']
                new_package = get_action('package_update_rest')(context, package_dict)

                # Flag the other objects linking to this package as not current anymore
                from ckanext.harvest.model import harvest_object_table
                connection = Session.connection()
                package_filter = \
                    harvest_object_table.c.package_id == bindparam('b_package_id')
                mark_not_current = update(harvest_object_table) \
                    .where(package_filter) \
                    .values(current=False)
                connection.execute(mark_not_current,
                                   b_package_id=new_package['id'])

                # Flag this as the current harvest object
                harvest_object.package_id = new_package['id']
                harvest_object.current = True
                harvest_object.save()

            except NotFound:
                # Package needs to be created

                # Generate the name from the title
                package_dict['name'] = self._gen_new_name(package_dict['title'])

                log.info('Package with GUID %s does not exist, let\'s create it' % harvest_object.guid)
                harvest_object.current = True
                harvest_object.package_id = package_dict['id']
                # Defer constraints and flush so the dataset can be indexed with
                # the harvest object id (on the after_show hook from the harvester
                # plugin)
                harvest_object.add()

                model.Session.execute('SET CONSTRAINTS harvest_object_package_id_fkey DEFERRED')
                model.Session.flush()

                new_package = get_action('package_create_rest')(context, package_dict)

            Session.commit()

            return True

        except ValidationError as e:
            log.exception(e)
            error_message = 'Invalid package with GUID %s: %r' % (
                harvest_object.guid, e.error_dict)
            self._save_object_error(error_message, harvest_object, 'Import')
Example #33
0
 def create_package_schema(self):
     """Return the schema used when creating a package.

     The default create-package schema is passed through
     ``schema_defs.create_package_schema`` and then through
     ``self._modify_package_schema``.
     """
     from ckan.logic.schema import default_create_package_schema
     base_schema = default_create_package_schema()
     extended_schema = schema_defs.create_package_schema(base_schema)
     return self._modify_package_schema(extended_schema)
Example #34
0
    def _create_package_schema(cls):
        """ Create common schema for dataset create and update. Used by user interfaces and harvesters.

        Starts from the default create-package schema, removes 'author' and
        then sets or replaces the validator chain of each field below.
        ``cls`` is used only for ``cls.tags_schema()``.

        Returns the resulting schema dict.
        """
        # Note: harvester schemas

        schema = default_create_package_schema()
        schema.pop('author')

        # Settings-driven fields: required ones start with not_empty,
        # recommended ones with ignore_missing. Any same-named key assigned
        # further down replaces the chain set here.
        for key in settings.KATA_FIELDS_REQUIRED:
            schema[key] = [not_empty, co.convert_to_extras_kata, unicode, va.validate_general]
        for key in settings.KATA_FIELDS_RECOMMENDED:
            schema[key] = [ignore_missing, co.convert_to_extras_kata, unicode, va.validate_general]

        schema['accept-terms'] = [va.usage_terms_accepted, ignore]
        schema['__after'] = [co.gen_translation_str_from_langtitle,
                             co.gen_translation_str_from_langnotes]
        # Nested schemas (agent, contact, event): one validator chain per sub-field
        schema['agent'] = {'role': [not_empty, va.check_agent_fields, va.validate_general, unicode, co.flattened_to_extras],
                           'name': [ignore_empty, va.validate_general, unicode, va.contains_alphanumeric, co.flattened_to_extras],
                           'id': [ignore_empty, va.validate_general, unicode, co.flattened_to_extras],
                           'organisation': [ignore_empty, va.validate_general, unicode, va.contains_alphanumeric, co.flattened_to_extras],
                           'URL': [ignore_empty, co.remove_trailing_spaces, url_validator, va.validate_general, unicode, co.flattened_to_extras],
                           'fundingid': [ignore_empty, va.validate_general, unicode, co.flattened_to_extras]}
        schema['contact'] = {'name': [not_empty, va.validate_general, unicode, va.contains_alphanumeric, co.flattened_to_extras],
                             'email': [not_empty, co.remove_trailing_spaces, unicode, va.validate_email, co.flattened_to_extras],
                             'URL': [ignore_empty, co.remove_trailing_spaces, url_validator, va.validate_general, unicode, co.flattened_to_extras],
                             # phone number can be missing from the first users
                             'phone': [ignore_missing, co.remove_trailing_spaces, unicode, va.validate_phonenum, co.flattened_to_extras]}
        schema['event'] = {'type': [ignore_missing, va.check_events, unicode, co.flattened_to_extras, va.validate_general],
                           'who': [ignore_missing, unicode, co.flattened_to_extras, va.validate_general, va.contains_alphanumeric],
                           'when': [ignore_missing, unicode, co.flattened_to_extras, va.validate_kata_interval_date],
                           'descr': [ignore_missing, unicode, co.flattened_to_extras, va.validate_general, va.contains_alphanumeric]}
        schema['id'] = [not_empty, va.validate_package_id_format, unicode]

        # Langtitle fields are used by the UI, to construct a 'title' field with translations in JSON format
        # This is not necessarily needed for the API calls
        schema['langtitle'] = {'value': [unicode, va.validate_title, va.validate_title_duplicates, co.escape_quotes],
                               'lang': [unicode, co.convert_languages]}

        # The title field contains all the title translations in JSON format.
        # The converter gen_translation_str_from_langtitle
        # needs to be called to construct the JSON string from the UI's langtitle fields.
        schema['title'] = [va.not_empty_if_langtitle_empty]

        # Description (notes) is a multilanguage field similar to title
        schema['langnotes'] = {'value': [unicode, va.validate_notes_duplicates, co.escape_quotes],
                               'lang': [unicode, co.convert_languages]}
        schema['notes'] = [ignore_empty]

        schema['language'] = \
            [ignore_missing, co.convert_languages, co.remove_disabled_languages, co.convert_to_extras_kata, unicode]
        schema['license_id'] = [co.to_license_id, unicode]
        schema['temporal_coverage_begin'] = \
            [ignore_missing, va.validate_kata_date, co.convert_to_extras_kata, unicode]
        schema['temporal_coverage_end'] = \
            [ignore_missing, va.validate_kata_date, co.convert_to_extras_kata, unicode]
        schema['pids'] = {'provider': [ignore_missing, unicode, co.flattened_to_extras],
                          'id': [not_empty, va.validate_general, va.validate_primary_pid_uniqueness,
                                 unicode, co.flattened_to_extras],
                          'type': [not_missing, co.remove_trailing_spaces, va.validate_pid_type, unicode, co.flattened_to_extras],
                          'relation': [ignore_missing, co.remove_trailing_spaces, co.to_relation, va.validate_pid_relation_type,
                                       unicode, co.flattened_to_extras]}
        schema['tag_string'] = [ignore_missing, not_empty, va.kata_tag_string_convert]
        # otherwise the tags would be validated with default tag validator during update
        schema['tags'] = cls.tags_schema()
        schema['xpaths'] = [ignore_missing, co.to_extras_json]
        schema['version'] = [not_empty, unicode, va.validate_kata_date]
        schema['availability'] = [not_missing, va.validate_availability, co.convert_to_extras_kata]
        schema['langdis'] = [co.checkbox_to_boolean, co.convert_to_extras_kata]
        schema['__extras'] = [va.check_agent, va.check_contact, va.check_langtitle]
        schema['__junk'] = [va.check_junk]
        schema['name'] = [va.continue_if_missing, co.default_name_from_id, unicode, package_name_validator,
                          va.validate_general]
        schema['external_id'] = [ignore_missing, co.remove_trailing_spaces, co.convert_external_id, va.validate_external_id_uniqueness, unicode, va.validate_general,
                                   co.convert_to_extras_kata]
        schema['access_application_download_URL'] = [ignore_missing, co.remove_trailing_spaces, va.validate_access_application_download_url,
                                                     unicode, va.validate_general, co.convert_to_extras_kata]
        schema['access_application_URL'] = [ignore_missing, co.remove_trailing_spaces, va.validate_access_application_url,
                                            unicode, va.validate_general, co.convert_to_extras_kata]
        schema['access_request_URL'] = [ignore_missing, co.remove_trailing_spaces, va.check_access_request_url, url_validator,
                                        unicode, va.validate_general, co.convert_to_extras_kata]
        schema['discipline'] = [ignore_missing, va.validate_discipline, co.convert_to_extras_kata, unicode]
        schema['geographic_coverage'] = [ignore_missing, va.validate_spatial, co.convert_to_extras_kata, unicode]
        schema['license_URL'] = [va.continue_if_missing, va.validate_license_url, co.populate_license_URL_if_license_id_not_resolved, co.convert_to_extras_kata, unicode,
                                 va.validate_general]
        schema['owner_org'] = [va.kata_owner_org_validator, unicode]
        # Resource sub-schema: the url/algorithm/format chains are assigned
        # here, while the hash/mimetype chains are extended in place, so
        # 'resources', 'hash' and 'mimetype' must already exist in the
        # default schema.
        schema['resources']['url'] = [default(settings.DATASET_URL_UNKNOWN), va.check_resource_url_for_direct_download_url,
                                      unicode, va.validate_general]
        # Conversion (and validation) of direct_download_URL to resource['url'] is in utils.py:dataset_to_resource()
        schema['resources']['algorithm'] = [ignore_missing, unicode, va.validate_algorithm]
        schema['resources']['format'] = [ignore_missing, unicode, va.validate_general]
        schema['resources']['hash'].append(va.validate_general)
        schema['resources']['mimetype'].append(va.validate_mimetype)

        return schema
Example #35
0
    def _create_package_schema(cls):
        """ Create common schema for dataset create and update. Used by user interfaces and harvesters.

        Starts from the default create-package schema, removes 'author' and
        then sets or replaces the validator chain of each field below.
        ``cls`` is used only for ``cls.tags_schema()``.

        Returns the resulting schema dict.
        """
        # Note: harvester schemas

        schema = default_create_package_schema()
        schema.pop('author')

        # Settings-driven fields: required ones start with not_empty,
        # recommended ones with ignore_missing. Any same-named key assigned
        # further down replaces the chain set here.
        for key in settings.KATA_FIELDS_REQUIRED:
            schema[key] = [
                not_empty, co.convert_to_extras_kata, unicode,
                va.validate_general
            ]
        for key in settings.KATA_FIELDS_RECOMMENDED:
            schema[key] = [
                ignore_missing, co.convert_to_extras_kata, unicode,
                va.validate_general
            ]

        schema['accept-terms'] = [va.usage_terms_accepted, ignore]
        schema['__after'] = [
            co.gen_translation_str_from_langtitle,
            co.gen_translation_str_from_langnotes
        ]
        # Nested schemas (agent, contact, event): one validator chain per sub-field
        schema['agent'] = {
            'role': [
                not_empty, va.check_agent_fields, va.validate_general, unicode,
                co.flattened_to_extras
            ],
            'name': [
                ignore_empty, va.validate_general, unicode,
                va.contains_alphanumeric, co.flattened_to_extras
            ],
            'id': [
                ignore_empty, va.validate_general, unicode,
                co.flattened_to_extras
            ],
            'organisation': [
                ignore_empty, va.validate_general, unicode,
                va.contains_alphanumeric, co.flattened_to_extras
            ],
            'URL': [
                ignore_empty, co.remove_trailing_spaces, url_validator,
                va.validate_general, unicode, co.flattened_to_extras
            ],
            'fundingid': [
                ignore_empty, va.validate_general, unicode,
                co.flattened_to_extras
            ]
        }
        schema['contact'] = {
            'name': [
                not_empty, va.validate_general, unicode,
                va.contains_alphanumeric, co.flattened_to_extras
            ],
            'email': [
                not_empty, co.remove_trailing_spaces, unicode,
                va.validate_email, co.flattened_to_extras
            ],
            'URL': [
                ignore_empty, co.remove_trailing_spaces, url_validator,
                va.validate_general, unicode, co.flattened_to_extras
            ],
            # phone number can be missing from the first users
            'phone': [
                ignore_missing, co.remove_trailing_spaces, unicode,
                va.validate_phonenum, co.flattened_to_extras
            ]
        }
        schema['event'] = {
            'type': [
                ignore_missing, va.check_events, unicode,
                co.flattened_to_extras, va.validate_general
            ],
            'who': [
                ignore_missing, unicode, co.flattened_to_extras,
                va.validate_general, va.contains_alphanumeric
            ],
            'when': [
                ignore_missing, unicode, co.flattened_to_extras,
                va.validate_kata_interval_date
            ],
            'descr': [
                ignore_missing, unicode, co.flattened_to_extras,
                va.validate_general, va.contains_alphanumeric
            ]
        }
        schema['id'] = [not_empty, va.validate_package_id_format, unicode]

        # Langtitle fields are used by the UI, to construct a 'title' field with translations in JSON format
        # This is not necessarily needed for the API calls
        schema['langtitle'] = {
            'value': [
                unicode, va.validate_title, va.validate_title_duplicates,
                co.escape_quotes
            ],
            'lang': [unicode, co.convert_languages]
        }

        # The title field contains all the title translations in JSON format.
        # The converter gen_translation_str_from_langtitle
        # needs to be called to construct the JSON string from the UI's langtitle fields.
        schema['title'] = [va.not_empty_if_langtitle_empty]

        # Description (notes) is a multilanguage field similar to title
        schema['langnotes'] = {
            'value': [unicode, va.validate_notes_duplicates, co.escape_quotes],
            'lang': [unicode, co.convert_languages]
        }
        schema['notes'] = [ignore_empty]

        schema['language'] = \
            [ignore_missing, co.convert_languages, co.remove_disabled_languages, co.convert_to_extras_kata, unicode]
        schema['license_id'] = [co.to_license_id, unicode]
        schema['temporal_coverage_begin'] = \
            [ignore_missing, va.validate_kata_date, co.convert_to_extras_kata, unicode]
        schema['temporal_coverage_end'] = \
            [ignore_missing, va.validate_kata_date, co.convert_to_extras_kata, unicode]
        schema['pids'] = {
            'provider': [ignore_missing, unicode, co.flattened_to_extras],
            'id': [
                not_empty, va.validate_general,
                va.validate_primary_pid_uniqueness, unicode,
                co.flattened_to_extras
            ],
            'type': [
                not_missing, co.remove_trailing_spaces, va.validate_pid_type,
                unicode, co.flattened_to_extras
            ],
            'relation': [
                ignore_missing, co.remove_trailing_spaces, co.to_relation,
                va.validate_pid_relation_type, unicode, co.flattened_to_extras
            ]
        }
        schema['tag_string'] = [
            ignore_missing, not_empty, va.kata_tag_string_convert
        ]
        # otherwise the tags would be validated with default tag validator during update
        schema['tags'] = cls.tags_schema()
        schema['xpaths'] = [ignore_missing, co.to_extras_json]
        schema['version'] = [not_empty, unicode, va.validate_kata_date]
        schema['availability'] = [
            not_missing, va.validate_availability, co.convert_to_extras_kata
        ]
        schema['langdis'] = [co.checkbox_to_boolean, co.convert_to_extras_kata]
        schema['__extras'] = [
            va.check_agent, va.check_contact, va.check_langtitle
        ]
        schema['__junk'] = [va.check_junk]
        schema['name'] = [
            va.continue_if_missing, co.default_name_from_id, unicode,
            package_name_validator, va.validate_general
        ]
        schema['external_id'] = [
            ignore_missing, co.remove_trailing_spaces, co.convert_external_id,
            va.validate_external_id_uniqueness, unicode, va.validate_general,
            co.convert_to_extras_kata
        ]
        schema['access_application_download_URL'] = [
            ignore_missing, co.remove_trailing_spaces,
            va.validate_access_application_download_url, unicode,
            va.validate_general, co.convert_to_extras_kata
        ]
        schema['access_application_URL'] = [
            ignore_missing, co.remove_trailing_spaces,
            va.validate_access_application_url, unicode, va.validate_general,
            co.convert_to_extras_kata
        ]
        schema['access_request_URL'] = [
            ignore_missing, co.remove_trailing_spaces,
            va.check_access_request_url, url_validator, unicode,
            va.validate_general, co.convert_to_extras_kata
        ]
        schema['discipline'] = [
            ignore_missing, va.validate_discipline, co.convert_to_extras_kata,
            unicode
        ]
        schema['geographic_coverage'] = [
            ignore_missing, va.validate_spatial, co.convert_to_extras_kata,
            unicode
        ]
        schema['license_URL'] = [
            va.continue_if_missing, va.validate_license_url,
            co.populate_license_URL_if_license_id_not_resolved,
            co.convert_to_extras_kata, unicode, va.validate_general
        ]
        schema['owner_org'] = [va.kata_owner_org_validator, unicode]
        # Resource sub-schema: the url/algorithm/format chains are assigned
        # here, while the hash/mimetype chains are extended in place, so
        # 'resources', 'hash' and 'mimetype' must already exist in the
        # default schema.
        schema['resources']['url'] = [
            default(settings.DATASET_URL_UNKNOWN),
            va.check_resource_url_for_direct_download_url, unicode,
            va.validate_general
        ]
        # Conversion (and validation) of direct_download_URL to resource['url'] is in utils.py:dataset_to_resource()
        schema['resources']['algorithm'] = [
            ignore_missing, unicode, va.validate_algorithm
        ]
        schema['resources']['format'] = [
            ignore_missing, unicode, va.validate_general
        ]
        schema['resources']['hash'].append(va.validate_general)
        schema['resources']['mimetype'].append(va.validate_mimetype)

        return schema
 def create_package_schema(self):
     """Return the unmodified default create-package schema."""
     create_schema = default_schema.default_create_package_schema()
     return create_schema
Example #37
0
    def _create_or_update_package(self, package_dict, harvest_object):
        '''
        Creates a new package or updates an existing one according to the
        package dictionary provided. The package dictionary should look like
        the REST API response for a package:

        http://ckan.net/api/rest/package/statistics-catalunya

        Note that the package_dict must contain an id, which will be used to
        check if the package needs to be created or updated (use the remote
        dataset id).

        If the remote server provides the modification date of the remote
        package, add it to package_dict['metadata_modified'].

        Returns True once the package has been created or updated and the
        harvest object has been flagged as current. Returns None when the
        existing package is skipped (the remote modification date is not
        newer) or when the package fails validation; in the latter case the
        error is logged and saved against the harvest object.
        '''
        try:
            # Change default schema
            schema = default_create_package_schema()
            schema['id'] = [ignore_missing, unicode]
            schema['__junk'] = [ignore]

            # Check API version
            if self.config:
                api_version = self.config.get('api_version', '2')
                #TODO: use site user when available
                user_name = self.config.get('user', u'harvest')
            else:
                api_version = '2'
                user_name = u'harvest'

            context = {
                'model': model,
                'session': Session,
                'user': user_name,
                'api_version': api_version,
                'schema': schema,
            }

            # Munge every tag once, drop the ones that munge to an empty
            # string, then remove duplicates.
            munged_tags = (munge_tag(t) for t in package_dict.get('tags', []))
            package_dict['tags'] = list(
                set(t for t in munged_tags if t != ''))

            # Check if package exists
            data_dict = {}
            data_dict['id'] = package_dict['id']
            try:
                existing_package_dict = get_action('package_show')(context,
                                                                   data_dict)

                # In case name has been modified when first importing. See issue #101.
                package_dict['name'] = existing_package_dict['name']

                # Check modified date
                if 'metadata_modified' not in package_dict or \
                   package_dict['metadata_modified'] > existing_package_dict.get('metadata_modified'):
                    log.info(
                        'Package with GUID %s exists and needs to be updated' %
                        harvest_object.guid)
                    # Update package
                    context.update({'id': package_dict['id']})
                    new_package = get_action('package_update_rest')(
                        context, package_dict)

                else:
                    log.info('Package with GUID %s not updated, skipping...' %
                             harvest_object.guid)
                    return

            except NotFound:
                # Package needs to be created

                # Get rid of auth audit on the context otherwise we'll get an
                # exception
                context.pop('__auth_audit', None)

                # Check if name has not already been used
                package_dict['name'] = self._check_name(package_dict['name'])

                log.info(
                    'Package with GUID %s does not exist, let\'s create it' %
                    harvest_object.guid)
                new_package = get_action('package_create_rest')(context,
                                                                package_dict)
                harvest_object.package_id = new_package['id']

            # Flag the other objects linking to this package as not current anymore
            from ckanext.harvest.model import harvest_object_table
            conn = Session.connection()
            u = update(harvest_object_table) \
                    .where(harvest_object_table.c.package_id==bindparam('b_package_id')) \
                    .values(current=False)
            conn.execute(u, b_package_id=new_package['id'])
            Session.commit()

            # Flag this as the current harvest object

            harvest_object.package_id = new_package['id']
            harvest_object.current = True
            harvest_object.save()

            return True

        except ValidationError as e:
            log.exception(e)
            self._save_object_error(
                'Invalid package with GUID %s: %r' %
                (harvest_object.guid, e.error_dict), harvest_object, 'Import')
Example #38
0
    def _create_or_update_package(self, package_dict, harvest_object):
        '''
        Creates a new package or updates an existing one according to the
        package dictionary provided. The package dictionary should look like
        the REST API response for a package:

        http://ckan.net/api/rest/package/statistics-catalunya

        Note that the package_dict must contain an id, which will be used to
        check if the package needs to be created or updated (use the remote
        dataset id).

        If the remote server provides the modification date of the remote
        package, add it to package_dict['metadata_modified'].

        Returns True once the create/update has been committed. Returns
        None when the existing package is skipped (the remote modification
        date is not newer) or when the package fails validation; in the
        latter case the error is logged and saved against the harvest
        object.

        TODO: Not sure it is worth keeping this function. If useful it should
        use the output of package_show logic function (maybe keeping support
        for rest api based dicts)
        '''
        try:
            # Change default schema
            schema = default_create_package_schema()
            schema['id'] = [ignore_missing, unicode]
            schema['__junk'] = [ignore]

            # Check API version
            if self.config:
                try:
                    api_version = int(self.config.get('api_version', 2))
                except ValueError:
                    raise ValueError('api_version must be an integer')
            else:
                api_version = 2

            user_name = self._get_user_name()
            context = {
                'model': model,
                'session': Session,
                'user': user_name,
                'api_version': api_version,
                'schema': schema,
                'ignore_auth': True,
            }

            if self.config and self.config.get('clean_tags', False):
                # Munge every tag once, drop the ones that munge to an
                # empty string, then remove duplicates.
                munged_tags = (munge_tag(t)
                               for t in package_dict.get('tags', []))
                package_dict['tags'] = list(
                    set(t for t in munged_tags if t != ''))

            # Check if package exists
            data_dict = {}
            data_dict['id'] = package_dict['id']
            try:
                existing_package_dict = get_action('package_show')(context, data_dict)

                # In case name has been modified when first importing. See issue #101.
                # The name is always taken from the existing package, so no
                # later default for 'name' is needed.
                package_dict['name'] = existing_package_dict['name']

                # Check modified date
                if 'metadata_modified' not in package_dict or \
                   package_dict['metadata_modified'] > existing_package_dict.get('metadata_modified'):
                    log.info('Package with GUID %s exists and needs to be updated' % harvest_object.guid)
                    # Update package
                    context.update({'id':package_dict['id']})
                    new_package = get_action('package_update_rest')(context, package_dict)

                else:
                    log.info('Package with GUID %s not updated, skipping...' % harvest_object.guid)
                    return

                # Flag the other objects linking to this package as not current anymore
                from ckanext.harvest.model import harvest_object_table
                conn = Session.connection()
                u = update(harvest_object_table) \
                        .where(harvest_object_table.c.package_id==bindparam('b_package_id')) \
                        .values(current=False)
                conn.execute(u, b_package_id=new_package['id'])

                # Flag this as the current harvest object

                harvest_object.package_id = new_package['id']
                harvest_object.current = True
                harvest_object.save()

            except NotFound:
                # Package needs to be created

                # Get rid of auth audit on the context otherwise we'll get an
                # exception
                context.pop('__auth_audit', None)

                # Set name for new package to prevent name conflict, see issue #117
                if package_dict.get('name', None):
                    package_dict['name'] = self._gen_new_name(package_dict['name'])
                else:
                    package_dict['name'] = self._gen_new_name(package_dict['title'])

                log.info('Package with GUID %s does not exist, let\'s create it' % harvest_object.guid)
                harvest_object.current = True
                harvest_object.package_id = package_dict['id']
                # Defer constraints and flush so the dataset can be indexed with
                # the harvest object id (on the after_show hook from the harvester
                # plugin)
                harvest_object.add()

                model.Session.execute('SET CONSTRAINTS harvest_object_package_id_fkey DEFERRED')
                model.Session.flush()

                new_package = get_action('package_create_rest')(context, package_dict)

            Session.commit()

            return True

        except ValidationError as e:
            log.exception(e)
            self._save_object_error('Invalid package with GUID %s: %r'%(harvest_object.guid,e.error_dict),harvest_object,'Import')
Example #39
0
    def _create_or_update_package(self, package_dict, harvest_object,
                                  package_dict_form='rest'):
        '''
        Creates a new package or updates an existing one according to the
        package dictionary provided.

        The package dictionary can be in one of two forms:

        1. 'rest' - as seen on the RESTful API:

                http://datahub.io/api/rest/dataset/1996_population_census_data_canada

           This is the legacy form. It is the default to provide backward
           compatibility.

           * 'extras' is a dict e.g. {'theme': 'health', 'sub-theme': 'cancer'}
           * 'tags' is a list of strings e.g. ['large-river', 'flood']

        2. 'package_show' form, as provided by the Action API (CKAN v2.0+):

               http://datahub.io/api/action/package_show?id=1996_population_census_data_canada

           * 'extras' is a list of dicts
                e.g. [{'key': 'theme', 'value': 'health'},
                        {'key': 'sub-theme', 'value': 'cancer'}]
           * 'tags' is a list of dicts
                e.g. [{'name': 'large-river'}, {'name': 'flood'}]

        Note that the package_dict must contain an id, which will be used to
        check if the package needs to be created or updated (use the remote
        dataset id).

        If the remote server provides the modification date of the remote
        package, add it to package_dict['metadata_modified'].

        :param package_dict: the dataset to create or update. It is modified
            in place: 'name' is overwritten, and 'tags' is rewritten when the
            'clean_tags' config option is set.
        :param harvest_object: the harvest object being imported; it is
            linked to the resulting package and flagged as current.
        :param package_dict_form: 'rest' (default) or 'package_show'; selects
            between the ``*_rest`` and the plain create/update actions.
        :returns: True if the create or update occurred ok, 'unchanged' if
            the existing package didn't need updating, or None (falsy) if
            the package failed validation, in which case the error is saved
            against the harvest object.
        :raises ValueError: if the configured 'api_version' is not an integer.

        TODO: Not sure it is worth keeping this function. If useful it should
        use the output of package_show logic function (maybe keeping support
        for rest api based dicts)
        '''
        assert package_dict_form in ('rest', 'package_show')
        try:
            # Change default schema
            schema = default_create_package_schema()
            schema['id'] = [ignore_missing, unicode]
            schema['__junk'] = [ignore]

            # Check API version
            if self.config:
                try:
                    api_version = int(self.config.get('api_version', 2))
                except ValueError:
                    raise ValueError('api_version must be an integer')
            else:
                api_version = 2

            user_name = self._get_user_name()
            context = {
                'model': model,
                'session': Session,
                'user': user_name,
                'api_version': api_version,
                'schema': schema,
                'ignore_auth': True,
            }

            if self.config and self.config.get('clean_tags', False):
                # Munge every tag once, drop the ones that munge to an empty
                # string and remove duplicates.
                # NOTE(review): per the docstring, tags are dicts in the
                # 'package_show' form; each element is passed to munge_tag
                # as-is - confirm that form is handled.
                munged_tags = [munge_tag(t)
                               for t in package_dict.get('tags', [])]
                package_dict['tags'] = list(
                    set(t for t in munged_tags if t != ''))

            # Check if package exists
            try:
                # _find_existing_package can be overridden if necessary
                existing_package_dict = self._find_existing_package(package_dict)

                # In case name has been modified when first importing. See issue #101.
                package_dict['name'] = existing_package_dict['name']

                # Check modified date: update when the remote date is unknown
                # or newer than the one stored locally
                if 'metadata_modified' not in package_dict or \
                   package_dict['metadata_modified'] > existing_package_dict.get('metadata_modified'):
                    log.info('Package with GUID %s exists and needs to be updated',
                             harvest_object.guid)
                    # Update package
                    context.update({'id': package_dict['id']})

                    update_action = ('package_update'
                                     if package_dict_form == 'package_show'
                                     else 'package_update_rest')
                    new_package = p.toolkit.get_action(update_action)(
                        context, package_dict)

                else:
                    log.info('No changes to package with GUID %s, skipping...',
                             harvest_object.guid)
                    # NB harvest_object.current/package_id are not set
                    return 'unchanged'

                # Flag the other objects linking to this package as not current anymore
                from ckanext.harvest.model import harvest_object_table
                conn = Session.connection()
                u = update(harvest_object_table) \
                        .where(harvest_object_table.c.package_id==bindparam('b_package_id')) \
                        .values(current=False)
                conn.execute(u, b_package_id=new_package['id'])

                # Flag this as the current harvest object

                harvest_object.package_id = new_package['id']
                harvest_object.current = True
                harvest_object.save()

            except p.toolkit.ObjectNotFound:
                # Package needs to be created

                # Get rid of auth audit on the context otherwise we'll get an
                # exception
                context.pop('__auth_audit', None)

                # Set name for new package to prevent name conflict, see issue #117
                if package_dict.get('name', None):
                    package_dict['name'] = self._gen_new_name(package_dict['name'])
                else:
                    package_dict['name'] = self._gen_new_name(package_dict['title'])

                log.info('Package with GUID %s does not exist, let\'s create it',
                         harvest_object.guid)
                harvest_object.current = True
                harvest_object.package_id = package_dict['id']
                # Defer constraints and flush so the dataset can be indexed with
                # the harvest object id (on the after_show hook from the harvester
                # plugin)
                harvest_object.add()

                model.Session.execute('SET CONSTRAINTS harvest_object_package_id_fkey DEFERRED')
                model.Session.flush()

                create_action = ('package_create'
                                 if package_dict_form == 'package_show'
                                 else 'package_create_rest')
                new_package = p.toolkit.get_action(create_action)(
                    context, package_dict)

            Session.commit()

            return True

        except p.toolkit.ValidationError as e:
            log.exception(e)
            self._save_object_error('Invalid package with GUID %s: %r'%(harvest_object.guid,e.error_dict),harvest_object,'Import')
            # Same result as falling off the end of the function, made explicit
            return None
Example #40
0
    def test_theme_to_group_mapping(self):
        '''
        Check that dataset themes are mapped to groups on package_update.

        The package is first created with no mapping source configured (no
        groups expected). It is then updated with the test mapping file:
        first with creation of new groups disabled (only the already existing
        group is assigned), then enabled (the mapped groups appear), and
        finally with an extra theme, checking that repeated updates do not
        produce duplicated groups.
        '''
        # multilang requires lang to be set
        # class dummyreq(object):
        #     class p(object):
        #         translator = object()
        #     environ = {'pylons.pylons': p()}

        # CKANRequest(dummyreq)
        # pylons.request = dummyreq()
        # pylons.translator.pylons_lang = ['en_GB']

        #set_lang('en_GB')
        #assert get_lang() == ['en_GB']
        assert 'dcatapit_theme_group_mapper' in config[
            'ckan.plugins'], 'No dcatapit_theme_group_mapper plugin in config'

        with open(get_example_file('dataset.rdf'), 'r') as f:
            contents = f.read()

        parser = RDFParser(profiles=['it_dcat_ap'])

        parser.parse(contents)
        datasets = list(parser.datasets())
        self.assertEqual(len(datasets), 1)
        package_dict = datasets[0]

        # Make sure the user, organization and group used below exist
        user = User.get('dummy')

        if not user:
            user = call_action('user_create',
                               name='dummy',
                               password='******',
                               email='*****@*****.**')
            user_name = user['name']
        else:
            user_name = user.name
        org = Group.by_name('dummy')
        if org is None:
            org = call_action('organization_create',
                              context={'user': user_name},
                              name='dummy',
                              identifier='aaaaaa')
        existing_g = Group.by_name('existing-group')
        if existing_g is None:
            existing_g = call_action('group_create',
                                     context={'user': user_name},
                                     name='existing-group')

        context = {'user': '******', 'ignore_auth': True, 'defer_commit': False}
        package_schema = schema.default_create_package_schema()
        context['schema'] = package_schema
        _p = {
            'frequency': 'manual',
            'publisher_name': 'dummy',
            'extras': [{
                'key': 'theme',
                'value': ['non-mappable', 'thememap1']
            }],
            'groups': [],  #  [{'name':existing_g.name}],
            'title': 'dummy',
            'holder_name': 'dummy',
            'holder_identifier': 'dummy',
            'name': 'dummy-' + uuid4().hex,
            'identifier': 'dummy' + uuid4().hex,
            'notes': 'dummy',
            'owner_org': 'dummy',
            'modified': datetime.now(),
            'publisher_identifier': 'dummy',
            'metadata_created': datetime.now(),
            'metadata_modified': datetime.now(),
            # uuid4 must be called: str(uuid.uuid4) would be the repr of the
            # function object, not a UUID
            'guid': str(uuid.uuid4()),
        }

        package_dict.update(_p)

        config[DCATAPIT_THEME_TO_MAPPING_SOURCE] = ''
        config[DCATAPIT_THEME_TO_MAPPING_ADD_NEW_GROUPS] = 'false'

        package_data = call_action('package_create',
                                   context=context,
                                   **package_dict)

        pkg = Package.get(package_data['id'])

        # no groups should be assigned at this point (no map applied)
        assert {
            'theme': ['non-mappable', 'thememap1']
        } == pkg.extras, '{} vs {}'.format(_p['extras'], pkg.extras)
        assert [] == pkg.get_groups(
            group_type='group'), 'should be {}, got {}'.format(
                [], pkg.get_groups(group_type='group'))

        package_data = call_action('package_show',
                                   context=context,
                                   id=package_data['id'])

        # use test mapping, which replaces thememap1 to thememap2 and thememap3
        test_map_file = os.path.join(os.path.dirname(__file__), '..', '..',
                                     '..', 'examples', 'test_map.ini')

        config[DCATAPIT_THEME_TO_MAPPING_SOURCE] = test_map_file
        config[DCATAPIT_THEME_TO_MAPPING_ADD_NEW_GROUPS] = 'false'

        # package_dict['theme'] = ['non-mappable', 'thememap1']

        package_dict.pop('extras', None)
        pkg = Package.get(package_data['id'])
        context['package'] = pkg

        package_data = call_action('package_update',
                                   context=context,
                                   **package_dict)

        # check - only existing group should be assigned
        pkg = Package.get(package_data['id'])
        groups = [g.name for g in pkg.get_groups(group_type='group')]

        # the map file maps ECON to existing group, and 2 other unexisting groups that will not be created
        expected_groups = ['existing-group']
        self.assertSetEqual(set(expected_groups), set(groups),
                            'Error in assigned groups')

        config[DCATAPIT_THEME_TO_MAPPING_SOURCE] = test_map_file
        config[DCATAPIT_THEME_TO_MAPPING_ADD_NEW_GROUPS] = 'true'

        # package_dict['theme'] = ['non-mappable', 'thememap1']
        package_data = call_action('package_update',
                                   context=context,
                                   **package_dict)

        meta.Session.flush()

        # recheck - this time, new groups should appear
        pkg = Package.get(package_data['id'])
        groups = [g.name for g in pkg.get_groups(group_type='group')]

        # the map file maps ECON to existing group and 2 other groups that have been automatically created
        expected_groups = expected_groups + ['somegroup1', 'somegroup2']
        self.assertSetEqual(set(expected_groups), set(groups), 'Groups differ')

        # package_dict['theme'] = ['non-mappable', 'thememap1', 'thememap-multi']
        aggr = json.loads(package_dict[FIELD_THEMES_AGGREGATE])
        aggr.append({'theme': 'thememap-multi', 'subthemes': []})
        package_dict[FIELD_THEMES_AGGREGATE] = json.dumps(aggr)

        package_data = call_action('package_update',
                                   context=context,
                                   **package_dict)

        meta.Session.flush()

        # recheck - there should be no duplicates
        pkg = Package.get(package_data['id'])
        groups = [g.name for g in pkg.get_groups(group_type='group')]

        # added theme 'thememap-multi', that maps to 'othergroup' and other already existing groups
        expected_groups = expected_groups + ['othergroup']
        self.assertEqual(len(expected_groups), len(groups),
                         'New groups differ - there may be duplicated groups')
        self.assertSetEqual(set(expected_groups), set(groups),
                            'New groups differ')

        # updating again with the same data must not change the groups
        package_data = call_action('package_update',
                                   context=context,
                                   **package_dict)

        meta.Session.flush()

        # recheck - there still should be no duplicates
        pkg = Package.get(package_data['id'])
        groups = [g.name for g in pkg.get_groups(group_type='group')]

        self.assertEqual(len(expected_groups), len(groups),
                         'New groups differ - there may be duplicated groups')
        self.assertSetEqual(set(expected_groups), set(groups),
                            'New groups differ')

        meta.Session.rollback()
def create_package_schema():
    """Return the default create-package schema after passing it through
    ``_modify_schema``."""
    package_schema = default_create_package_schema()
    _modify_schema(package_schema)
    return package_schema
Example #42
0
    def _create_or_update_package(self,
                                  package_dict,
                                  harvest_object,
                                  package_dict_form='rest'):
        '''
        Creates a new package or updates an existing one according to the
        package dictionary provided.

        The package dictionary can be in one of two forms:

        1. 'rest' - as seen on the RESTful API:

                http://datahub.io/api/rest/dataset/1996_population_census_data_canada

           This is the legacy form. It is the default to provide backward
           compatibility.

           * 'extras' is a dict e.g. {'theme': 'health', 'sub-theme': 'cancer'}
           * 'tags' is a list of strings e.g. ['large-river', 'flood']

        2. 'package_show' form, as provided by the Action API (CKAN v2.0+):

               http://datahub.io/api/action/package_show?id=1996_population_census_data_canada

           * 'extras' is a list of dicts
                e.g. [{'key': 'theme', 'value': 'health'},
                        {'key': 'sub-theme', 'value': 'cancer'}]
           * 'tags' is a list of dicts
                e.g. [{'name': 'large-river'}, {'name': 'flood'}]

        Note that the package_dict must contain an id, which will be used to
        check if the package needs to be created or updated (use the remote
        dataset id).

        If the remote server provides the modification date of the remote
        package, add it to package_dict['metadata_modified'].

        :param package_dict: the dataset to create or update. It is modified
            in place: 'name' is overwritten, and 'tags' is rewritten when the
            'clean_tags' config option is set.
        :param harvest_object: the harvest object being imported; it is
            linked to the resulting package and flagged as current.
        :param package_dict_form: 'rest' (default) or 'package_show'; selects
            between the ``*_rest`` and the plain create/update actions.
        :returns: True if the create or update occurred ok, 'unchanged' if
            the existing package didn't need updating, or None (falsy) if
            the package failed validation, in which case the error is saved
            against the harvest object.
        :raises ValueError: if the configured 'api_version' is not an integer.

        TODO: Not sure it is worth keeping this function. If useful it should
        use the output of package_show logic function (maybe keeping support
        for rest api based dicts)
        '''
        assert package_dict_form in ('rest', 'package_show')
        try:
            # Change default schema
            schema = default_create_package_schema()
            schema['id'] = [ignore_missing, unicode]
            schema['__junk'] = [ignore]

            # Check API version
            if self.config:
                try:
                    api_version = int(self.config.get('api_version', 2))
                except ValueError:
                    raise ValueError('api_version must be an integer')
            else:
                api_version = 2

            user_name = self._get_user_name()
            context = {
                'model': model,
                'session': Session,
                'user': user_name,
                'api_version': api_version,
                'schema': schema,
                'ignore_auth': True,
            }

            if self.config and self.config.get('clean_tags', False):
                # Munge every tag once, drop the ones that munge to an empty
                # string and remove duplicates.
                # NOTE(review): per the docstring, tags are dicts in the
                # 'package_show' form; each element is passed to munge_tag
                # as-is - confirm that form is handled.
                munged_tags = [
                    munge_tag(t) for t in package_dict.get('tags', [])
                ]
                package_dict['tags'] = list(
                    set(t for t in munged_tags if t != ''))

            # Check if package exists
            try:
                # _find_existing_package can be overridden if necessary
                existing_package_dict = self._find_existing_package(
                    package_dict)

                # In case name has been modified when first importing. See issue #101.
                package_dict['name'] = existing_package_dict['name']

                # Check modified date: update when the remote date is unknown
                # or newer than the one stored locally
                if 'metadata_modified' not in package_dict or \
                   package_dict['metadata_modified'] > existing_package_dict.get('metadata_modified'):
                    log.info(
                        'Package with GUID %s exists and needs to be updated',
                        harvest_object.guid)
                    # Update package
                    context.update({'id': package_dict['id']})

                    update_action = ('package_update'
                                     if package_dict_form == 'package_show'
                                     else 'package_update_rest')
                    new_package = p.toolkit.get_action(update_action)(
                        context, package_dict)

                else:
                    log.info(
                        'No changes to package with GUID %s, skipping...',
                        harvest_object.guid)
                    # NB harvest_object.current/package_id are not set
                    return 'unchanged'

                # Flag the other objects linking to this package as not current anymore
                from ckanext.harvest.model import harvest_object_table
                conn = Session.connection()
                u = update(harvest_object_table) \
                        .where(harvest_object_table.c.package_id==bindparam('b_package_id')) \
                        .values(current=False)
                conn.execute(u, b_package_id=new_package['id'])

                # Flag this as the current harvest object

                harvest_object.package_id = new_package['id']
                harvest_object.current = True
                harvest_object.save()

            except p.toolkit.ObjectNotFound:
                # Package needs to be created

                # Get rid of auth audit on the context otherwise we'll get an
                # exception
                context.pop('__auth_audit', None)

                # Set name for new package to prevent name conflict, see issue #117
                if package_dict.get('name', None):
                    package_dict['name'] = self._gen_new_name(
                        package_dict['name'])
                else:
                    package_dict['name'] = self._gen_new_name(
                        package_dict['title'])

                log.info(
                    'Package with GUID %s does not exist, let\'s create it',
                    harvest_object.guid)
                harvest_object.current = True
                harvest_object.package_id = package_dict['id']
                # Defer constraints and flush so the dataset can be indexed with
                # the harvest object id (on the after_show hook from the harvester
                # plugin)
                harvest_object.add()

                model.Session.execute(
                    'SET CONSTRAINTS harvest_object_package_id_fkey DEFERRED')
                model.Session.flush()

                create_action = ('package_create'
                                 if package_dict_form == 'package_show'
                                 else 'package_create_rest')
                new_package = p.toolkit.get_action(create_action)(
                    context, package_dict)

            Session.commit()

            return True

        except p.toolkit.ValidationError as e:
            log.exception(e)
            self._save_object_error(
                'Invalid package with GUID %s: %r' %
                (harvest_object.guid, e.error_dict), harvest_object, 'Import')
            # Same result as falling off the end of the function, made explicit
            return None
Example #43
0
    def _create_or_update_package(self, package_dict, harvest_object):
        '''
        Creates a new package or updates an existing one according to the
        package dictionary provided. The package dictionary should look like
        the REST API response for a package:

        http://ckan.net/api/rest/package/statistics-catalunya

        Note that the package_dict must contain an id, which will be used to
        check if the package needs to be created or updated (use the remote
        dataset id).

        If the remote server provides the modification date of the remote
        package, add it to package_dict['metadata_modified'].

        :param package_dict: the dataset to create or update. It is modified
            in place: 'name' is overwritten, and 'tags' is rewritten when the
            'clean_tags' config option is set.
        :param harvest_object: the harvest object being imported; it is
            linked to the resulting package and flagged as current.
        :returns: True if the create or update occurred ok, 'unchanged' if
            the existing package didn't need updating, or None (falsy) if
            the package failed validation, in which case the error is saved
            against the harvest object.
        :raises ValueError: if the configured 'api_version' is not an integer.

        TODO: Not sure it is worth keeping this function. If useful it should
        use the output of package_show logic function (maybe keeping support
        for rest api based dicts)
        '''
        try:
            # Change default schema
            schema = default_create_package_schema()
            schema['id'] = [ignore_missing, unicode]
            schema['__junk'] = [ignore]

            # Check API version
            if self.config:
                try:
                    api_version = int(self.config.get('api_version', 2))
                except ValueError:
                    raise ValueError('api_version must be an integer')
            else:
                api_version = 2

            user_name = self._get_user_name()
            context = {
                'model': model,
                'session': Session,
                'user': user_name,
                'api_version': api_version,
                'schema': schema,
                'ignore_auth': True,
            }

            if self.config and self.config.get('clean_tags', False):
                # Munge every tag once, drop the ones that munge to an empty
                # string and remove duplicates
                munged_tags = [munge_tag(t)
                               for t in package_dict.get('tags', [])]
                package_dict['tags'] = list(
                    set(t for t in munged_tags if t != ''))

            # Check if package exists
            try:
                existing_package_dict = self._find_existing_package(package_dict)

                # In case name has been modified when first importing. See issue #101.
                package_dict['name'] = existing_package_dict['name']

                # Check modified date: update when the remote date is unknown
                # or newer than the one stored locally
                if 'metadata_modified' not in package_dict or \
                   package_dict['metadata_modified'] > existing_package_dict.get('metadata_modified'):
                    log.info('Package with GUID %s exists and needs to be updated',
                             harvest_object.guid)
                    # Update package
                    context.update({'id': package_dict['id']})
                    new_package = get_action('package_update_rest')(context, package_dict)

                else:
                    log.info('Package with GUID %s not updated, skipping...',
                             harvest_object.guid)
                    # NB harvest_object.current/package_id are not set
                    return 'unchanged'

                # Flag the other objects linking to this package as not current anymore
                from ckanext.harvest.model import harvest_object_table
                conn = Session.connection()
                u = update(harvest_object_table) \
                        .where(harvest_object_table.c.package_id==bindparam('b_package_id')) \
                        .values(current=False)
                conn.execute(u, b_package_id=new_package['id'])

                # Flag this as the current harvest object

                harvest_object.package_id = new_package['id']
                harvest_object.current = True
                harvest_object.save()

            except NotFound:
                # Package needs to be created

                # Get rid of auth audit on the context otherwise we'll get an
                # exception
                context.pop('__auth_audit', None)

                # Set name for new package to prevent name conflict, see issue #117
                if package_dict.get('name', None):
                    package_dict['name'] = self._gen_new_name(package_dict['name'])
                else:
                    package_dict['name'] = self._gen_new_name(package_dict['title'])

                log.info('Package with GUID %s does not exist, let\'s create it',
                         harvest_object.guid)
                harvest_object.current = True
                harvest_object.package_id = package_dict['id']
                # Defer constraints and flush so the dataset can be indexed with
                # the harvest object id (on the after_show hook from the harvester
                # plugin)
                harvest_object.add()

                model.Session.execute('SET CONSTRAINTS harvest_object_package_id_fkey DEFERRED')
                model.Session.flush()

                new_package = get_action('package_create_rest')(context, package_dict)

            Session.commit()

            return True

        except ValidationError as e:
            log.exception(e)
            self._save_object_error('Invalid package with GUID %s: %r'%(harvest_object.guid,e.error_dict),harvest_object,'Import')
            # Same result as falling off the end of the function, made explicit
            return None
Example #44
0
    def _create_or_update_package(self, package_dict, harvest_object):
        '''
        Creates a new package according to the package dictionary provided,
        unless a package with the same id already exists, in which case it is
        left untouched. The package dictionary should look like the REST API
        response for a package:

        http://ckan.net/api/rest/package/statistics-catalunya

        Note that the package_dict must contain an id, which will be used to
        check if the package needs to be created (use the remote dataset id).

        :param package_dict: the dataset to create. It is modified in place:
            'tags' defaults to an empty list and, when the package is
            created, 'name' is generated from 'title'.
        :param harvest_object: the harvest object being imported; when the
            package is created it is linked to it and flagged as current.
        :returns: True once the session has been committed, whether the
            package was created or already existed; None (falsy) if the
            package failed validation, in which case the error is saved
            against the harvest object.
        :raises ValueError: if the configured 'api_version' is not an integer.

        TODO: Not sure it is worth keeping this function. If useful it should
        use the output of package_show logic function (maybe keeping support
        for rest api based dicts)
        '''
        try:
            # Change default schema
            schema = default_create_package_schema()
            schema['id'] = [ignore_missing, unicode]
            schema['__junk'] = [ignore]

            # Check API version
            if self.config:
                try:
                    api_version = int(self.config.get('api_version', 2))
                except ValueError:
                    raise ValueError('api_version must be an integer')

                #TODO: use site user when available
                user_name = self.config.get('user', self._get_user_name())
            else:
                api_version = 2
                user_name = self._get_user_name()

            context = {
                'model': model,
                'session': Session,
                'user': user_name,
                'api_version': api_version,
                'schema': schema,
                'ignore_auth': True,
            }

            # Make sure the 'tags' key is always present
            package_dict.setdefault('tags', [])

            # Check if package exists
            data_dict = {'id': package_dict['id']}
            try:
                # Only called for its NotFound side effect; the returned
                # dict is not needed as existing packages are never updated
                get_action('package_show')(context, data_dict)
                log.info('Package with GUID %s not updated, skipping...',
                         harvest_object.guid)

            except NotFound:
                # Package needs to be created

                # Check if name has not already been used
                package_dict['name'] = self._gen_new_name(
                    package_dict['title'])

                log.info(
                    'Package with GUID %s does not exist, let\'s create it',
                    harvest_object.guid)
                harvest_object.current = True
                harvest_object.package_id = package_dict['id']
                # Defer constraints and flush so the dataset can be indexed with
                # the harvest object id (on the after_show hook from the harvester
                # plugin)
                harvest_object.add()

                model.Session.execute(
                    'SET CONSTRAINTS harvest_object_package_id_fkey DEFERRED')
                model.Session.flush()

                get_action('package_create_rest')(context, package_dict)

            Session.commit()

            return True

        except ValidationError as e:
            log.exception(e)
            self._save_object_error(
                'Invalid package with GUID %s: %r' %
                (harvest_object.guid, e.error_dict), harvest_object, 'Import')
            # Same result as falling off the end of the function, made explicit
            return None
Example #45
0
    def test_1_package_schema(self):
        """Run the dictized ``annakarenina`` package through the package schemas.

        Three validation passes are checked:

        1. Renamed to ``anna2`` and validated with the create schema: the
           converted data must equal the expected dict, with no errors.
        2. Renamed back to ``annakarenina``, with the title removed and the
           resource URLs altered, validated with the create schema: only the
           "URL is already in use" name error is expected.
        3. Given the package id and an invalid name, validated with the
           update schema: only the name-format error is expected.
        """
        anna = (
            model.Session.query(model.Package)
            .filter_by(name="annakarenina")
            .first()
        )
        anna_id = anna.id

        dictized = package_dictize(anna, self.context)
        self.remove_changable_columns(dictized)
        dictized["name"] = "anna2"
        # The relationship keys are dropped from the dictized package
        # before it is validated.
        for relationship_key in (
            "relationships_as_object",
            "relationships_as_subject",
        ):
            del dictized[relationship_key]

        validated, errors = validate(
            dictized, default_create_package_schema(), self.context
        )

        expected_extras = [
            {"key": u"genre", "value": u"romantic novel"},
            {"key": u"original media", "value": u"book"},
        ]
        expected_groups = [
            {u"name": u"david", u"title": u"Dave's books"},
            {u"name": u"roger", u"title": u"Roger's books"},
        ]
        expected_resources = [
            {
                "alt_url": u"alt123",
                "description": u'Full text. Needs escaping: " Umlaut: \xfc',
                "format": u"plain text",
                "hash": u"abc123",
                "size_extra": u"123",
                "url": u"http://datahub.io/download/x=1&y=2",
            },
            {
                "alt_url": u"alt345",
                "description": u"Index of the novel",
                "format": u"JSON",
                "hash": u"def456",
                "size_extra": u"345",
                "url": u"http://datahub.io/index.json",
            },
        ]
        expected_tags = [
            {"name": u"Flexible \u30a1"},
            {"name": u"russian"},
            {"name": u"tolstoy"},
        ]
        expected = {
            "extras": expected_extras,
            "groups": expected_groups,
            "license_id": u"other-open",
            "name": u"anna2",
            "type": u"dataset",
            "notes": u"Some test notes\n\n### A 3rd level heading\n\n**Some bolded text.**\n\n*Some italicized text.*\n\nForeign characters:\nu with umlaut \xfc\n66-style quote \u201c\nforeign word: th\xfcmb\n\nNeeds escaping:\nleft arrow <\n\n<http://ckan.net/>\n\n",
            "private": False,
            "resources": expected_resources,
            "tags": expected_tags,
            "title": u"A Novel By Tolstoy",
            "url": u"http://datahub.io",
            "version": u"0.7a",
        }

        assert validated == expected, pformat(validated)
        assert not errors, errors

        # Second pass: reuse the validated output, clash the name with the
        # stored package, remove the title and alter both resource URLs.
        payload = validated
        payload["name"] = u"annakarenina"
        del payload["title"]
        resources = payload["resources"]
        resources[0]["url"] = "fsdfafasfsaf"
        del resources[1]["url"]

        validated, errors = validate(
            payload, default_create_package_schema(), self.context
        )

        name_taken = {"name": [u"That URL is already in use."]}
        assert errors == name_taken, pformat(errors)

        # Third pass: update schema, with the package id and a name that
        # contains characters the name validator rejects.
        payload["id"] = anna_id
        payload["name"] = "????jfaiofjioafjij"

        validated, errors = validate(
            payload, default_update_package_schema(), self.context
        )
        bad_name = {
            "name": [
                u"Must be purely lowercase alphanumeric (ascii) "
                "characters and these symbols: -_"
            ]
        }
        assert errors == bad_name, pformat(errors)
Example #46
0
File: base.py Project: tbalaz/test
    def _create_or_update_package(self, package_dict, harvest_object):
        '''
        Creates a new package or updates an existing one according to the
        package dictionary provided. The package dictionary should look like
        the REST API response for a package:

        http://ckan.net/api/rest/package/statistics-catalunya

        Note that the package_dict must contain an id, which will be used to
        check if the package needs to be created or updated (use the remote
        dataset id).

        If the remote server provides the modification date of the remote
        package, add it to package_dict['metadata_modified'].

        :param package_dict: package dictionary; its 'tags' and 'name'
            entries are rewritten in place.
        :param harvest_object: harvest object; on success its package_id is
            set, it is flagged as current and saved.
        :returns: True if the package was created or updated. None if the
            existing package did not need updating, or if a ValidationError
            was raised (it is logged and passed to _save_object_error).
        '''
        try:
            # Change default schema
            schema = default_create_package_schema()
            schema['id'] = [ignore_missing, unicode]
            schema['__junk'] = [ignore]

            # Check API version
            if self.config:
                api_version = self.config.get('api_version','2')
                #TODO: use site user when available
                user_name = self.config.get('user',u'harvest')
            else:
                api_version = '2'
                user_name = u'harvest'

            context = {
                'model': model,
                'session': Session,
                'user': user_name,
                'api_version': api_version,
                'schema': schema,
            }

            # Munge every tag exactly once, drop the ones that munge to an
            # empty string and de-duplicate the rest.
            munged_tags = (munge_tag(t) for t in package_dict.get('tags', []))
            package_dict['tags'] = list(set(t for t in munged_tags if t != ''))

            # Check if package exists
            data_dict = {}
            data_dict['id'] = package_dict['id']
            try:
                existing_package_dict = get_action('package_show')(context, data_dict)

                # In case name has been modified when first importing. See issue #101.
                package_dict['name'] = existing_package_dict['name']

                # Check modified date: update when the remote did not supply
                # one, or when it is newer than the stored one.
                if 'metadata_modified' not in package_dict or \
                   package_dict['metadata_modified'] > existing_package_dict.get('metadata_modified'):
                    log.info('Package with GUID %s exists and needs to be updated' % harvest_object.guid)
                    # Update package
                    context.update({'id':package_dict['id']})
                    new_package = get_action('package_update_rest')(context, package_dict)

                else:
                    log.info('Package with GUID %s not updated, skipping...' % harvest_object.guid)
                    return

            except NotFound:
                # Package needs to be created

                # Get rid of auth audit on the context otherwise we'll get an
                # exception
                context.pop('__auth_audit', None)

                # Check if name has not already been used
                package_dict['name'] = self._check_name(package_dict['name'])

                log.info('Package with GUID %s does not exist, let\'s create it' % harvest_object.guid)
                new_package = get_action('package_create_rest')(context, package_dict)
                harvest_object.package_id = new_package['id']

            # Flag the other objects linking to this package as not current anymore
            from ckanext.harvest.model import harvest_object_table
            conn = Session.connection()
            u = update(harvest_object_table) \
                    .where(harvest_object_table.c.package_id==bindparam('b_package_id')) \
                    .values(current=False)
            conn.execute(u, b_package_id=new_package['id'])
            Session.commit()

            # Flag this as the current harvest object

            harvest_object.package_id = new_package['id']
            harvest_object.current = True
            harvest_object.save()

            return True

        except ValidationError as e:
            # "as" form is valid on Python 2.6+ and Python 3, unlike the
            # comma form.
            log.exception(e)
            self._save_object_error('Invalid package with GUID %s: %r'%(harvest_object.guid,e.error_dict),harvest_object,'Import')
Example #47
0
    def test_package_schema(self):
        """Validate a factory-built dataset against the package schemas.

        A dataset model is created from the expected data, dictized, renamed
        and validated with the create schema; the converted data must equal
        the expected data with no errors. The data is then renamed back to
        the first name (expecting the "URL is already in use" error) and
        finally validated with the update schema under an invalid name
        (expecting the name-format error).
        """
        daves_group = factories.Group(title="Dave's books")
        rogers_group = factories.Group(title="Roger's books")
        stored_name = factories.Dataset.stub().name
        fresh_name = factories.Dataset.stub().name

        # Three stub tag names, ordered by name.
        tag_dicts = [{"name": factories.Tag.stub().name} for _ in range(3)]
        tag_dicts.sort(key=operator.itemgetter("name"))

        expected = {
            "extras": [
                {"key": u"genre", "value": u"romantic novel"},
                {"key": u"original media", "value": u"book"},
            ],
            "groups": [
                {u"name": daves_group["name"], u"title": daves_group["title"]},
                {u"name": rogers_group["name"], u"title": rogers_group["title"]},
            ],
            "license_id": u"other-open",
            "name": stored_name,
            "type": u"dataset",
            "notes": u"Some test notes\n\n### A 3rd level heading\n\n**Some bolded text.**\n\n*Some italicized text.*\n\nForeign characters:\nu with umlaut \xfc\n66-style quote \u201c\nforeign word: th\xfcmb\n\nNeeds escaping:\nleft arrow <\n\n<http://ckan.net/>\n\n",
            "private": False,
            "resources": [
                {
                    "alt_url": u"alt123",
                    "description": u'Full text. Needs escaping: " Umlaut: \xfc',
                    "format": u"plain text",
                    "hash": u"abc123",
                    "size_extra": u"123",
                    "url": u"http://datahub.io/download/x=1&y=2",
                },
                {
                    "alt_url": u"alt345",
                    "description": u"Index of the novel",
                    "format": u"JSON",
                    "hash": u"def456",
                    "size_extra": u"345",
                    "url": u"http://datahub.io/index.json",
                },
            ],
            "tags": tag_dicts,
            "title": u"A Novel By Tolstoy",
            "url": u"http://datahub.io",
            "version": u"0.7a",
            "relationships_as_subject": [],
            "relationships_as_object": [],
        }

        ctx = {"model": model, "session": model.Session}
        dataset = factories.Dataset.model(**expected)
        dataset_id = dataset.id

        dictized = package_dictize(dataset, ctx)
        self.remove_changable_columns(dictized)

        # First pass: validate under a name that is not yet stored.
        dictized["name"] = fresh_name
        expected["name"] = fresh_name
        validated, errors = validate(
            dictized, default_create_package_schema(), ctx
        )

        assert validated == expected, pformat(validated)
        assert not errors, errors

        # Second pass: clash the name with the stored dataset, remove the
        # title and alter both resource URLs.
        payload = validated
        payload["name"] = stored_name
        del payload["title"]
        resources = payload["resources"]
        resources[0]["url"] = "fsdfafasfsaf"
        del resources[1]["url"]

        validated, errors = validate(
            payload, default_create_package_schema(), ctx
        )

        name_taken = {"name": [u"That URL is already in use."]}
        assert errors == name_taken, pformat(errors)

        # Third pass: update schema, with the dataset id and a name that
        # contains characters the name validator rejects.
        payload["id"] = dataset_id
        payload["name"] = "????jfaiofjioafjij"

        validated, errors = validate(
            payload, default_update_package_schema(), ctx
        )
        bad_name = {
            "name": [
                u"Must be purely lowercase alphanumeric (ascii) "
                "characters and these symbols: -_"
            ]
        }
        assert errors == bad_name, pformat(errors)
    def test_1_package_schema(self):
        """Run the dictized ``annakarenina`` package through the package schemas.

        Checks, in order: a clean create-schema validation under the name
        ``anna2``; a create-schema validation that reports a name clash and
        a missing resource URL; an update-schema validation (with the
        package id) that reports only the missing resource URL; and an
        update-schema validation with an invalid name.
        """
        package_query = model.Session.query(model.Package)
        anna = package_query.filter_by(name='annakarenina').first()
        anna_id = anna.id

        dictized = package_dictize(anna, self.context)
        self.remove_changable_columns(dictized)
        dictized['name'] = 'anna2'
        # The relationship keys are dropped from the dictized package
        # before it is validated.
        for relationship_key in ('relationships_as_object',
                                 'relationships_as_subject'):
            del dictized[relationship_key]

        validated, errors = validate(
            dictized, default_create_package_schema(), self.context)

        first_resource = {
            'alt_url': u'alt123',
            'description': u'Full text. Needs escaping: " Umlaut: \xfc',
            'format': u'plain text',
            'hash': u'abc123',
            'size_extra': u'123',
            'url': u'http://datahub.io/download/x=1&y=2',
        }
        second_resource = {
            'alt_url': u'alt345',
            'description': u'Index of the novel',
            'format': u'JSON',
            'hash': u'def456',
            'size_extra': u'345',
            'url': u'http://datahub.io/index.json',
        }
        expected = {
            'extras': [
                {'key': u'genre', 'value': u'romantic novel'},
                {'key': u'original media', 'value': u'book'},
            ],
            'groups': [
                {u'name': u'david', u'title': u"Dave's books"},
                {u'name': u'roger', u'title': u"Roger's books"},
            ],
            'license_id': u'other-open',
            'name': u'anna2',
            'type': u'dataset',
            'notes': u'Some test notes\n\n### A 3rd level heading\n\n**Some bolded text.**\n\n*Some italicized text.*\n\nForeign characters:\nu with umlaut \xfc\n66-style quote \u201c\nforeign word: th\xfcmb\n\nNeeds escaping:\nleft arrow <\n\n<http://ckan.net/>\n\n',
            'private': False,
            'resources': [first_resource, second_resource],
            'tags': [
                {'name': u'Flexible \u30a1'},
                {'name': u'russian'},
                {'name': u'tolstoy'},
            ],
            'title': u'A Novel By Tolstoy',
            'url': u'http://datahub.io',
            'version': u'0.7a',
        }

        assert validated == expected, pformat(validated)
        assert not errors, errors

        # Clash the name with the stored package, remove the title and
        # alter both resource URLs.
        payload = validated
        payload['name'] = u'annakarenina'
        del payload["title"]
        resources = payload["resources"]
        resources[0]["url"] = 'fsdfafasfsaf'
        del resources[1]["url"]

        # Every remaining pass expects the second resource to be flagged
        # for its missing url.
        resource_errors = [{}, {'url': [u'Missing value']}]

        validated, errors = validate(
            payload, default_create_package_schema(), self.context)
        expected_errors = {
            'name': [u'That URL is already in use.'],
            'resources': resource_errors,
        }
        assert errors == expected_errors, pformat(errors)

        # With the package id set, the update schema reports no name error.
        payload["id"] = anna_id
        validated, errors = validate(
            payload, default_update_package_schema(), self.context)
        expected_errors = {'resources': resource_errors}
        assert errors == expected_errors, pformat(errors)

        # A name with characters the name validator rejects.
        payload['name'] = '????jfaiofjioafjij'
        validated, errors = validate(
            payload, default_update_package_schema(), self.context)
        expected_errors = {
            'name': [u'Must be purely lowercase alphanumeric (ascii) '
                     'characters and these symbols: -_'],
            'resources': resource_errors,
        }
        assert errors == expected_errors, pformat(errors)
Example #49
0
 def create_package_schema(self):
     """Return the result of ``default_schema.default_create_package_schema()`` unchanged."""
     create_schema = default_schema.default_create_package_schema()
     return create_schema
Example #50
0
# Resource field names, in this fixed order.
RESOURCE_FIELDS = [
    'name', 'resource_type', 'url',
    'size', 'format', 'language',
]

# Whatever iterating default_resource_schema() yields
# (presumably the schema's field names -- confirm against the schema).
EXISTING_RESOURCE_FIELDS = set(default_resource_schema())

# Resource fields listed as bilingual.
BILINGUAL_RESOURCE_FIELDS = set(['name'])

# Entries of the default create-package schema, plus 'spatial'.
EXISTING_FIELDS = set(default_create_package_schema()).union(['spatial'])

# The field order here must match the proposed schema spreadsheet
ProposedField = namedtuple("ProposedField", """
    class_
    sub_class
    property_name
    property_label
    iso_multiplicity
    property_name_fra
    property_label_fra
    gc_multiplicity
    type_
    ckan_type
    description
Example #51
0
# Resource field names, in this fixed order.
RESOURCE_FIELDS = [
    'name', 'resource_type', 'url',
    'size', 'format', 'language',
]

# Whatever iterating default_resource_schema() yields
# (presumably the schema's field names -- confirm against the schema).
EXISTING_RESOURCE_FIELDS = set(default_resource_schema())

# Resource fields listed as bilingual.
BILINGUAL_RESOURCE_FIELDS = set(['name'])

# Entries of the default create-package schema, plus 'spatial'.
EXISTING_FIELDS = set(default_create_package_schema()).union(['spatial'])

# The field order here must match the proposed schema spreadsheet
ProposedField = namedtuple(
    "ProposedField", """
    class_
    sub_class
    property_name
    property_label
    iso_multiplicity
    property_name_fra
    property_label_fra
    gc_multiplicity
    type_
    ckan_type
    description
Example #52
0
 def create_package_schema(self):
     """Return the default create-package schema with a ``vocab_tags`` entry added.

     The ``vocab_tags`` validators are ``ignore_missing`` followed by
     ``convert_to_tags(TEST_VOCAB_NAME)``.
     """
     package_schema = default_create_package_schema()
     package_schema['vocab_tags'] = [
         ignore_missing,
         convert_to_tags(TEST_VOCAB_NAME),
     ]
     return package_schema
    def test_1_package_schema(self):
        """Run the dictized ``annakarenina`` package through the package schemas.

        Checks, in order: a clean create-schema validation under the name
        ``anna2``; a create-schema validation that reports a name clash and
        a missing resource URL; an update-schema validation (with the
        package id) that reports only the missing resource URL; and an
        update-schema validation with an invalid name.
        """
        package_query = model.Session.query(model.Package)
        anna = package_query.filter_by(name='annakarenina').first()
        anna_id = anna.id

        dictized = package_dictize(anna, self.context)
        self.remove_changable_columns(dictized)
        dictized['name'] = 'anna2'
        # The relationship keys are dropped from the dictized package
        # before it is validated.
        for relationship_key in ('relationships_as_object',
                                 'relationships_as_subject'):
            del dictized[relationship_key]

        validated, errors = validate(
            dictized, default_create_package_schema(), self.context)

        first_resource = {
            'alt_url': u'alt123',
            'description': u'Full text. Needs escaping: " Umlaut: \xfc',
            'format': u'plain text',
            'hash': u'abc123',
            'size_extra': u'123',
            'url': u'http://www.annakarenina.com/download/x=1&y=2',
        }
        second_resource = {
            'alt_url': u'alt345',
            'description': u'Index of the novel',
            'format': u'JSON',
            'hash': u'def456',
            'size_extra': u'345',
            'url': u'http://www.annakarenina.com/index.json',
        }
        expected = {
            'extras': [
                {'key': u'genre', 'value': u'romantic novel'},
                {'key': u'original media', 'value': u'book'},
            ],
            'groups': [
                {u'name': u'david', u'title': u"Dave's books"},
                {u'name': u'roger', u'title': u"Roger's books"},
            ],
            'license_id': u'other-open',
            'name': u'anna2',
            'type': u'dataset',
            'notes': u'Some test notes\n\n### A 3rd level heading\n\n**Some bolded text.**\n\n*Some italicized text.*\n\nForeign characters:\nu with umlaut \xfc\n66-style quote \u201c\nforeign word: th\xfcmb\n\nNeeds escaping:\nleft arrow <\n\n<http://ckan.net/>\n\n',
            'private': False,
            'resources': [first_resource, second_resource],
            'tags': [
                {'name': u'Flexible \u30a1'},
                {'name': u'russian'},
                {'name': u'tolstoy'},
            ],
            'title': u'A Novel By Tolstoy',
            'url': u'http://www.annakarenina.com',
            'version': u'0.7a',
        }

        assert validated == expected, pformat(validated)
        assert not errors, errors

        # Clash the name with the stored package, remove the title and
        # alter both resource URLs.
        payload = validated
        payload['name'] = u'annakarenina'
        del payload["title"]
        resources = payload["resources"]
        resources[0]["url"] = 'fsdfafasfsaf'
        del resources[1]["url"]

        # Every remaining pass expects the second resource to be flagged
        # for its missing url.
        resource_errors = [{}, {'url': [u'Missing value']}]

        validated, errors = validate(
            payload, default_create_package_schema(), self.context)
        expected_errors = {
            'name': [u'That URL is already in use.'],
            'resources': resource_errors,
        }
        assert errors == expected_errors, pformat(errors)

        # With the package id set, the update schema reports no name error.
        payload["id"] = anna_id
        validated, errors = validate(
            payload, default_update_package_schema(), self.context)
        expected_errors = {'resources': resource_errors}
        assert errors == expected_errors, pformat(errors)

        # A name with characters the name validator rejects.
        payload['name'] = '????jfaiofjioafjij'
        validated, errors = validate(
            payload, default_update_package_schema(), self.context)
        expected_errors = {
            'name': [
                u'Url must be purely lowercase alphanumeric (ascii) '
                'characters and these symbols: -_'
            ],
            'resources': resource_errors,
        }
        assert errors == expected_errors, pformat(errors)
    def test_mapping(self):
        """Check theme-to-group mapping across package create and updates.

        Steps:

        1. Parse ``dataset.rdf`` with the ``it_dcat_ap`` profile and take the
           single resulting dataset dict.
        2. Make sure the ``dummy`` user, ``dummy`` organization and
           ``existing-group`` group exist.
        3. Create the package with the mapping source config set to an empty
           string: the theme extra is stored and no groups are assigned.
        4. Point the mapping source config at ``test_map.ini`` and update the
           package: only ``existing-group`` is expected.
        5. Set the add-new-groups config to ``'true'`` and update again:
           ``somegroup1`` and ``somegroup2`` are expected as well.
        6. Add the ``thememap-multi`` theme and update twice: ``othergroup``
           is expected too, without duplicates.

        The session is rolled back at the end. The config keys written here
        are not restored by this test.
        """

        # multilang requires lang to be set
        from pylons.i18n.translation import set_lang, get_lang
        import pylons
        class dummyreq(object):
            class p(object):
                translator = object()
            environ = {'pylons.pylons': p()}
        pylons.request = dummyreq()
        pylons.translator.pylons_lang = ['en_GB']
        set_lang('en_GB')
        assert get_lang() == ['en_GB']

        assert 'dcatapit_theme_group_mapper' in config['ckan.plugins'], "No dcatapit_theme_group_mapper plugin in config"
        contents = self._get_file_contents('dataset.rdf')

        p = RDFParser(profiles=['it_dcat_ap'])

        p.parse(contents)
        datasets = list(p.datasets())
        eq_(len(datasets), 1)
        package_dict = datasets[0]

        # Reuse the fixtures when they already exist, create them otherwise.
        user = User.get('dummy')

        if not user:
            user = call_action('user_create',
                               name='dummy',
                               password='******',
                               email='*****@*****.**')
            user_name = user['name']
        else:
            user_name = user.name
        org = Group.by_name('dummy')
        if org is None:
            org  = call_action('organization_create',
                                context={'user': user_name},
                                name='dummy',
                                identifier='aaaaaa')
        existing_g = Group.by_name('existing-group')
        if existing_g is None:
            existing_g  = call_action('group_create',
                                      context={'user': user_name},
                                      name='existing-group')

        context = {'user': '******',
                   'ignore_auth': True,
                   'defer_commit': False}
        package_schema = schema.default_create_package_schema()
        context['schema'] = package_schema
        _p = {'frequency': 'manual',
              'publisher_name': 'dummy',
              'extras': [{'key':'theme', 'value':['non-mappable', 'thememap1']}],
              'groups': [],
              'title': 'dummy',
              'holder_name': 'dummy',
              'holder_identifier': 'dummy',
              'name': 'dummy',
              'notes': 'dummy',
              'owner_org': 'dummy',
              'modified': datetime.now(),
              'publisher_identifier': 'dummy',
              'metadata_created' : datetime.now(),
              'metadata_modified': datetime.now(),
              # uuid4 must be *called*: stringifying the function itself
              # yields its repr, not a UUID.
              'guid': unicode(uuid.uuid4()),
              'identifier': 'dummy'}

        package_dict.update(_p)
        # Mapping source set to an empty string for the create step.
        config[DCATAPIT_THEME_TO_MAPPING_SOURCE] = ''
        package_data = call_action('package_create', context=context, **package_dict)

        p = Package.get(package_data['id'])

        # no groups should be assigned at this point (no map applied)
        assert {'theme': ['non-mappable', 'thememap1']} == p.extras, '{} vs {}'.format(_p['extras'], p.extras)
        assert [] == p.get_groups(group_type='group'), 'should be {}, got {}'.format([], p.get_groups(group_type='group'))

        package_data = call_action('package_show', context=context, id=package_data['id'])

        # use test mapping, which replaces thememap1 to thememap2 and thememap3
        test_map_file = os.path.join(os.path.dirname(__file__), '..', '..', '..', 'examples', 'test_map.ini')
        config[DCATAPIT_THEME_TO_MAPPING_SOURCE] = test_map_file

        package_dict['theme'] = ['non-mappable', 'thememap1']

        expected_groups_existing = ['existing-group']
        expected_groups_new = expected_groups_existing + ['somegroup1', 'somegroup2']
        expected_groups_multi = expected_groups_new + ['othergroup']

        package_dict.pop('extras', None)
        p = Package.get(package_data['id'])
        context['package'] = p

        package_data = call_action('package_update',
                                   context=context,
                                   **package_dict)

        #meta.Session.flush()
        #meta.Session.revision = repo.new_revision()

        # check - only existing group should be assigned
        p = Package.get(package_data['id'])
        groups = [g.name for g in p.get_groups(group_type='group')]

        assert expected_groups_existing == groups, (expected_groups_existing, 'vs', groups,)

        config[DCATAPIT_THEME_TO_MAPPING_ADD_NEW_GROUPS] = 'true'


        package_dict['theme'] = ['non-mappable', 'thememap1']
        package_data = call_action('package_update', context=context, **package_dict)


        meta.Session.flush()
        meta.Session.revision = repo.new_revision()

        # recheck - this time, new groups should appear
        p = Package.get(package_data['id'])
        groups = [g.name for g in p.get_groups(group_type='group')]

        assert len(expected_groups_new) == len(groups), (expected_groups_new, 'vs', groups,)
        assert set(expected_groups_new) == set(groups), (expected_groups_new, 'vs', groups,)

        package_dict['theme'] = ['non-mappable', 'thememap1', 'thememap-multi']
        package_data = call_action('package_update', context=context, **package_dict)

        meta.Session.flush()
        meta.Session.revision = repo.new_revision()

        # recheck - there should be no duplicates
        p = Package.get(package_data['id'])
        groups = [g.name for g in p.get_groups(group_type='group')]

        assert len(expected_groups_multi) == len(groups), (expected_groups_multi, 'vs', groups,)
        assert set(expected_groups_multi) == set(groups), (expected_groups_multi, 'vs', groups,)

        # Same update once more: the resulting groups must not change.
        package_data = call_action('package_update', context=context, **package_dict)

        meta.Session.flush()
        meta.Session.revision = repo.new_revision()

        # recheck - there still should be no duplicates
        p = Package.get(package_data['id'])
        groups = [g.name for g in p.get_groups(group_type='group')]

        assert len(expected_groups_multi) == len(groups), (expected_groups_multi, 'vs', groups,)
        assert set(expected_groups_multi) == set(groups), (expected_groups_multi, 'vs', groups,)

        meta.Session.rollback()
Example #55
0
 def create_package_schema(self) -> Schema:
     """Return the result of ``schema.default_create_package_schema()`` unchanged."""
     create_schema = schema.default_create_package_schema()
     return create_schema
Example #56
0
    def _create_or_update_package(self,
                                  package_dict,
                                  harvest_object,
                                  package_dict_form='rest'):
        '''
        Creates a new package or updates an existing one according to the
        package dictionary provided.
        The package dictionary can be in one of two forms:
        1. 'rest' - as seen on the RESTful API:
                http://datahub.io/api/rest/dataset/1996_population_census_data_canada
           This is the legacy form. It is the default to provide backward
           compatibility.
           * 'extras' is a dict e.g. {'theme': 'health', 'sub-theme': 'cancer'}
           * 'tags' is a list of strings e.g. ['large-river', 'flood']
        2. 'package_show' form, as provided by the Action API (CKAN v2.0+):
               http://datahub.io/api/action/package_show?id=1996_population_census_data_canada
           * 'extras' is a list of dicts
                e.g. [{'key': 'theme', 'value': 'health'},
                        {'key': 'sub-theme', 'value': 'cancer'}]
           * 'tags' is a list of dicts
                e.g. [{'name': 'large-river'}, {'name': 'flood'}]
        Note that the package_dict must contain an id, which will be used to
        check if the package needs to be created or updated (use the remote
        dataset id).
        If the remote server provides the modification date of the remote
        package, add it to package_dict['metadata_modified'].
        :returns: The same as what import_stage should return. i.e. True if the
                  create or update occurred ok, 'unchanged' if it didn't need
                  updating or False if there were errors.
        TODO: Not sure it is worth keeping this function. If useful it should
        use the output of package_show logic function (maybe keeping support
        for rest api based dicts
        '''
        assert package_dict_form in ('rest', 'package_show')
        try:
            # Change default schema
            schema = default_create_package_schema()
            schema['id'] = [ignore_missing, six.text_type]
            schema['__junk'] = [ignore]

            # Check API version
            if self.config:
                try:
                    api_version = int(self.config.get('api_version', 2))
                except ValueError:
                    raise ValueError('api_version must be an integer')
            else:
                api_version = 2

            user_name = self._get_user_name()
            context = {
                'model': model,
                'session': Session,
                'user': user_name,
                'api_version': api_version,
                'schema': schema,
                'ignore_auth': True,
            }

            if self.config and self.config.get('clean_tags', False):
                tags = package_dict.get('tags', [])
                package_dict['tags'] = self._clean_tags(tags)

            # Check if package exists
            try:
                # _find_existing_package can be overridden if necessary
                existing_package_dict = self._find_existing_package(
                    package_dict)

                # In case name has been modified when first importing. See issue #101.
                package_dict['name'] = existing_package_dict['name']

                # Check modified date
                if 'metadata_modified' not in package_dict or \
                   package_dict['metadata_modified'] > existing_package_dict.get('metadata_modified') or package_dict['name'] == "status-of-covid-19-cases-in-ontario-by-public-health-unit-phu" or package_dict['id'] == 'ecb75ea0-8b72-4f46-a14a-9bd54841d6ab':
                    log.info(
                        'Package with GUID %s exists and needs to be updated' %
                        harvest_object.guid)
                    # Update package
                    context.update({'id': package_dict['id']})
                    package_dict.setdefault('name',
                                            existing_package_dict['name'])
                    '''
                    	what we want to do here is
                    		- not overwrite maintainer name or maintainer email or maintainer branch with blank information
                    		- not include resources because it will overwrite the existing resources
                            - match owner_org
                            - not overwrite all keywords (just add)
                    '''

                    package_dict['keywords'] = {
                        "en":
                        list(
                            set(existing_package_dict['keywords']['en'] +
                                package_dict['keywords']['en'])),
                        "fr":
                        list(
                            set(existing_package_dict['keywords']['fr'] +
                                package_dict['keywords']['fr']))
                    }
                    package_dict['owner_org'] = package_dict['organization'][
                        'name']
                    package_dict['harvester'] = "ontario-data-catalogue"
                    if package_dict.get("maintainer_email", "") == "":
                        del package_dict['maintainer_email']
                    if "maintainer_translated" in package_dict:
                        if package_dict['maintainer_translated'].get(
                                "en", ""
                        ) == "" and package_dict['maintainer_translated'].get(
                                "fr", "") == "":
                            del package_dict['maintainer_translated']
                        elif package_dict['maintainer_translated'].get(
                                "en", ""
                        ) != "" and package_dict['maintainer_translated'].get(
                                "fr", "") == "":
                            package_dict['maintainer_translated'][
                                'fr'] = package_dict['maintainer_translated'][
                                    'en']
                        elif package_dict['maintainer_translated'].get(
                                "en", ""
                        ) == "" and package_dict['maintainer_translated'].get(
                                "fr", "") != "":
                            package_dict['maintainer_translated'][
                                'en'] = package_dict['maintainer_translated'][
                                    'fr']
                    if "maintainer_branch" in package_dict:
                        if package_dict['maintainer_branch'].get(
                                "en", ""
                        ) == "" and package_dict['maintainer_branch'].get(
                                "fr", "") == "":
                            del package_dict['maintainer_branch']

                    if 'resources' in package_dict:
                        for resource in package_dict['resources']:
                            resource.update({"harvested_resource": True})
                            resource_context = {
                                'model': model,
                                'session': Session,
                                'user': user_name,
                                'api_version': api_version,
                                'id': resource['id'],
                                'ignore_auth': True,
                            }
                            p.toolkit.get_action(
                                "resource_patch" if resource['id'] in list(
                                    map(lambda x: x["id"],
                                        existing_package_dict["resources"])
                                ) else "resource_create")(resource_context,
                                                          resource)
                        list_of_remote_resources = list(
                            map(lambda x: x["id"], package_dict["resources"]))
                        for resource in list(
                                filter(
                                    lambda x: x["harvested_resource"] == True,
                                    existing_package_dict["resources"])):
                            # if there's a harvested resource locally that isn't in the latest harvested list of resources, delete it
                            if resource['id'] not in list_of_remote_resources:
                                resource_context = {
                                    'model': model,
                                    'session': Session,
                                    'user': user_name,
                                    'api_version': api_version,
                                    'id': resource['id'],
                                    'ignore_auth': True,
                                }
                                p.toolkit.get_action("resource_delete")(
                                    resource_context, {
                                        'id': resource['id']
                                    })

                        del package_dict['resources']
                    new_package = p.toolkit.get_action("package_patch")(
                        context, package_dict)

                else:
                    log.info(
                        'No changes to package with GUID %s, skipping...' %
                        harvest_object.guid)
                    # NB harvest_object.current/package_id are not set
                    return 'unchanged'

                # Flag the other objects linking to this package as not current anymore
                from ckanext.harvest.model import harvest_object_table
                conn = Session.connection()
                u = update(harvest_object_table)\
                    .where(harvest_object_table.c.package_id == bindparam('b_package_id')) \
                    .values(current=False)
                conn.execute(u, b_package_id=new_package['id'])

                # Flag this as the current harvest object

                harvest_object.package_id = new_package['id']
                harvest_object.current = True
                harvest_object.save()

            except p.toolkit.ObjectNotFound:
                # Package needs to be created

                # Get rid of auth audit on the context otherwise we'll get an
                # exception
                context.pop('__auth_audit', None)

                # Set name for new package to prevent name conflict, see issue #117
                if package_dict.get('name', None):
                    package_dict['name'] = self._gen_new_name(
                        package_dict['name'])
                else:
                    package_dict['name'] = self._gen_new_name(
                        package_dict['title'])

                log.info(
                    'Package with GUID %s does not exist, let\'s create it' %
                    harvest_object.guid)
                harvest_object.current = True
                harvest_object.package_id = package_dict['id']
                # Defer constraints and flush so the dataset can be indexed with
                # the harvest object id (on the after_show hook from the harvester
                # plugin)
                harvest_object.add()

                model.Session.execute(
                    'SET CONSTRAINTS harvest_object_package_id_fkey DEFERRED')
                model.Session.flush()

                package_dict['owner_org'] = package_dict['organization'][
                    'name']
                package_dict['harvester'] = "ontario-data-catalogue"
                for resource in package_dict['resources']:
                    resource.update({"harvested_resource": True})

                if package_dict.get("maintainer_email", "") == "":
                    package_dict['maintainer_email'] = "*****@*****.**"
                if "maintainer_translated" in package_dict:
                    if package_dict['maintainer_translated'].get(
                            "en", ""
                    ) == "" and package_dict['maintainer_translated'].get(
                            "fr", "") == "":
                        package_dict['maintainer_translated'] = {
                            "en": "Open Data",
                            "fr": "Données ouvertes"
                        }
                    elif package_dict['maintainer_translated'].get(
                            "en", ""
                    ) != "" and package_dict['maintainer_translated'].get(
                            "fr", "") == "":
                        package_dict['maintainer_translated'][
                            'fr'] = package_dict['maintainer_translated']['en']
                    elif package_dict['maintainer_translated'].get(
                            "en", ""
                    ) == "" and package_dict['maintainer_translated'].get(
                            "fr", "") != "":
                        package_dict['maintainer_translated'][
                            'en'] = package_dict['maintainer_translated']['fr']
                else:
                    package_dict['maintainer_translated'] = {
                        "en": "Open Data",
                        "fr": "Données ouvertes"
                    }
                new_package = p.toolkit.get_action(
                    'package_create' if package_dict_form ==
                    'package_show' else 'package_create_rest')(context,
                                                               package_dict)

            Session.commit()

            return True

        except p.toolkit.ValidationError as e:
            log.exception(e)
            self._save_object_error(
                'Invalid package with GUID %s: %r' %
                (harvest_object.guid, e.error_dict), harvest_object, 'Import')
        except Exception as e:
            log.exception(e)
            self._save_object_error('%r' % e, harvest_object, 'Import')

        return None
Example #57
0
    def _create_package_schema(cls):
        """
        Build the schema shared by dataset create and update.

        Starts from ``default_create_package_schema()``, removes the
        ``author`` entry and then installs the Kata validator/converter
        chains for every field handled here, including the nested
        ``agent``, ``contact``, ``event``, ``langtitle``, ``pids`` and
        ``resources`` sub-schemas.

        Returns the modified schema.
        """
        # TODO: MIKKO: Use the general converter for lang_title and check that lang_title exists!
        # Note: harvester schemas

        schema = default_create_package_schema()
        del schema['author']

        # Short local names for the validators/converters used repeatedly below.
        flat = co.flattened_to_extras
        to_extras = co.convert_to_extras_kata
        general = va.validate_general
        alnum = va.contains_alphanumeric
        kata_date = va.validate_kata_date

        # Generic chains first; the explicit assignments further down replace
        # any same-named entry created by these two loops.
        for field in settings.KATA_FIELDS_REQUIRED:
            schema[field] = [not_empty, to_extras, unicode, general]
        for field in settings.KATA_FIELDS_RECOMMENDED:
            schema[field] = [ignore_missing, to_extras, unicode, general]

        # --- Nested (flattened-to-extras) structures -----------------------
        schema['agent'] = {
            'role': [not_empty, va.check_agent_fields, general, unicode, flat],
            'name': [ignore_empty, general, unicode, alnum, flat],
            'id': [ignore_empty, general, unicode, flat],
            'organisation': [ignore_empty, general, unicode, alnum, flat],
            'URL': [ignore_empty, url_validator, general, unicode, flat],
            'fundingid': [ignore_empty, general, unicode, flat],
        }
        schema['contact'] = {
            'name': [not_empty, general, unicode, alnum, flat],
            'email': [not_empty, unicode, va.validate_email, flat],
            'URL': [ignore_empty, url_validator, general, unicode, flat],
            # phone number can be missing from the first users
            'phone': [ignore_missing, unicode, va.validate_phonenum, flat],
        }
        schema['event'] = {
            'type': [ignore_missing, va.check_events, unicode, flat, general],
            'who': [ignore_missing, unicode, flat, general, alnum],
            'when': [ignore_missing, unicode, flat, kata_date],
            'descr': [ignore_missing, unicode, flat, general, alnum],
        }
        schema['id'] = [default(u''), co.update_pid, unicode]
        schema['langtitle'] = {
            'value': [
                not_missing,
                unicode,
                va.validate_title,
                va.validate_title_duplicates,
                co.ltitle_to_extras,
            ],
            'lang': [not_missing, unicode, co.convert_languages],
        }
        schema['language'] = [
            ignore_missing,
            co.convert_languages,
            co.remove_disabled_languages,
            to_extras,
            unicode,
        ]
        schema['temporal_coverage_begin'] = [ignore_missing, kata_date, to_extras, unicode]
        schema['temporal_coverage_end'] = [ignore_missing, kata_date, to_extras, unicode]
        schema['pids'] = {
            'provider': [ignore_missing, unicode, flat],
            'id': [not_empty, general, unicode, flat],
            'type': [not_missing, unicode, flat],
            'primary': [ignore_missing, unicode, flat],
        }

        # --- Tags ----------------------------------------------------------
        schema['tag_string'] = [ignore_missing, not_empty, va.kata_tag_string_convert]
        # Without this override the tags would be validated with the default
        # tag validator during update.
        schema['tags'] = cls.tags_schema()
        schema['xpaths'] = [ignore_missing, co.to_extras_json]

        # --- Version / availability ----------------------------------------
        # these two can be missing from the first Kata end users
        # TODO: version date validation should be tighter, see metadata schema
        schema['version'] = [not_empty, unicode, kata_date]
        schema['availability'] = [not_missing, to_extras]
        schema['langdis'] = [co.checkbox_to_boolean, to_extras]

        # --- Whole-record checks -------------------------------------------
        # TODO: MIKKO: __extras: check_langtitle needed? Its 'raise' seems to be unreachable
        schema['__extras'] = [
            va.check_agent,
            va.check_langtitle,
            va.check_contact,
            va.check_pids,
        ]
        schema['__junk'] = [va.check_junk]
        schema['name'] = [
            ignore_missing,
            unicode,
            co.default_name_from_id,
            package_name_validator,
            general,
        ]

        # --- Access / availability URLs ------------------------------------
        schema['access_application_download_URL'] = [
            ignore_missing,
            va.validate_access_application_download_url,
            unicode,
            general,
            to_extras,
        ]
        schema['access_application_new_form'] = [
            co.checkbox_to_boolean,
            to_extras,
            co.remove_access_application_new_form,
        ]
        schema['access_application_URL'] = [
            ignore_missing,
            va.validate_access_application_url,
            unicode,
            general,
            to_extras,
        ]
        schema['access_request_URL'] = [
            ignore_missing,
            va.check_access_request_url,
            url_validator,
            unicode,
            general,
            to_extras,
        ]
        schema['through_provider_URL'] = [
            ignore_missing,
            va.check_through_provider_url,
            url_validator,
            unicode,
            general,
            to_extras,
        ]

        # --- Remaining simple fields ---------------------------------------
        schema['discipline'] = [ignore_missing, va.validate_discipline, to_extras, unicode]
        schema['geographic_coverage'] = [ignore_missing, va.validate_spatial, to_extras, unicode]
        schema['license_URL'] = [ignore_missing, to_extras, unicode, general]
        schema['owner_org'] = [ignore_missing, va.kata_owner_org_validator, unicode]

        # --- Resource sub-schema (modified in place) -----------------------
        resources = schema['resources']
        # Conversion (and validation) of direct_download_URL to resource['url'] is in utils.py:dataset_to_resource()
        resources['url'] = [
            default(settings.DATASET_URL_UNKNOWN),
            va.check_direct_download_url,
            unicode,
            general,
        ]
        resources['algorithm'] = [ignore_missing, unicode, va.validate_algorithm]
        # 'hash' and 'mimetype' keep their existing chains and gain one extra
        # validator each.
        resources['hash'].append(general)
        resources['mimetype'].append(va.validate_mimetype)

        return schema