def test_job_local_paths_are_hidden(self, mock_open):
        """Check that the stored report points at a URL for an upload.

        A CSV with 1010 data rows (each carrying a trailing extra cell) is
        uploaded, ``io.open`` is patched to hand back the same content, and
        the validation job is run. The report's first table source must
        start with ``http`` and end with the upload file name, and the
        first warning must be the 1000-row inspection limit message.

        ``mock_open`` is presumably injected by a patch decorator that is
        not visible here; the body does not use it.
        """
        csv_text = 'id,type\n' + '1,a,\n' * 1010

        upload_buffer = StringIO.StringIO()
        upload_buffer.write(csv_text)

        resource = factories.Resource(
            format='csv',
            upload=MockFieldStorage(upload_buffer, 'invalid.csv'))

        # Any io.open call made while the job runs receives this stream.
        csv_stream = io.BufferedReader(io.BytesIO(csv_text))
        with mock.patch('io.open', return_value=csv_stream):
            run_validation_job(resource)

        stored = (
            Session.query(Validation)
            .filter(Validation.resource_id == resource['id'])
            .one())

        table_source = stored.report['tables'][0]['source']
        assert table_source.startswith('http')
        assert table_source.endswith('invalid.csv')

        first_warning = stored.report['warnings'][0]
        assert_equals(first_warning,
                      'Table inspection has reached 1000 row(s) limit')
    def test_job_run_schema(self, mock_get_action, mock_commit, mock_validate):
        """Check that a JSON-encoded resource schema reaches the validator.

        The resource dict carries its schema as a JSON string; after the
        job runs, ``mock_validate`` must have been called with the resource
        URL, the ``csv`` format and the schema as a dict.

        The ``mock_*`` arguments are presumably injected by patch
        decorators that are not visible here.
        """
        owner = factories.Organization()
        private_dataset = factories.Dataset(
            private=True, owner_org=owner['id'])

        id_field = {'name': 'id', 'type': 'integer'}
        description_field = {'name': 'description', 'type': 'string'}
        schema = {'fields': [id_field, description_field]}

        csv_url = 'http://example.com/file.csv'
        resource = {
            'id': 'test',
            'url': csv_url,
            'format': 'csv',
            'schema': json.dumps(schema),
            'package_id': private_dataset['id'],
        }

        run_validation_job(resource)

        expected_kwargs = {'format': 'csv', 'schema': schema}
        mock_validate.assert_called_with(csv_url, **expected_kwargs)
    def test_job_pass_validation_options_string(self, mock_open):

        invalid_csv = '''

a;b;c
#comment
1;2;3
'''

        validation_options = '''{
            "headers": 3,
            "skip_rows": ["#"]
        }'''

        invalid_file = StringIO.StringIO()

        invalid_file.write(invalid_csv)

        mock_upload = MockFieldStorage(invalid_file, 'invalid.csv')

        resource = factories.Resource(format='csv',
                                      upload=mock_upload,
                                      validation_options=validation_options)

        invalid_stream = io.BufferedReader(io.BytesIO(invalid_csv))

        with mock.patch('io.open', return_value=invalid_stream):

            run_validation_job(resource)

        validation = Session.query(Validation).filter(
            Validation.resource_id == resource['id']).one()

        assert_equals(validation.report['valid'], True)
Example #4
0
    def test_job_run_schema(self, mock_requests, mock_get_action, mock_commit,
                            mock_validate, dataset):
        """Check that schema and HTTP session are forwarded to the validator.

        The resource dict carries its schema as a JSON string; after the
        job runs, ``mock_validate`` must have been called with the resource
        URL, the ``csv`` format, ``http_session='Some_Session'`` and the
        schema as a dict.

        ``dataset`` and the ``mock_*`` arguments are presumably supplied by
        fixtures / patch decorators that are not visible here.
        """
        id_field = {'name': 'id', 'type': 'integer'}
        description_field = {'name': 'description', 'type': 'string'}
        schema = {'fields': [id_field, description_field]}

        csv_url = 'http://example.com/file.csv'
        resource = {
            'id': 'test',
            'url': csv_url,
            'format': 'csv',
            'schema': json.dumps(schema),
            'package_id': dataset['id'],
        }

        run_validation_job(resource)

        expected_kwargs = {
            'format': 'csv',
            'http_session': 'Some_Session',
            'schema': schema,
        }
        mock_validate.assert_called_with(csv_url, **expected_kwargs)
Example #5
0
    def test_job_pass_validation_options(self, mock_open):

        invalid_csv = b'''

a,b,c
#comment
1,2,3
'''

        validation_options = {'headers': 3, 'skip_rows': ['#']}

        invalid_file = io.BytesIO(invalid_csv)

        mock_upload = MockFieldStorage(invalid_file, 'invalid.csv')

        resource = factories.Resource(format='csv',
                                      upload=mock_upload,
                                      validation_options=validation_options)

        invalid_stream = io.BufferedReader(io.BytesIO(invalid_csv))

        with mock.patch('io.open', return_value=invalid_stream):

            run_validation_job(resource)

        validation = Session.query(Validation).filter(
            Validation.resource_id == resource['id']).one()

        assert validation.report['valid']
Example #6
0
def resource_validation_run(context, data_dict):
    u'''
    Start a validation job against a resource.

    The job is enqueued in the background unless ``async`` is passed as a
    false value, in which case the validation runs before this action
    returns.

    Note that the resource format must be one of the supported ones
    (``settings.SUPPORTED_FORMATS``).

    :param resource_id: id of the resource to validate
    :type resource_id: string
    :param async: whether to enqueue the job instead of running it inline
        (optional, default: ``True``)
    :type async: bool

    :raises ValidationError: if ``resource_id`` is missing, the resource
        format is not supported, or the resource has neither a URL nor an
        uploaded file

    :returns: nothing; every code path in this function returns ``None``
    :rtype: None

    '''

    t.check_access(u'resource_validation_run', context, data_dict)

    resource_id = data_dict.get(u'resource_id')
    if not resource_id:
        raise t.ValidationError({u'resource_id': u'Missing value'})

    resource = t.get_action(u'resource_show')(
        {}, {u'id': resource_id})

    # TODO: limit to sysadmins
    async_job = data_dict.get(u'async', True)

    # Ensure format is supported. ``or u''`` also covers a resource whose
    # format key is present but set to None, which would otherwise blow up
    # with an AttributeError instead of a ValidationError.
    resource_format = (resource.get(u'format') or u'').lower()
    if resource_format not in settings.SUPPORTED_FORMATS:
        raise t.ValidationError(
            {u'format': u'Unsupported resource format.'
             u'Must be one of {}'.format(
                 u','.join(settings.SUPPORTED_FORMATS))})

    # Ensure there is a URL or file upload
    if not resource.get(u'url') and resource.get(u'url_type') != u'upload':
        raise t.ValidationError(
            {u'url': u'Resource must have a valid URL or an uploaded file'})

    # Register the validation job for the resource
    session = context['model'].Session
    try:
        ValidationStatusHelper().createValidationJob(session, resource_id)
    except ValidationJobAlreadyEnqueued:
        # In async mode a job that is already enqueued is logged and
        # skipped; in sync mode the validation below still runs.
        if async_job:
            log.error(
                "resource_validation_run: ValidationJobAlreadyEnqueued %s",
                resource_id)
            return

    if async_job:
        package_id = resource['package_id']
        enqueue_validation_job(package_id, resource_id)
    else:
        run_validation_job(resource)
    def test_job_run_uploaded_file_replaces_paths(self, mock_uploader,
                                                  mock_validate):
        """Check that an uploaded resource's report source is a URL.

        After the job runs for an ``upload`` resource, the ``source`` of
        the first table in the stored report must start with ``http``.

        The ``mock_*`` arguments are presumably injected by patch
        decorators that are not visible here.
        """
        resource = factories.Resource(
            url='__upload', url_type='upload', format='csv')

        run_validation_job(resource)

        stored = (
            Session.query(Validation)
            .filter(Validation.resource_id == resource['id'])
            .one())

        first_table = stored.report['tables'][0]
        assert first_table['source'].startswith('http')
    def test_job_run_invalid_stores_validation_object(self, mock_validate):
        """Check what is persisted when validation reports a failure.

        The stored Validation row must have status ``failure``, a report
        equal to ``INVALID_REPORT`` and a truthy ``finished`` value.

        ``mock_validate`` is presumably patched in by a decorator (not
        visible here) so that it yields ``INVALID_REPORT``.
        """
        resource = factories.Resource(
            url='http://example.com/file.csv', format='csv')

        run_validation_job(resource)

        stored = (
            Session.query(Validation)
            .filter(Validation.resource_id == resource['id'])
            .one())

        assert_equals(stored.status, 'failure')
        assert_equals(stored.report, INVALID_REPORT)
        assert stored.finished
    def test_job_run_error_stores_validation_object(self, mock_validate):
        """Check what is persisted when validation ends in an error.

        The stored Validation row must have status ``error``, no report,
        an error of ``{'message': 'Some warning'}`` and a truthy
        ``finished`` value.

        ``mock_validate`` is presumably patched in by a decorator (not
        visible here) so that it yields an error report.
        """
        resource = factories.Resource(
            url='http://example.com/file.csv', format='csv')

        run_validation_job(resource)

        stored = (
            Session.query(Validation)
            .filter(Validation.resource_id == resource['id'])
            .one())

        assert_equals(stored.status, 'error')
        assert_equals(stored.report, None)
        assert_equals(stored.error, {'message': 'Some warning'})
        assert stored.finished
Example #10
0
    def test_job_run_valid_stores_status_in_resource(self, mock_validate):
        """Check that the job copies status and timestamp onto the resource.

        After the job runs, ``resource_show`` must report a
        ``validation_status`` equal to the stored Validation status and a
        ``validation_timestamp`` equal to its ``finished`` time in ISO
        format.

        ``mock_validate`` is presumably patched in by a decorator that is
        not visible here.
        """
        resource = factories.Resource(
            url='http://example.com/file.csv', format='csv')

        run_validation_job(resource)

        stored = (
            Session.query(Validation)
            .filter(Validation.resource_id == resource['id'])
            .one())

        shown = call_action('resource_show', id=resource['id'])

        assert_equals(shown['validation_status'], stored.status)
        assert_equals(shown['validation_timestamp'],
                      stored.finished.isoformat())
Example #11
0
    def test_job_run_no_schema(self, mock_requests, mock_get_action,
                               mock_commit, mock_validate, dataset):
        """Check the validator call for a resource without a schema.

        ``mock_validate`` must have been called with the resource URL, the
        ``csv`` format, ``http_session='Some_Session'`` and
        ``schema=None``.

        ``dataset`` and the ``mock_*`` arguments are presumably supplied by
        fixtures / patch decorators that are not visible here.
        """
        csv_url = 'http://example.com/file.csv'
        resource = {
            'id': 'test',
            'url': csv_url,
            'format': 'csv',
            'package_id': dataset['id'],
        }

        run_validation_job(resource)

        expected_kwargs = {
            'format': 'csv',
            'http_session': 'Some_Session',
            'schema': None,
        }
        mock_validate.assert_called_with(csv_url, **expected_kwargs)
Example #12
0
    def test_job_run_no_schema(self, mock_get_action, mock_commit,
                               mock_validate):
        """Check the validator call for a resource without a schema.

        ``mock_validate`` must have been called with the resource URL, the
        ``csv`` format and ``schema=None``.

        The ``mock_*`` arguments are presumably injected by patch
        decorators that are not visible here.
        """
        owner = factories.Organization()
        private_dataset = factories.Dataset(
            private=True, owner_org=owner['id'])

        csv_url = 'http://example.com/file.csv'
        resource = {
            'id': 'test',
            'url': csv_url,
            'format': 'csv',
            'package_id': private_dataset['id'],
        }

        run_validation_job(resource)

        expected_kwargs = {'format': 'csv', 'schema': None}
        mock_validate.assert_called_with(csv_url, **expected_kwargs)
Example #13
0
    def test_job_run_uploaded_file(self, mock_requests, mock_get_action,
                                   mock_commit, mock_uploader, mock_validate,
                                   dataset):
        """Check the validator call for an uploaded resource.

        ``mock_validate`` must have been called with
        ``/tmp/example/<resource id>`` as the source, the ``csv`` format,
        ``http_session='Some_Session'`` and ``schema=None``.

        ``dataset`` and the ``mock_*`` arguments are presumably supplied by
        fixtures / patch decorators that are not visible here.
        """
        resource = {
            'id': 'test',
            'url': '__upload',
            'url_type': 'upload',
            'format': 'csv',
            'package_id': dataset['id'],
        }

        run_validation_job(resource)

        expected_source = '/tmp/example/{}'.format(resource['id'])
        expected_kwargs = {
            'format': 'csv',
            'http_session': 'Some_Session',
            'schema': None,
        }
        mock_validate.assert_called_with(expected_source, **expected_kwargs)
Example #14
0
    def test_job_run_uploaded_file(self, mock_get_action, mock_commit,
                                   mock_uploader, mock_validate):
        """Check the validator call for an uploaded resource.

        ``mock_validate`` must have been called with
        ``/tmp/example/<resource id>`` as the source, the ``csv`` format
        and ``schema=None``.

        The ``mock_*`` arguments are presumably injected by patch
        decorators that are not visible here.
        """
        owner = factories.Organization()
        private_dataset = factories.Dataset(
            private=True, owner_org=owner['id'])

        resource = {
            'id': 'test',
            'url': '__upload',
            'url_type': 'upload',
            'format': 'csv',
            'package_id': private_dataset['id'],
        }

        run_validation_job(resource)

        expected_source = '/tmp/example/{}'.format(resource['id'])
        expected_kwargs = {'format': 'csv', 'schema': None}
        mock_validate.assert_called_with(expected_source, **expected_kwargs)
Example #15
0
def resource_validation_run(context, data_dict):
    u'''
    Start a validation job against a resource.

    Any existing Validation row for the resource is reset to the
    ``created`` state (or a new one is added) before the job is started.
    The job is enqueued in the background unless ``async`` is passed as a
    false value, in which case the validation runs before this action
    returns.

    Note that the resource format must be one of the supported ones
    (``settings.SUPPORTED_FORMATS``).

    :param resource_id: id of the resource to validate
    :type resource_id: string
    :param async: whether to enqueue the job instead of running it inline
        (optional, default: ``True``)
    :type async: bool

    :raises ValidationError: if ``resource_id`` is missing, the resource
        format is not supported, or the resource has neither a URL nor an
        uploaded file

    :returns: nothing; every code path in this function returns ``None``
    :rtype: None

    '''

    t.check_access(u'resource_validation_run', context, data_dict)

    if not data_dict.get(u'resource_id'):
        raise t.ValidationError({u'resource_id': u'Missing value'})

    resource = t.get_action(u'resource_show')({}, {
        u'id': data_dict[u'resource_id']
    })

    # TODO: limit to sysadmins
    async_job = data_dict.get(u'async', True)

    # Ensure format is supported. ``or u''`` also covers a resource whose
    # format key is present but set to None, which would otherwise blow up
    # with an AttributeError instead of a ValidationError.
    resource_format = (resource.get(u'format') or u'').lower()
    if resource_format not in settings.SUPPORTED_FORMATS:
        raise t.ValidationError({
            u'format':
            u'Unsupported resource format.' +
            u'Must be one of {}'.format(u','.join(settings.SUPPORTED_FORMATS))
        })

    # Ensure there is a URL or file upload
    if not resource.get(u'url') and resource.get(u'url_type') != u'upload':
        raise t.ValidationError(
            {u'url': u'Resource must have a valid URL or an uploaded file'})

    # Check if there was an existing validation for the resource
    session = context['model'].Session

    try:
        validation = session.query(Validation).filter(
            Validation.resource_id == data_dict['resource_id']).one()
    except NoResultFound:
        validation = Validation(resource_id=resource['id'])
    else:
        # Reset values so the row describes the run about to start
        validation.finished = None
        validation.report = None
        validation.error = None
        validation.created = datetime.datetime.utcnow()
        validation.status = u'created'

    session.add(validation)
    session.commit()

    if async_job:
        enqueue_job(run_validation_job, [resource])
    else:
        run_validation_job(resource)