def test_job_local_paths_are_hidden(self, mock_open):
    """Report sources must show the resource URL, never a local file path.

    Also checks that the row-limit warning is surfaced in the report.
    """
    bad_rows = 'id,type\n' + '1,a,\n' * 1010
    buffered = StringIO.StringIO()
    buffered.write(bad_rows)
    upload = MockFieldStorage(buffered, 'invalid.csv')
    resource = factories.Resource(format='csv', upload=upload)

    # Serve the same CSV when the job re-opens the uploaded file.
    stream = io.BufferedReader(io.BytesIO(bad_rows))
    with mock.patch('io.open', return_value=stream):
        run_validation_job(resource)

    validation = Session.query(Validation).filter(
        Validation.resource_id == resource['id']).one()

    table_source = validation.report['tables'][0]['source']
    assert table_source.startswith('http')
    assert table_source.endswith('invalid.csv')

    first_warning = validation.report['warnings'][0]
    assert_equals(
        first_warning, 'Table inspection has reached 1000 row(s) limit')
def test_job_run_schema(self, mock_get_action, mock_commit, mock_validate):
    """validate() must receive the schema decoded from the resource's JSON string."""
    org = factories.Organization()
    dataset = factories.Dataset(private=True, owner_org=org['id'])
    table_schema = {
        'fields': [
            {'name': 'id', 'type': 'integer'},
            {'name': 'description', 'type': 'string'},
        ]
    }
    resource = {
        'id': 'test',
        'url': 'http://example.com/file.csv',
        'format': 'csv',
        'schema': json.dumps(table_schema),
        'package_id': dataset['id'],
    }

    run_validation_job(resource)

    mock_validate.assert_called_with(
        'http://example.com/file.csv', format='csv', schema=table_schema)
def test_job_pass_validation_options_string(self, mock_open):
    """Validation options stored as a JSON string are decoded and applied."""
    csv_body = '''
a;b;c
#comment
1;2;3
'''
    # Options as a raw JSON string (as stored on the resource).
    options_json = '''{
    "headers": 3,
    "skip_rows": ["#"]
}'''
    buffered = StringIO.StringIO()
    buffered.write(csv_body)
    upload = MockFieldStorage(buffered, 'invalid.csv')
    resource = factories.Resource(
        format='csv', upload=upload, validation_options=options_json)

    stream = io.BufferedReader(io.BytesIO(csv_body))
    with mock.patch('io.open', return_value=stream):
        run_validation_job(resource)

    validation = Session.query(Validation).filter(
        Validation.resource_id == resource['id']).one()

    assert_equals(validation.report['valid'], True)
def test_job_run_schema(self, mock_requests, mock_get_action, mock_commit,
                        mock_validate, dataset):
    """validate() gets the decoded schema plus the shared HTTP session."""
    table_schema = {
        'fields': [
            {'name': 'id', 'type': 'integer'},
            {'name': 'description', 'type': 'string'},
        ]
    }
    resource = {
        'id': 'test',
        'url': 'http://example.com/file.csv',
        'format': 'csv',
        'schema': json.dumps(table_schema),
        'package_id': dataset['id'],
    }

    run_validation_job(resource)

    mock_validate.assert_called_with(
        'http://example.com/file.csv',
        format='csv',
        http_session='Some_Session',
        schema=table_schema)
def test_job_pass_validation_options(self, mock_open):
    """Validation options supplied as a dict are passed through to the job."""
    csv_body = b'''
a,b,c
#comment
1,2,3
'''
    options = {'headers': 3, 'skip_rows': ['#']}
    upload = MockFieldStorage(io.BytesIO(csv_body), 'invalid.csv')
    resource = factories.Resource(
        format='csv', upload=upload, validation_options=options)

    stream = io.BufferedReader(io.BytesIO(csv_body))
    with mock.patch('io.open', return_value=stream):
        run_validation_job(resource)

    validation = Session.query(Validation).filter(
        Validation.resource_id == resource['id']).one()

    assert validation.report['valid']
def resource_validation_run(context, data_dict):
    u''' Start a validation job against a resource.
    Returns the identifier for the job started.

    Note that the resource format must be one of the supported ones,
    currently CSV or Excel.

    :param resource_id: id of the resource to validate
    :type resource_id: string

    :rtype: string

    '''
    t.check_access(u'resource_validation_run', context, data_dict)

    resource_id = data_dict.get(u'resource_id')
    if not resource_id:
        raise t.ValidationError({u'resource_id': u'Missing value'})

    resource = t.get_action(u'resource_show')(
        {}, {u'id': resource_id})

    # TODO: limit to sysadmins
    async_job = data_dict.get(u'async', True)

    # Ensure format is supported
    if not resource.get(u'format', u'').lower() in settings.SUPPORTED_FORMATS:
        # NOTE: trailing space added so the two concatenated literals don't
        # render as "format.Must be one of".
        raise t.ValidationError(
            {u'format': u'Unsupported resource format. '
                        u'Must be one of {}'.format(
                            u','.join(settings.SUPPORTED_FORMATS))})

    # Ensure there is a URL or file upload
    if not resource.get(u'url') and not resource.get(u'url_type') == u'upload':
        raise t.ValidationError(
            {u'url': u'Resource must have a valid URL or an uploaded file'})

    # Check if there was an existing validation for the resource.
    # createValidationJob raises if a job is already queued for it.
    try:
        session = context['model'].Session
        ValidationStatusHelper().createValidationJob(session, resource_id)
    except ValidationJobAlreadyEnqueued:
        if async_job:
            # Duplicate async request: log and bail out without enqueueing.
            log.error(
                "resource_validation_run: ValidationJobAlreadyEnqueued %s",
                data_dict['resource_id'])
            return

    if async_job:
        package_id = resource['package_id']
        enqueue_validation_job(package_id, resource_id)
    else:
        # Synchronous mode: run the validation in-process.
        run_validation_job(resource)
def test_job_run_uploaded_file_replaces_paths(self, mock_uploader,
                                              mock_validate):
    """Local upload paths in the stored report are rewritten to public URLs."""
    resource = factories.Resource(
        url='__upload', url_type='upload', format='csv')

    run_validation_job(resource)

    validation = Session.query(Validation).filter(
        Validation.resource_id == resource['id']).one()
    report_source = validation.report['tables'][0]['source']
    assert report_source.startswith('http')
def test_job_run_invalid_stores_validation_object(self, mock_validate):
    """An invalid file leaves a 'failure' Validation row with the full report."""
    resource = factories.Resource(
        url='http://example.com/file.csv', format='csv')

    run_validation_job(resource)

    validation = Session.query(Validation).filter(
        Validation.resource_id == resource['id']).one()

    assert_equals(validation.status, 'failure')
    assert_equals(validation.report, INVALID_REPORT)
    # The finished timestamp must be recorded once the job completes.
    assert validation.finished
def test_job_run_error_stores_validation_object(self, mock_validate):
    """A job-level error stores status 'error' with the message, and no report."""
    resource = factories.Resource(
        url='http://example.com/file.csv', format='csv')

    run_validation_job(resource)

    validation = Session.query(Validation).filter(
        Validation.resource_id == resource['id']).one()

    assert_equals(validation.status, 'error')
    assert_equals(validation.report, None)
    assert_equals(validation.error, {'message': 'Some warning'})
    # Even failed runs must record a completion timestamp.
    assert validation.finished
def test_job_run_valid_stores_status_in_resource(self, mock_validate):
    """The resource's extras mirror the Validation row's status and timestamp."""
    resource = factories.Resource(
        url='http://example.com/file.csv', format='csv')

    run_validation_job(resource)

    validation = Session.query(Validation).filter(
        Validation.resource_id == resource['id']).one()

    refreshed = call_action('resource_show', id=resource['id'])

    assert_equals(refreshed['validation_status'], validation.status)
    assert_equals(
        refreshed['validation_timestamp'], validation.finished.isoformat())
def test_job_run_no_schema(self, mock_requests, mock_get_action, mock_commit,
                           mock_validate, dataset):
    """Without a stored schema, validate() is called with schema=None."""
    resource = {
        'id': 'test',
        'url': 'http://example.com/file.csv',
        'format': 'csv',
        'package_id': dataset['id'],
    }

    run_validation_job(resource)

    mock_validate.assert_called_with(
        'http://example.com/file.csv',
        format='csv',
        http_session='Some_Session',
        schema=None)
def test_job_run_no_schema(self, mock_get_action, mock_commit, mock_validate):
    """Without a stored schema, validate() receives schema=None."""
    org = factories.Organization()
    dataset = factories.Dataset(private=True, owner_org=org['id'])
    resource = {
        'id': 'test',
        'url': 'http://example.com/file.csv',
        'format': 'csv',
        'package_id': dataset['id'],
    }

    run_validation_job(resource)

    mock_validate.assert_called_with(
        'http://example.com/file.csv', format='csv', schema=None)
def test_job_run_uploaded_file(self, mock_requests, mock_get_action,
                               mock_commit, mock_uploader, mock_validate,
                               dataset):
    """Uploaded resources are validated from their local upload path."""
    resource = {
        'id': 'test',
        'url': '__upload',
        'url_type': 'upload',
        'format': 'csv',
        'package_id': dataset['id'],
    }

    run_validation_job(resource)

    expected_path = '/tmp/example/{}'.format(resource['id'])
    mock_validate.assert_called_with(
        expected_path,
        format='csv',
        http_session='Some_Session',
        schema=None)
def test_job_run_uploaded_file(self, mock_get_action, mock_commit,
                               mock_uploader, mock_validate):
    """Uploaded resources are validated from their local upload path."""
    org = factories.Organization()
    dataset = factories.Dataset(private=True, owner_org=org['id'])
    resource = {
        'id': 'test',
        'url': '__upload',
        'url_type': 'upload',
        'format': 'csv',
        'package_id': dataset['id'],
    }

    run_validation_job(resource)

    expected_path = '/tmp/example/{}'.format(resource['id'])
    mock_validate.assert_called_with(expected_path, format='csv', schema=None)
def resource_validation_run(context, data_dict):
    u''' Start a validation job against a resource.
    Returns the identifier for the job started.

    Note that the resource format must be one of the supported ones,
    currently CSV or Excel.

    :param resource_id: id of the resource to validate
    :type resource_id: string

    :rtype: string

    '''
    t.check_access(u'resource_validation_run', context, data_dict)

    if not data_dict.get(u'resource_id'):
        raise t.ValidationError({u'resource_id': u'Missing value'})

    resource = t.get_action(u'resource_show')({}, {
        u'id': data_dict[u'resource_id']
    })

    # TODO: limit to sysadmins
    async_job = data_dict.get(u'async', True)

    # Ensure format is supported
    if not resource.get(u'format', u'').lower() in settings.SUPPORTED_FORMATS:
        # NOTE: trailing space added so the two concatenated literals don't
        # render as "format.Must be one of".
        raise t.ValidationError({
            u'format': u'Unsupported resource format. ' +
            u'Must be one of {}'.format(u','.join(settings.SUPPORTED_FORMATS))
        })

    # Ensure there is a URL or file upload
    if not resource.get(u'url') and not resource.get(u'url_type') == u'upload':
        raise t.ValidationError(
            {u'url': u'Resource must have a valid URL or an uploaded file'})

    # Check if there was an existing validation for the resource
    Session = context['model'].Session
    try:
        validation = Session.query(Validation).filter(
            Validation.resource_id == data_dict['resource_id']).one()
    except NoResultFound:
        validation = None

    if validation:
        # Reset values so the existing row represents a fresh run
        validation.finished = None
        validation.report = None
        validation.error = None
        validation.created = datetime.datetime.utcnow()
        validation.status = u'created'
    else:
        validation = Validation(resource_id=resource['id'])
        Session.add(validation)
    # Persist either the reset or the newly created row before starting work
    Session.commit()

    if async_job:
        enqueue_job(run_validation_job, [resource])
    else:
        run_validation_job(resource)