def test_pipeline_ignore_headerless_columns_false(self):
    # Headerless columns should fail structure validation by default.
    # NOTE(review): this method is shadowed by a later definition with the
    # exact same name (and identical body) in this class, so this copy is
    # never collected by the test runner — consider deleting one of them.
    filepath = os.path.join(self.data_dir, 'headerless_columns.csv')
    validator = Pipeline(filepath, processors=('structure', ))
    result, report = validator.run()
    self.assertFalse(result)
def test_register_processor_append(self):
    """Registering a processor with no position appends it to the pipeline."""
    checker = Pipeline(self.data_string)
    initial_count = len(checker.pipeline)
    self.assertEqual(initial_count, 1)
    checker.register_processor('schema')
    self.assertEqual(len(checker.pipeline), 2)
def test_pipeline_infer_schema(self):
    """Schema inference on a valid file yields an empty results list."""
    source = os.path.join(self.data_dir, 'valid.csv')
    schema_options = {'infer_schema': True}
    checker = Pipeline(source, processors=('schema',),
                       options={'schema': schema_options})
    result, report = checker.run()
    self.assertEqual(len(report.generate()['results']), 0)
def test_multilingual_xlsx(self):
    """A multilingual Excel workbook is readable by the pipeline."""
    source = os.path.join(self.data_dir, 'jungle', 'multilingual.xlsx')
    checker = Pipeline(source, format='excel')
    result, report = checker.run()
    self.assertTrue(checker.data)
def test_messytables_source_six(self):
    """An Excel file with document properties (from messytables) is readable."""
    source = os.path.join(self.data_dir, 'jungle',
                          'messytables-excel_properties.xls')
    checker = Pipeline(source, format='excel')
    result, report = checker.run()
    self.assertTrue(checker.data)
def test_messytables_source_five(self):
    """A remote CSV from the messytables horror suite is readable."""
    url = 'https://raw.githubusercontent.com/okfn/messytables/master/horror/characters.csv'
    checker = Pipeline(url)
    result, report = checker.run()
    self.assertTrue(checker.data)
def test_pipeline_ignore_defective_rows_false(self):
    """Defective rows fail structure validation when not ignored (default)."""
    source = os.path.join(self.data_dir, 'defective_rows.csv')
    checker = Pipeline(source, processors=('structure',))
    result, report = checker.run()
    self.assertFalse(result)
def test_rm_workspace(self):
    """rm_workspace removes the on-disk workspace created by a non-dry run."""
    checker = Pipeline(self.data_string, dry_run=False)
    self.assertTrue(checker.workspace)
    checker.rm_workspace()
    # The workspace path must no longer exist after removal.
    self.assertFalse(os.path.exists(checker.workspace))
def test_register_processor_insert(self):
    """Registering a processor at position 0 inserts it into the pipeline."""
    checker = Pipeline(self.data_string)
    initial_count = len(checker.pipeline)
    self.assertEqual(initial_count, 1)
    checker.register_processor('schema', position=0)
    self.assertEqual(len(checker.pipeline), 2)
def test_pipeline_ignore_headerless_columns_false(self):
    """Headerless columns fail structure validation when not ignored (default)."""
    source = os.path.join(self.data_dir, 'headerless_columns.csv')
    checker = Pipeline(source, processors=('structure',))
    result, report = checker.run()
    self.assertFalse(result)
def test_pipeline_ignore_duplicate_rows_false(self):
    """Duplicate rows fail structure validation when not ignored (default)."""
    source = os.path.join(self.data_dir, 'duplicate_rows.csv')
    checker = Pipeline(source, processors=('structure',))
    result, report = checker.run()
    self.assertFalse(result)
def test_messytables_source_two(self):
    """A UTF-16LE encoded remote CSV (messytables horror suite) is readable."""
    url = 'https://raw.githubusercontent.com/okfn/messytables/master/horror/utf-16le_encoded.csv'
    checker = Pipeline(url)
    result, report = checker.run()
    self.assertTrue(checker.data)
def test_messytables_source_three(self):
    """A sparse remote CSV with column errors (messytables) is readable."""
    url = 'https://raw.githubusercontent.com/okfn/messytables/master/horror/sparse_with_column_errors.csv'
    checker = Pipeline(url)
    result, report = checker.run()
    self.assertTrue(checker.data)
def test_pipeline_empty_rows_are_not_duplicatable(self):
    # Empty rows must not be flagged as duplicates of each other; with
    # fail_fast disabled all 11 expected results are collected.
    # NOTE(review): this method is shadowed by a later definition with the
    # exact same name (and identical body) in this class, so this copy is
    # never collected by the test runner — consider deleting one of them.
    filepath = os.path.join(self.data_dir, 'empty_rows_multiple.csv')
    validator = Pipeline(filepath, processors=('structure',), fail_fast=False)
    result, report = validator.run()
    self.assertEqual(len(report.generate()['results']), 11)
def test_pipeline_case_insensitive_headers(self):
    """With case_insensitive_headers, differently-cased headers match the schema."""
    source = os.path.join(self.data_dir, 'case_insensitive_headers.csv')
    schema_path = os.path.join(self.data_dir, 'test_schema.json')
    schema_options = {'schema': schema_path, 'case_insensitive_headers': True}
    checker = Pipeline(source, processors=('schema',),
                       options={'schema': schema_options})
    result, report = checker.run()
    self.assertEqual(len(report.generate()['results']), 0)
def test_gla_source_clean(self):
    """A clean remote GLA dataset passes validation and yields data."""
    url = 'https://raw.githubusercontent.com/rgrp/dataset-gla/master/data/all.csv'
    checker = Pipeline(url)
    result, report = checker.run()
    self.assertTrue(result)
    self.assertTrue(checker.data)
def test_pipeline_hmt_bbsrc(self):
    """An ISO-8859-2 encoded HMT spend return is readable with explicit encoding."""
    source = os.path.join(self.data_dir, 'hmt',
                          '1011-bbsrc-25k-spend-return.csv')
    checker = Pipeline(source, encoding='ISO-8859-2')
    result, report = checker.run()
    self.assertTrue(checker.data)
def test_gla_source_three(self):
    """A known-bad GLA report fails validation but still produces data."""
    source = os.path.join(self.data_dir, 'jungle',
                          'gla-250-report-2014-15-P08.csv')
    checker = Pipeline(source)
    result, report = checker.run()
    self.assertFalse(result)
    self.assertTrue(checker.data)
def test_pipeline_field_unique(self):
    """A unique-constrained field with a repeated value yields one result."""
    source = os.path.join(self.data_dir, 'unique_field.csv')
    schema_path = os.path.join(self.data_dir, 'unique_field.json')
    checker = Pipeline(source, processors=('schema',),
                       options={'schema': {'schema': schema_path}})
    result, report = checker.run()
    self.assertEqual(len(report.generate()['results']), 1)
def test_header_index_invalid(self):
    """Pointing header_index at an invalid header row fails validation."""
    source = os.path.join(self.data_dir, 'invalid_header_index_1.csv')
    checker = Pipeline(source, options={}, header_index=1)
    result, report = checker.run()
    self.assertFalse(result)
def test_gla_source_five(self):
    """A known-bad GLA P10 file fails validation but still produces data."""
    source = os.path.join(self.data_dir, 'jungle', 'gla-2012-13-P10-250.csv')
    checker = Pipeline(source)
    result, report = checker.run()
    self.assertFalse(result)
    self.assertTrue(checker.data)
def test_gla_source_six(self):
    """A known-bad GLA December 2009 file fails validation but yields data."""
    source = os.path.join(self.data_dir, 'jungle', 'gla-december_2009.csv')
    checker = Pipeline(source)
    result, report = checker.run()
    self.assertFalse(result)
    self.assertTrue(checker.data)
def test_pipeline_info_result_for_required_false(self):
    """With result_level 'info', a required=false field still yields one result."""
    source = os.path.join(self.data_dir, 'required_false.csv')
    schema_path = os.path.join(self.data_dir, 'required_false_schema.json')
    schema_options = {'schema': schema_path, 'result_level': 'info'}
    checker = Pipeline(source, processors=('schema',),
                       options={'schema': schema_options})
    result, report = checker.run()
    self.assertEqual(len(report.generate()['results']), 1)
def test_pipeline_row_limit_in_range(self):
    # With row_limit=2 the structure errors beyond the limit are skipped,
    # so no results are reported.
    # NOTE(review): this method is shadowed by a later definition with the
    # exact same name (and identical body) in this class, so this copy is
    # never collected by the test runner — consider deleting one of them.
    filepath = os.path.join(self.data_dir, 'row_limit_structure.csv')
    options = {}
    validator = Pipeline(filepath, processors=('structure',), row_limit=2, options=options)
    result, report = validator.run()
    self.assertEqual(len(report.generate()['results']), 0)
def test__report_limit_in_range(self):
    """With report_limit=1, only one structure result is reported."""
    source = os.path.join(self.data_dir, 'report_limit_structure.csv')
    checker = Pipeline(source, processors=('structure',), report_limit=1,
                       options={})
    result, report = checker.run()
    structure_results = [entry for entry in report.generate()['results']
                         if entry['processor'] == 'structure']
    self.assertEqual(len(structure_results), 1)
def test_pipeline_report_stream_none(self):
    """A None report_stream is accepted and a valid file still passes."""
    source = os.path.join(self.data_dir, 'valid.csv')
    checker = Pipeline(source, processors=('schema',), report_stream=None,
                       options={})
    result, report = checker.run()
    self.assertTrue(result)
def test_hmt_three(self):
    """A remote HMT spend publication runs through structure+schema and yields data."""
    url = 'https://www.gov.uk/government/uploads/system/uploads/attachment_data/file/407609/Publishable_December_2014_Spend.csv'
    schema_path = os.path.join(self.data_dir, 'hmt',
                               'spend-publishing-schema.json')
    checker = Pipeline(url, processors=('structure', 'schema'),
                       options={'schema': {'schema': schema_path}})
    result, report = checker.run()
    self.assertTrue(checker.data)
def test_pipeline_custom_empty_strings(self):
    # Treating '-' as an empty string should make the custom-empty-rows
    # file fail structure validation.
    # NOTE(review): this method is shadowed by a later definition with the
    # exact same name (and identical body) in this class, so this copy is
    # never collected by the test runner — consider deleting one of them.
    filepath = os.path.join(self.data_dir, 'empty_rows_custom.csv')
    options = {'structure': {'empty_strings': ('-',)}}
    validator = Pipeline(filepath, processors=('structure',), options=options)
    result, report = validator.run()
    self.assertFalse(result)
def test_pipeline_empty_rows_are_not_duplicatable(self):
    """Empty rows are not flagged as duplicates; all 11 results are collected."""
    source = os.path.join(self.data_dir, 'empty_rows_multiple.csv')
    checker = Pipeline(source, processors=('structure',), fail_fast=False)
    result, report = checker.run()
    self.assertEqual(len(report.generate()['results']), 11)
def test_pipeline_fail_fast_false(self):
    # Without fail_fast, both structure errors in the file are reported.
    # NOTE(review): `test_pipeline_fail_fast_false` is defined three times in
    # this class with different fixtures/expectations (2, 7, and 5 results);
    # only the last definition runs. This copy (structure, 2 results) is dead
    # code — it should be renamed (e.g. ..._structure) or removed, after
    # confirming which expectation is current.
    filepath = os.path.join(self.data_dir, 'fail_fast_two_structure_errors.csv')
    options = {}
    validator = Pipeline(filepath, processors=('structure',), options=options)
    result, report = validator.run()
    self.assertEqual(len(report.generate()['results']), 2)
def test_pipeline_ignore_duplicate_rows_true(self):
    # With ignore_duplicate_rows enabled, the duplicate-rows file passes.
    # NOTE(review): this method is shadowed by a later definition with the
    # exact same name (and identical body) in this class, so this copy is
    # never collected by the test runner — consider deleting one of them.
    filepath = os.path.join(self.data_dir, 'duplicate_rows.csv')
    options = {'structure': {'ignore_duplicate_rows': True}}
    validator = Pipeline(filepath, processors=('structure',), options=options)
    result, report = validator.run()
    self.assertTrue(result)
def test_create_file(self):
    """create_file writes a file with headers into the pipeline workspace."""
    target_name = 'example.file'
    header_row = ['first', 'second', 'three']
    data_row = '1,2,3\n'
    checker = Pipeline(self.data_string, dry_run=False)
    checker.create_file(data_row, target_name, headers=header_row)
    created_path = os.path.join(checker.workspace, target_name)
    self.assertTrue(os.path.exists(created_path))
def test_report_summary(self):
    """The generated report meta carries bad-row and total-row counts."""
    source = os.path.join(self.data_dir, 'invalid_header_index_1.csv')
    checker = Pipeline(source, options={}, header_index=1)
    result, report = checker.run()
    meta = report.generate()['meta']
    self.assertEqual(meta['bad_row_count'], 1)
    self.assertEqual(meta['row_count'], 9)
def test_report_results_grouped_by_rows(self):
    """With report_type='grouped' and fail_fast, results collapse to one group."""
    source = os.path.join(self.data_dir, 'fail_fast_two_schema_errors.csv')
    schema_path = os.path.join(self.data_dir, 'test_schema.json')
    checker = Pipeline(source, processors=('schema',),
                       options={'schema': {'schema': schema_path}},
                       fail_fast=True, report_type='grouped')
    result, report = checker.run()
    grouped = report.generate()
    self.assertEqual(1, len(grouped['results']))
def test_hmt_bis_two(self):
    """A BIS monthly-spend Excel file runs through structure+schema and yields data."""
    source = os.path.join(self.data_dir, 'hmt',
                          'BIS_monthly_spend_December_2012.xls')
    schema_path = os.path.join(self.data_dir, 'hmt', 'bis-modified.json')
    checker = Pipeline(source, processors=('structure', 'schema'),
                       options={'schema': {'schema': schema_path}},
                       format='excel')
    result, report = checker.run()
    self.assertTrue(checker.data)
def test_pipeline_custom_empty_strings(self):
    """Treating '-' as empty makes the custom-empty-rows file fail validation."""
    source = os.path.join(self.data_dir, 'empty_rows_custom.csv')
    structure_options = {'empty_strings': ('-',)}
    checker = Pipeline(source, processors=('structure',),
                       options={'structure': structure_options})
    result, report = checker.run()
    self.assertFalse(result)
def test_pipeline_fail_fast_false(self):
    # Without fail_fast, all schema errors in the file are reported.
    # NOTE(review): `test_pipeline_fail_fast_false` is defined three times in
    # this class; only the last definition runs. This copy expects 7 results
    # while the surviving copy (identical fixture) expects 5 — the two
    # expectations contradict each other. Confirm which count is current and
    # remove the stale definition.
    filepath = os.path.join(self.data_dir, 'fail_fast_two_schema_errors.csv')
    schema = os.path.join(self.data_dir, 'test_schema.json')
    options = {'schema': {'schema': schema}}
    validator = Pipeline(filepath, processors=('schema',), options=options)
    result, report = validator.run()
    self.assertEqual(len(report.generate()['results']), 7)
def test_pipeline_ignore_duplicate_rows_true(self):
    """With ignore_duplicate_rows enabled, the duplicate-rows file passes."""
    source = os.path.join(self.data_dir, 'duplicate_rows.csv')
    structure_options = {'ignore_duplicate_rows': True}
    checker = Pipeline(source, processors=('structure',),
                       options={'structure': structure_options})
    result, report = checker.run()
    self.assertTrue(result)
def test_pipeline_fail_fast_false(self):
    """Without fail_fast, all five schema results for the file are reported.

    NOTE(review): this name is defined multiple times in the class; being the
    last definition, this is the copy that actually runs.
    """
    source = os.path.join(self.data_dir, 'fail_fast_two_schema_errors.csv')
    schema_path = os.path.join(self.data_dir, 'test_schema.json')
    checker = Pipeline(source, processors=('schema',),
                       options={'schema': {'schema': schema_path}})
    result, report = checker.run()
    self.assertEqual(len(report.generate()['results']), 5)
def test_pipeline_row_limit_in_range(self):
    """With row_limit=2, errors beyond the limit are skipped; no results."""
    source = os.path.join(self.data_dir, 'row_limit_structure.csv')
    checker = Pipeline(source, processors=('structure',), row_limit=2,
                       options={})
    result, report = checker.run()
    self.assertEqual(len(report.generate()['results']), 0)