def test_resource_set_detector():
    """A detector can be supplied at construction time and replaced afterwards."""
    initial = Detector(field_missing_values=["na"])
    resource = Resource("data/table.csv", detector=initial)
    assert resource.detector == initial
    replacement = Detector(sample_size=3)
    resource.detector = replacement
    assert resource.detector == replacement
def test_xlsx_parser_preserve_formatting():
    """Excel number/date display formats survive when preserve_formatting is on."""
    options = dict(
        dialect=ExcelDialect(preserve_formatting=True),
        layout=Layout(header_rows=[1]),
        detector=Detector(field_type="any"),
    )
    if IS_UNIX:
        with Resource("data/preserve-formatting.xlsx", **options) as resource:
            expected = {
                # general
                "empty": None,
                # numeric
                "0": "1001",
                "0.00": "1000.56",
                "0.0000": "1000.5577",
                "0.00000": "1000.55770",
                "0.0000#": "1000.5577",
                # temporal
                "m/d/yy": "5/20/40",
                "d-mmm": "20-May",
                "mm/dd/yy": "05/20/40",
                "mmddyy": "052040",
                "mmddyyam/pmdd": "052040AM20",
            }
            assert resource.read_rows() == [expected]
def test_resource_detector_field_type_with_open():
    """Forcing field_type makes every column a string when reading via open."""
    with Resource("data/table.csv", detector=Detector(field_type="string")) as resource:
        assert resource.header == ["id", "name"]
        assert resource.schema == {
            "fields": [
                {"name": "id", "type": "string"},
                {"name": "name", "type": "string"},
            ]
        }
        assert resource.read_rows() == [
            {"id": "1", "name": "english"},
            {"id": "2", "name": "中国人"},
        ]
def test_resource_detector_field_type():
    """field_type is honoured by infer() just as it is by direct reading."""
    resource = Resource(path="data/table.csv", detector=Detector(field_type="string"))
    resource.infer(stats=True)
    expected_fields = [
        {"name": "id", "type": "string"},
        {"name": "name", "type": "string"},
    ]
    assert resource.schema == {"fields": expected_fields}
    assert resource.header == ["id", "name"]
    assert resource.read_rows() == [
        {"id": "1", "name": "english"},
        {"id": "2", "name": "中国人"},
    ]
def test_resource_detector_field_float_numbers():
    """field_float_numbers keeps decimals as floats and flags the schema."""
    rows = [["number"], ["1.1"], ["2.2"], ["3.3"]]
    resource = Resource(data=rows, detector=Detector(field_float_numbers=True))
    resource.infer(stats=True)
    assert resource.schema == {
        "fields": [{"name": "number", "type": "number", "floatNumber": True}]
    }
    assert resource.header == ["number"]
    assert resource.read_rows() == [
        {"number": 1.1},
        {"number": 2.2},
        {"number": 3.3},
    ]
def test_resource_detector_field_names():
    """Custom field_names replace the labels coming from the file header."""
    renaming = Detector(field_names=["new1", "new2"])
    resource = Resource(path="data/table.csv", detector=renaming)
    resource.infer(stats=True)
    assert resource.schema == {
        "fields": [
            {"name": "new1", "type": "integer"},
            {"name": "new2", "type": "string"},
        ]
    }
    assert resource.labels == ["id", "name"]
    assert resource.header == ["new1", "new2"]
    assert resource.read_rows() == [
        {"new1": 1, "new2": "english"},
        {"new1": 2, "new2": "中国人"},
    ]
def test_resource_detector_schema_patch_with_infer():
    """schema_patch can rename and retype a field before inference runs."""
    patch = {"fields": {"id": {"name": "ID", "type": "string"}}}
    resource = Resource(path="data/table.csv", detector=Detector(schema_patch=patch))
    resource.infer(stats=True)
    assert resource.schema == {
        "fields": [
            {"name": "ID", "type": "string"},
            {"name": "name", "type": "string"},
        ]
    }
    assert resource.labels == ["id", "name"]
    assert resource.header == ["ID", "name"]
    assert resource.read_rows() == [
        {"ID": "1", "name": "english"},
        {"ID": "2", "name": "中国人"},
    ]
def test_resource_detector_schema_patch_missing_values():
    """Patched missingValues turn matching cells into None on read."""
    patch = {"missingValues": ["1", "2"]}
    with Resource("data/table.csv", detector=Detector(schema_patch=patch)) as resource:
        assert resource.header == ["id", "name"]
        assert resource.schema == {
            "fields": [
                {"name": "id", "type": "integer"},
                {"name": "name", "type": "string"},
            ],
            "missingValues": ["1", "2"],
        }
        assert resource.read_rows() == [
            {"id": None, "name": "english"},
            {"id": None, "name": "中国人"},
        ]
def test_resource_detector_schema_sync_with_infer():
    """schema_sync keeps the provided schema while matching the file's column order."""
    provided = {
        "fields": [
            {"name": "name", "type": "string"},
            {"name": "id", "type": "integer"},
        ]
    }
    resource = Resource(
        path="data/sync-schema.csv",
        schema=provided,
        detector=Detector(schema_sync=True),
    )
    resource.infer(stats=True)
    assert resource.schema == provided
    assert resource.sample == [["name", "id"], ["english", "1"], ["中国人", "2"]]
    assert resource.fragment == [["english", "1"], ["中国人", "2"]]
    assert resource.header == ["name", "id"]
    assert resource.read_rows() == [
        {"id": 1, "name": "english"},
        {"id": 2, "name": "中国人"},
    ]
def test_resource_detector_field_names_with_open():
    """field_names override the header labels when opening a resource directly."""
    renaming = Detector(field_names=["new1", "new2"])
    with Resource("data/table.csv", detector=renaming) as resource:
        assert resource.schema == {
            "fields": [
                {"name": "new1", "type": "integer"},
                {"name": "new2", "type": "string"},
            ]
        }
        assert resource.labels == ["id", "name"]
        assert resource.header == ["new1", "new2"]
        assert resource.read_rows() == [
            {"new1": 1, "new2": "english"},
            {"new1": 2, "new2": "中国人"},
        ]
def test_resource_control():
    """A custom encoding_function decides the resource's encoding."""
    fixed_utf8 = Detector(encoding_function=lambda sample: "utf-8")
    with Resource("data/table.csv", detector=fixed_utf8) as resource:
        assert resource.encoding == "utf-8"
        assert resource.sample == [["id", "name"], ["1", "english"], ["2", "中国人"]]
        assert resource.fragment == [["1", "english"], ["2", "中国人"]]
        assert resource.header == ["id", "name"]
def test_validate_infer_fields_issue_223():
    """Patching a field name absent from the data must not break validation (#223)."""
    table = [["name1", "name2"], ["123", "abc"], ["456", "def"], ["789", "ghi"]]
    detector = Detector(schema_patch={"fields": {"name": {"type": "string"}}})
    report = Resource(table, detector=detector).validate()
    assert report.valid
def test_schema_infer_no_names():
    """Without labels, detected fields get positional names like field1."""
    inferred = Detector().detect_schema([[1], [2], [3]])
    assert inferred == {"fields": [{"name": "field1", "type": "integer"}]}
def test_detector_set_encoding_function():
    """encoding_function can be supplied at init and reassigned later."""

    def first(sample):
        return "utf-8"

    def second(sample):
        return "utf-16"

    detector = Detector(encoding_function=first)
    assert detector.encoding_function == first
    detector.encoding_function = second
    assert detector.encoding_function == second
def test_describe_resource_schema_check_type_boolean_string_tie():
    """A boolean/string confidence tie is resolved in favour of string."""
    resource = Resource.describe(
        [["f"], ["stringish"]],
        layout=Layout(header=False),
        detector=Detector(field_names=["field"]),
    )
    assert resource.schema.get_field("field").type == "string"
def test_validate_detector_headers_errors():
    """With schema_sync, only a genuine required-constraint violation is reported."""
    data = [
        ["id", "last_name", "first_name", "language"],
        [1, "Alex", "John", "English"],
        [2, "Peters", "John", "Afrikaans"],
        [3, "Smith", "Paul", None],
    ]
    schema = {
        "fields": [
            {"name": "id", "type": "number"},
            {"name": "language", "constraints": {"required": True}},
            {"name": "country"},
        ]
    }
    report = validate(data, schema=schema, detector=Detector(schema_sync=True))
    assert report.flatten(["rowPosition", "fieldPosition", "code"]) == [
        [4, 4, "constraint-error"],
    ]
def test_validate_field_missing_values():
    """CLI --field-missing-values matches the Detector API equivalent."""
    result = runner.invoke(
        program, "validate data/table.csv --json --field-missing-values 1"
    )
    assert result.exit_code == 0
    expected = validate(
        "data/table.csv", detector=Detector(field_missing_values=["1"])
    )
    assert no_time(json.loads(result.stdout)) == no_time(expected)
def test_validate_source_invalid():
    """A tiny sample size pushes the failure from open() to row iteration."""
    report = validate([["h"], [1], "bad"], detector=Detector(sample_size=1))
    assert report.flatten(["rowPosition", "fieldPosition", "code"]) == [
        [None, None, "source-error"],
    ]
def test_validate_detector_sync_schema():
    """schema_sync reorders the provided fields to match the file during validation."""
    provided = {
        "fields": [
            {"name": "id", "type": "integer"},
            {"name": "name", "type": "string"},
        ],
    }
    report = validate(
        "data/sync-schema.csv", schema=provided, detector=Detector(schema_sync=True)
    )
    assert report.valid
    assert report.task.resource.schema == {
        "fields": [
            {"name": "name", "type": "string"},
            {"name": "id", "type": "integer"},
        ],
    }
def test_validate_infer_fields_issue_225():
    """A short row is reported as a missing cell, not an inference crash (#225)."""
    table = [["name1", "name2"], ["123", None], ["456", None], ["789"]]
    detector = Detector(schema_patch={"fields": {"name": {"type": "string"}}})
    report = validate(table, detector=detector)
    assert report.flatten(["rowPosition", "fieldPosition", "code"]) == [
        [4, 2, "missing-cell"],
    ]
def test_validate_order_fields_issue_313():
    """Picked columns plus schema_sync validate cleanly despite field order (#313)."""
    schema = {
        "fields": [
            {"name": "Column_1", "type": "string"},
            {"name": "Column_2", "type": "string", "constraints": {"required": True}},
            {"name": "Column_3", "type": "string"},
            {"name": "Column_4", "type": "string"},
            {"name": "Column_5", "type": "string"},
        ]
    }
    report = validate(
        "data/issue-313.xlsx",
        layout=Layout(pick_fields=[1, 2, 3, 4, 5]),
        schema=schema,
        detector=Detector(schema_sync=True),
    )
    assert report.valid
def test_validate_detector_infer_type_any():
    """A field_type of 'any' forces every inferred field to type any."""
    report = validate("data/table.csv", detector=Detector(field_type="any"))
    assert report.valid
    assert report.task.resource.schema == {
        "fields": [
            {"name": "id", "type": "any"},
            {"name": "name", "type": "any"},
        ],
    }
def test_schema_from_sparse_sample():
    """Full confidence still infers integer for a column containing blanks."""
    header = ["id", "age", "name"]
    rows = [
        ["1", "39", "Paul"],
        ["2", "23", "Jimmy"],
        ["3", "", "Jane"],
        ["4", "", "Judy"],
    ]
    inferred = Detector(field_confidence=1).detect_schema(rows, labels=header)
    assert inferred == {
        "fields": [
            {"name": "id", "type": "integer"},
            {"name": "age", "type": "integer"},
            {"name": "name", "type": "string"},
        ],
    }
def test_program_describe_infer_missing_values():
    """CLI describe with --field-missing-values mirrors the API call."""
    result = runner.invoke(
        program, "describe data/table.csv --json --field-missing-values 1"
    )
    assert result.exit_code == 0
    expected = describe(
        "data/table.csv", detector=Detector(field_missing_values=["1"])
    )
    assert json.loads(result.stdout) == expected
def test_program_extract_field_missing_values():
    """CLI extract with --field-missing-values mirrors the API call."""
    result = runner.invoke(
        program, "extract data/table.csv --json --field-missing-values 1"
    )
    assert result.exit_code == 0
    expected = extract(
        "data/table.csv", detector=Detector(field_missing_values=["1"])
    )
    assert json.loads(result.stdout) == expected
def test_program_extract_sync_schema():
    """CLI --schema-sync with a reversed schema equals the API with the plain one."""
    result = runner.invoke(
        program,
        "extract data/table.csv --json --schema data/schema-reverse.json --schema-sync",
    )
    assert result.exit_code == 0
    expected = extract(
        "data/table.csv", schema="data/schema.json", detector=Detector(schema_sync=True)
    )
    assert json.loads(result.stdout) == expected
def test_resource_schema_primary_key_error():
    """A duplicated primary key invalidates only the offending row."""
    table = [["name"], [1], [2], [2]]
    detector = Detector(schema_patch={"primaryKey": ["name"]})
    with Resource(table, detector=detector) as resource:
        for row in resource:
            if row.row_number == 3:
                # the third body row repeats the value 2 → primary-key violation
                assert not row.valid
                assert row.errors[0].code == "primary-key-error"
            else:
                assert row.valid
def test_validate_wide_table_with_order_fields_issue_277():
    """A wide, reordered table reports constraint errors at the right cells (#277)."""
    report = validate(
        "data/issue-277.csv",
        schema="data/issue-277.json",
        detector=Detector(schema_sync=True),
    )
    assert report.flatten(["rowPosition", "fieldPosition", "code"]) == [
        [49, 50, "constraint-error"],
        [68, 50, "constraint-error"],
        [69, 50, "constraint-error"],
    ]
def test_validate_detector_patch_schema_fields():
    """schema_patch can retype a field and set missingValues in one go."""
    patch = {"fields": {"id": {"type": "string"}}, "missingValues": ["-"]}
    report = validate("data/table.csv", detector=Detector(schema_patch=patch))
    assert report.valid
    assert report.task.resource.schema == {
        "fields": [
            {"name": "id", "type": "string"},
            {"name": "name", "type": "string"},
        ],
        "missingValues": ["-"],
    }
def test_csv_parser_format_tsv():
    """TSV files get a tab delimiter; patched missing values become None."""
    detector = Detector(schema_patch={"missingValues": ["\\N"]})
    with Resource("data/table.tsv", detector=detector) as resource:
        assert resource.dialect == {"delimiter": "\t"}
        assert resource.header == ["id", "name"]
        assert resource.read_rows() == [
            {"id": 1, "name": "english"},
            {"id": 2, "name": "中国人"},
            {"id": 3, "name": None},
        ]