Example #1
0
def test_resource_set_detector():
    """Check that ``Resource.detector`` echoes the constructor value and can be reassigned."""
    initial = Detector(field_missing_values=["na"])
    resource = Resource("data/table.csv", detector=initial)
    assert resource.detector == initial

    # Swap in a second detector and make sure the property reflects it.
    replacement = Detector(sample_size=3)
    resource.detector = replacement
    assert resource.detector == replacement
Example #2
0
def test_xlsx_parser_preserve_formatting():
    """Read an XLSX file with ``preserve_formatting=True`` and compare the cell text.

    The assertion only runs when ``IS_UNIX`` is truthy; on other platforms the
    test returns without checking anything.
    """
    source = "data/preserve-formatting.xlsx"
    dialect = ExcelDialect(preserve_formatting=True)
    layout = Layout(header_rows=[1])
    detector = Detector(field_type="any")
    if not IS_UNIX:
        return

    # Keys are the column labels; values are the cell text the test expects.
    expected_row = {
        # general
        "empty": None,
        # numeric
        "0": "1001",
        "0.00": "1000.56",
        "0.0000": "1000.5577",
        "0.00000": "1000.55770",
        "0.0000#": "1000.5577",
        # temporal
        "m/d/yy": "5/20/40",
        "d-mmm": "20-May",
        "mm/dd/yy": "05/20/40",
        "mmddyy": "052040",
        "mmddyyam/pmdd": "052040AM20",
    }
    with Resource(
        source, dialect=dialect, layout=layout, detector=detector
    ) as resource:
        assert resource.read_rows() == [expected_row]
Example #3
0
def test_resource_detector_field_type_with_open():
    """Open a resource with ``Detector(field_type="string")`` and expect string-typed fields and cells."""
    expected_schema = {
        "fields": [
            {"name": "id", "type": "string"},
            {"name": "name", "type": "string"},
        ]
    }
    expected_rows = [
        {"id": "1", "name": "english"},
        {"id": "2", "name": "中国人"},
    ]
    string_detector = Detector(field_type="string")
    with Resource("data/table.csv", detector=string_detector) as resource:
        assert resource.header == ["id", "name"]
        assert resource.schema == expected_schema
        assert resource.read_rows() == expected_rows
Example #4
0
def test_resource_detector_field_type():
    """Infer a resource with ``Detector(field_type="string")`` and expect string-typed fields and cells."""
    expected_schema = {
        "fields": [
            {"name": "id", "type": "string"},
            {"name": "name", "type": "string"},
        ]
    }
    expected_rows = [
        {"id": "1", "name": "english"},
        {"id": "2", "name": "中国人"},
    ]
    string_detector = Detector(field_type="string")
    resource = Resource(path="data/table.csv", detector=string_detector)
    resource.infer(stats=True)
    assert resource.schema == expected_schema
    assert resource.header == ["id", "name"]
    assert resource.read_rows() == expected_rows
Example #5
0
def test_resource_detector_field_float_numbers():
    """Infer inline data with ``field_float_numbers=True`` and expect float cells plus ``floatNumber`` in the schema."""
    inline_data = [["number"], ["1.1"], ["2.2"], ["3.3"]]
    expected_schema = {
        "fields": [
            {"name": "number", "type": "number", "floatNumber": True},
        ]
    }
    expected_rows = [{"number": 1.1}, {"number": 2.2}, {"number": 3.3}]

    float_detector = Detector(field_float_numbers=True)
    resource = Resource(data=inline_data, detector=float_detector)
    resource.infer(stats=True)
    assert resource.schema == expected_schema
    assert resource.header == ["number"]
    assert resource.read_rows() == expected_rows
Example #6
0
def test_resource_detector_field_names():
    """Infer a resource with ``field_names`` overrides.

    Expects the schema, header and row keys to use the new names while
    ``labels`` still holds the names found in the file.
    """
    renamed = ["new1", "new2"]
    expected_schema = {
        "fields": [
            {"name": "new1", "type": "integer"},
            {"name": "new2", "type": "string"},
        ]
    }
    expected_rows = [
        {"new1": 1, "new2": "english"},
        {"new1": 2, "new2": "中国人"},
    ]
    resource = Resource(
        path="data/table.csv",
        detector=Detector(field_names=renamed),
    )
    resource.infer(stats=True)
    assert resource.schema == expected_schema
    assert resource.labels == ["id", "name"]
    assert resource.header == ["new1", "new2"]
    assert resource.read_rows() == expected_rows
Example #7
0
def test_resource_detector_schema_patch_with_infer():
    """Infer a resource whose ``schema_patch`` renames and retypes the ``id`` field.

    Expects the patched name ``ID`` in the schema, header and rows, while
    ``labels`` keeps the original ``id``.
    """
    field_patch = {"id": {"name": "ID", "type": "string"}}
    expected_schema = {
        "fields": [
            {"name": "ID", "type": "string"},
            {"name": "name", "type": "string"},
        ]
    }
    expected_rows = [
        {"ID": "1", "name": "english"},
        {"ID": "2", "name": "中国人"},
    ]
    patching_detector = Detector(schema_patch={"fields": field_patch})
    resource = Resource(path="data/table.csv", detector=patching_detector)
    resource.infer(stats=True)
    assert resource.schema == expected_schema
    assert resource.labels == ["id", "name"]
    assert resource.header == ["ID", "name"]
    assert resource.read_rows() == expected_rows
Example #8
0
def test_resource_detector_schema_patch_missing_values():
    """Patch ``missingValues`` to ``["1", "2"]`` and expect both ``id`` cells to read as ``None``."""
    missing = ["1", "2"]
    expected_schema = {
        "fields": [
            {"name": "id", "type": "integer"},
            {"name": "name", "type": "string"},
        ],
        "missingValues": ["1", "2"],
    }
    expected_rows = [
        {"id": None, "name": "english"},
        {"id": None, "name": "中国人"},
    ]
    patching_detector = Detector(schema_patch={"missingValues": missing})
    with Resource("data/table.csv", detector=patching_detector) as resource:
        assert resource.header == ["id", "name"]
        assert resource.schema == expected_schema
        assert resource.read_rows() == expected_rows
Example #9
0
def test_resource_detector_schema_sync_with_infer():
    """Infer ``data/sync-schema.csv`` with a provided schema and ``schema_sync=True``.

    Expects the resource schema to equal the provided one and the sample,
    fragment, header and rows to match the values listed below.
    """
    declared_schema = {
        "fields": [
            {"name": "name", "type": "string"},
            {"name": "id", "type": "integer"},
        ]
    }
    data_rows = [["english", "1"], ["中国人", "2"]]
    expected_rows = [
        {"id": 1, "name": "english"},
        {"id": 2, "name": "中国人"},
    ]
    resource = Resource(
        path="data/sync-schema.csv",
        schema=declared_schema,
        detector=Detector(schema_sync=True),
    )
    resource.infer(stats=True)
    assert resource.schema == declared_schema
    # The sample is the header row followed by the data rows; the fragment is data only.
    assert resource.sample == [["name", "id"]] + data_rows
    assert resource.fragment == data_rows
    assert resource.header == ["name", "id"]
    assert resource.read_rows() == expected_rows
Example #10
0
def test_resource_detector_field_names_with_open():
    """Open a resource with ``field_names`` overrides.

    Expects the schema, header and row keys to use the new names while
    ``labels`` still holds the names found in the file.
    """
    expected_schema = {
        "fields": [
            {"name": "new1", "type": "integer"},
            {"name": "new2", "type": "string"},
        ]
    }
    expected_rows = [
        {"new1": 1, "new2": "english"},
        {"new1": 2, "new2": "中国人"},
    ]
    renaming_detector = Detector(field_names=["new1", "new2"])
    with Resource("data/table.csv", detector=renaming_detector) as resource:
        assert resource.schema == expected_schema
        assert resource.labels == ["id", "name"]
        assert resource.header == ["new1", "new2"]
        assert resource.read_rows() == expected_rows
def test_resource_control():
    """Open a resource with a custom ``encoding_function`` and check encoding, sample, fragment and header."""
    data_rows = [["1", "english"], ["2", "中国人"]]
    fixed_encoding = Detector(encoding_function=lambda sample: "utf-8")
    with Resource("data/table.csv", detector=fixed_encoding) as resource:
        # The encoding reported is the one returned by the custom function.
        assert resource.encoding == "utf-8"
        assert resource.sample == [["id", "name"]] + data_rows
        assert resource.fragment == data_rows
        assert resource.header == ["id", "name"]
Example #12
0
def test_validate_infer_fields_issue_223():
    """Validate inline data whose ``schema_patch`` targets a field name (``name``) absent from the header."""
    source = [
        ["name1", "name2"],
        ["123", "abc"],
        ["456", "def"],
        ["789", "ghi"],
    ]
    patch = {"fields": {"name": {"type": "string"}}}
    resource = Resource(source, detector=Detector(schema_patch=patch))
    assert resource.validate().valid
Example #13
0
def test_schema_infer_no_names():
    """Detect a schema from a label-less sample and expect the single field to be named ``field1``."""
    rows = [[1], [2], [3]]
    expected = {"fields": [{"name": "field1", "type": "integer"}]}
    detected = Detector().detect_schema(rows)
    assert detected == expected
def test_detector_set_encoding_function():
    """Check that ``Detector.encoding_function`` echoes the constructor value and can be reassigned."""

    def always_utf8(sample):
        return "utf-8"

    def always_utf16(sample):
        return "utf-16"

    detector = Detector(encoding_function=always_utf8)
    assert detector.encoding_function == always_utf8

    detector.encoding_function = always_utf16
    assert detector.encoding_function == always_utf16
Example #15
0
def test_describe_resource_schema_check_type_boolean_string_tie():
    """Describe header-less data ``["f"], ["stringish"]`` and expect the field type to be ``string``."""
    rows = [["f"], ["stringish"]]
    resource = Resource.describe(
        rows,
        layout=Layout(header=False),
        detector=Detector(field_names=["field"]),
    )
    described_field = resource.schema.get_field("field")
    assert described_field.type == "string"
Example #16
0
def test_validate_detector_headers_errors():
    """Validate with ``schema_sync=True`` against a schema that differs from the header.

    The schema omits two header columns and names one (``country``) that the
    data lacks; the only error expected is a constraint-error at row 4, field 4.
    """
    source = [
        ["id", "last_name", "first_name", "language"],
        [1, "Alex", "John", "English"],
        [2, "Peters", "John", "Afrikaans"],
        [3, "Smith", "Paul", None],
    ]
    schema = {
        "fields": [
            {"name": "id", "type": "number"},
            {"name": "language", "constraints": {"required": True}},
            {"name": "country"},
        ]
    }
    report = validate(source, schema=schema, detector=Detector(schema_sync=True))
    flat_errors = report.flatten(["rowPosition", "fieldPosition", "code"])
    assert flat_errors == [
        [4, 4, "constraint-error"],
    ]
Example #17
0
def test_validate_field_missing_values():
    """Compare the CLI ``validate --field-missing-values 1`` output with the equivalent API call."""
    command = "validate data/table.csv --json --field-missing-values 1"
    result = runner.invoke(program, command)
    assert result.exit_code == 0

    # Both reports go through ``no_time`` before they are compared.
    cli_report = no_time(json.loads(result.stdout))
    api_report = no_time(
        validate("data/table.csv", detector=Detector(field_missing_values=["1"]))
    )
    assert cli_report == api_report
Example #18
0
def test_validate_source_invalid():
    """Validate inline data containing a malformed row and expect one ``source-error``."""
    malformed_source = [["h"], [1], "bad"]
    # A sample size of 1 is used so the error is raised on iteration, not on open.
    small_sample = Detector(sample_size=1)
    report = validate(malformed_source, detector=small_sample)
    flat_errors = report.flatten(["rowPosition", "fieldPosition", "code"])
    assert flat_errors == [
        [None, None, "source-error"],
    ]
Example #19
0
def test_validate_detector_sync_schema():
    """Validate ``data/sync-schema.csv`` with ``schema_sync=True``.

    The schema is declared as (id, name); the resulting resource schema is
    expected in the order (name, id).
    """
    declared_schema = {
        "fields": [
            {"name": "id", "type": "integer"},
            {"name": "name", "type": "string"},
        ],
    }
    synced_schema = {
        "fields": [
            {"name": "name", "type": "string"},
            {"name": "id", "type": "integer"},
        ],
    }
    report = validate(
        "data/sync-schema.csv",
        schema=declared_schema,
        detector=Detector(schema_sync=True),
    )
    assert report.valid
    assert report.task.resource.schema == synced_schema
Example #20
0
def test_validate_infer_fields_issue_225():
    """Validate data with a short last row and a ``schema_patch`` for an absent field.

    The only error expected is a missing-cell at row 4, field 2.
    """
    source = [
        ["name1", "name2"],
        ["123", None],
        ["456", None],
        ["789"],
    ]
    patch = {"fields": {"name": {"type": "string"}}}
    report = validate(source, detector=Detector(schema_patch=patch))
    flat_errors = report.flatten(["rowPosition", "fieldPosition", "code"])
    assert flat_errors == [
        [4, 2, "missing-cell"],
    ]
Example #21
0
def test_validate_order_fields_issue_313():
    """Validate ``data/issue-313.xlsx`` with fields 1-5 picked and ``schema_sync=True``; expect a valid report."""
    schema = {
        "fields": [
            {"name": "Column_1", "type": "string"},
            {
                "name": "Column_2",
                "type": "string",
                "constraints": {"required": True},
            },
            {"name": "Column_3", "type": "string"},
            {"name": "Column_4", "type": "string"},
            {"name": "Column_5", "type": "string"},
        ]
    }
    report = validate(
        "data/issue-313.xlsx",
        layout=Layout(pick_fields=[1, 2, 3, 4, 5]),
        schema=schema,
        detector=Detector(schema_sync=True),
    )
    assert report.valid
Example #22
0
def test_validate_detector_infer_type_any():
    """Validate with ``Detector(field_type="any")`` and expect both fields typed as ``any``."""
    expected_schema = {
        "fields": [
            {"name": "id", "type": "any"},
            {"name": "name", "type": "any"},
        ],
    }
    report = validate("data/table.csv", detector=Detector(field_type="any"))
    assert report.valid
    assert report.task.resource.schema == expected_schema
def test_schema_from_sparse_sample():
    """Detect a schema with ``field_confidence=1`` from a sample with empty ``age`` cells.

    The ``age`` column is still expected to be typed as integer.
    """
    column_labels = ["id", "age", "name"]
    rows = [
        ["1", "39", "Paul"],
        ["2", "23", "Jimmy"],
        ["3", "", "Jane"],
        ["4", "", "Judy"],
    ]
    expected = {
        "fields": [
            {"name": "id", "type": "integer"},
            {"name": "age", "type": "integer"},
            {"name": "name", "type": "string"},
        ],
    }
    strict_detector = Detector(field_confidence=1)
    detected = strict_detector.detect_schema(rows, labels=column_labels)
    assert detected == expected
def test_program_describe_infer_missing_values():
    """Compare the CLI ``describe --field-missing-values 1`` JSON output with the equivalent API call."""
    command = "describe data/table.csv --json --field-missing-values 1"
    result = runner.invoke(program, command)
    assert result.exit_code == 0

    cli_output = json.loads(result.stdout)
    api_output = describe(
        "data/table.csv",
        detector=Detector(field_missing_values=["1"]),
    )
    assert cli_output == api_output
Example #25
0
def test_program_extract_field_missing_values():
    """Compare the CLI ``extract --field-missing-values 1`` JSON output with the equivalent API call."""
    command = "extract data/table.csv --json --field-missing-values 1"
    result = runner.invoke(program, command)
    assert result.exit_code == 0

    cli_output = json.loads(result.stdout)
    api_output = extract(
        "data/table.csv",
        detector=Detector(field_missing_values=["1"]),
    )
    assert cli_output == api_output
Example #26
0
def test_program_extract_sync_schema():
    """Compare the CLI ``extract --schema-sync`` JSON output with an API call using ``schema_sync=True``."""
    # NOTE(review): the CLI is given data/schema-reverse.json while the API call
    # is given data/schema.json -- presumably schema sync makes the two
    # equivalent; confirm this difference is intentional.
    command = (
        "extract data/table.csv --json --schema data/schema-reverse.json --schema-sync"
    )
    result = runner.invoke(program, command)
    assert result.exit_code == 0

    cli_output = json.loads(result.stdout)
    api_output = extract(
        "data/table.csv",
        schema="data/schema.json",
        detector=Detector(schema_sync=True),
    )
    assert cli_output == api_output
def test_resource_schema_primary_key_error():
    """Iterate rows with a patched ``primaryKey`` and expect row number 3 to carry a primary-key-error.

    Every other row is expected to be valid.
    """
    rows_with_duplicate = [["name"], [1], [2], [2]]
    keyed_detector = Detector(schema_patch={"primaryKey": ["name"]})
    with Resource(rows_with_duplicate, detector=keyed_detector) as resource:
        for row in resource:
            if row.row_number != 3:
                assert row.valid
            else:
                assert row.valid is False
                assert row.errors[0].code == "primary-key-error"
Example #28
0
def test_validate_wide_table_with_order_fields_issue_277():
    """Validate ``data/issue-277.csv`` with ``schema_sync=True`` and expect three constraint-errors in field 50."""
    report = validate(
        "data/issue-277.csv",
        schema="data/issue-277.json",
        detector=Detector(schema_sync=True),
    )
    flat_errors = report.flatten(["rowPosition", "fieldPosition", "code"])
    assert flat_errors == [
        [49, 50, "constraint-error"],
        [68, 50, "constraint-error"],
        [69, 50, "constraint-error"],
    ]
Example #29
0
def test_validate_detector_patch_schema_fields():
    """Validate with a ``schema_patch`` that retypes ``id`` and sets ``missingValues``.

    Expects a valid report and a resource schema with both changes applied.
    """
    patch = {
        "fields": {"id": {"type": "string"}},
        "missingValues": ["-"],
    }
    expected_schema = {
        "fields": [
            {"name": "id", "type": "string"},
            {"name": "name", "type": "string"},
        ],
        "missingValues": ["-"],
    }
    report = validate("data/table.csv", detector=Detector(schema_patch=patch))
    assert report.valid
    assert report.task.resource.schema == expected_schema
Example #30
0
def test_csv_parser_format_tsv():
    """Open ``data/table.tsv`` with ``\\N`` patched in as a missing value.

    Expects a tab delimiter in the dialect and ``None`` for the third row's name.
    """
    expected_rows = [
        {"id": 1, "name": "english"},
        {"id": 2, "name": "中国人"},
        {"id": 3, "name": None},
    ]
    null_marker_detector = Detector(schema_patch={"missingValues": ["\\N"]})
    with Resource("data/table.tsv", detector=null_marker_detector) as resource:
        assert resource.dialect == {"delimiter": "\t"}
        assert resource.header == ["id", "name"]
        assert resource.read_rows() == expected_rows