Exemple #1
0
def test_transform_package():
    """Merge chunk2 into chunk1, remove chunk2, and check what is left.

    The package is described from the ``data/chunk*.csv`` glob, then
    transformed so that only ``chunk1`` remains and holds both rows.
    """
    source = describe("data/chunk*.csv")

    # Build the pipeline pieces one by one, innermost first.
    merge_chunk2 = steps.table_merge(resource="chunk2")
    update_chunk1 = steps.resource_transform(name="chunk1", steps=[merge_chunk2])
    drop_chunk2 = steps.resource_remove(name="chunk2")

    target = transform(source, steps=[update_chunk1, drop_chunk2])

    expected_rows = [
        {"id": 1, "name": "english"},
        {"id": 2, "name": "中国人"},
    ]
    assert target.resource_names == ["chunk1"]
    assert target.get_resource("chunk1").read_rows() == expected_rows
Exemple #2
0
def test_describe_with_stats():
    """Describe ``data/table.csv`` with ``stats=True`` and compare the descriptor.

    The descriptor must be valid metadata and must include the ``stats``
    section (hash, bytes, fields, rows) next to the inferred schema.
    """
    expected_schema = {
        "fields": [
            {"name": "id", "type": "integer"},
            {"name": "name", "type": "string"},
        ]
    }
    expected_stats = {
        "hash": "6c2c61dd9b0e9c6876139a449ed87933",
        "bytes": 30,
        "fields": 2,
        "rows": 2,
    }
    expected = {
        "profile": "tabular-data-resource",
        "name": "table",
        "path": "data/table.csv",
        "scheme": "file",
        "format": "csv",
        "hashing": "md5",
        "encoding": "utf-8",
        "schema": expected_schema,
        "stats": expected_stats,
    }

    resource = describe("data/table.csv", stats=True)

    assert resource.metadata_valid
    assert resource == expected
Exemple #3
0
def test_describe_resource():
    """Describe ``data/table.csv`` and compare against the full descriptor.

    The expected descriptor in this variant includes compression, control,
    dialect and query entries as well as the schema and stats.
    """
    expected_schema = {
        "fields": [
            {"name": "id", "type": "integer"},
            {"name": "name", "type": "string"},
        ]
    }
    expected_stats = {
        "hash": "6c2c61dd9b0e9c6876139a449ed87933",
        "bytes": 30,
        "fields": 2,
        "rows": 2,
    }
    expected = {
        "profile": "tabular-data-resource",
        "name": "table",
        "path": "data/table.csv",
        "scheme": "file",
        "format": "csv",
        "hashing": "md5",
        "encoding": "utf-8",
        "compression": "no",
        "compressionPath": "",
        "control": {"newline": ""},
        "dialect": {},
        "query": {},
        "schema": expected_schema,
        "stats": expected_stats,
    }

    resource = describe("data/table.csv")

    assert resource.metadata_valid
    assert resource == expected
def test_describe_package():
    """Describe the ``data/chunk*.csv`` glob and compare the package descriptor.

    Both chunk resources are spelled out in full; they differ only in path,
    name, and the hash/bytes values of their stats.
    """
    chunk1 = {
        "path": "data/chunk1.csv",
        "profile": "tabular-data-resource",
        "name": "chunk1",
        "scheme": "file",
        "format": "csv",
        "hashing": "md5",
        "encoding": "utf-8",
        "innerpath": "",
        "compression": "",
        "control": {"newline": ""},
        "dialect": {},
        "layout": {},
        "schema": {
            "fields": [
                {"name": "id", "type": "integer"},
                {"name": "name", "type": "string"},
            ]
        },
        "stats": {
            "hash": "8fff9d97e5c0cb77b7c469ec37c8e766",
            "bytes": 18,
            "fields": 2,
            "rows": 1,
        },
    }
    chunk2 = {
        "path": "data/chunk2.csv",
        "profile": "tabular-data-resource",
        "name": "chunk2",
        "scheme": "file",
        "format": "csv",
        "hashing": "md5",
        "encoding": "utf-8",
        "innerpath": "",
        "compression": "",
        "control": {"newline": ""},
        "dialect": {},
        "layout": {},
        "schema": {
            "fields": [
                {"name": "id", "type": "integer"},
                {"name": "name", "type": "string"},
            ]
        },
        "stats": {
            "hash": "ebfa07d04a148a92a18078f78468694d",
            "bytes": 20,
            "fields": 2,
            "rows": 1,
        },
    }
    expected = {"profile": "data-package", "resources": [chunk1, chunk2]}

    package = describe("data/chunk*.csv")

    assert package.metadata_valid
    assert package == expected
Exemple #5
0
def fetch_and_read(resource_dict, site_url, api_key):
    """
    Fetch and read source type, metadata and schema from
    a CKAN resource URL.

    Args:
        resource_dict: Mapping that provides ``'path'`` (handed to
            ``describe``) and ``'ckan_resource_id'`` (reported in every
            status update).
        site_url: Passed through unchanged to ``aircan_status_update``.
        api_key: Passed through unchanged to ``aircan_status_update``.

    Returns:
        dict: ``{'success': True, 'sucess': True, 'resource': resource}``
        when describing the path and posting the progress status both
        succeed, otherwise ``{"success": False, "errors": [err]}`` where
        ``err`` is the caught exception.
    """
    logging.info('Fetching resource data from url')
    try:
        resource = describe(resource_dict['path'], type="resource")
        status_dict = {
            'res_id': resource_dict['ckan_resource_id'],
            'state': 'progress',
            'message':
            'Fetching datafile from {0}.'.format(resource_dict['path']),
        }
        aircan_status_update(site_url, api_key, status_dict)
        # The original success payload used the misspelled key 'sucess' while
        # the failure payload used 'success', so callers checking 'success'
        # never saw a truthy value. Emit the correct key and keep the legacy
        # spelling so existing callers that read 'sucess' keep working.
        return {'success': True, 'sucess': True, 'resource': resource}

    except Exception as err:
        # Broad catch is deliberate: any failure is reported back to the
        # status endpoint and returned to the caller instead of raised.
        status_dict = {
            'res_id': resource_dict['ckan_resource_id'],
            'state': 'error',
            'message': 'Failed to fetch data file.',
            'error': str(err)
        }
        aircan_status_update(site_url, api_key, status_dict)
        return {"success": False, "errors": [err]}
def test_describe_package_with_glob_having_one_incorrect_dialect_1126():
    """Describe a glob as a package with a comma delimiter and check both schemas.

    ``country-1`` is expected to have four typed fields, while ``country-2``
    is expected to have a single string field.
    """
    expected_country_1 = {
        "fields": [
            {"type": "integer", "name": "id"},
            {"type": "integer", "name": "neighbor_id"},
            {"type": "string", "name": "name"},
            {"type": "integer", "name": "population"},
        ]
    }
    expected_country_2 = {
        "fields": [{"type": "string", "name": "# Author: the scientist"}]
    }

    package = describe(
        "data/country-*.csv",
        dialect={"delimiter": ","},
        type="package",
    )
    resource_1 = package.get_resource("country-1")
    resource_2 = package.get_resource("country-2")

    assert resource_1["schema"] == expected_country_1
    assert resource_2["schema"] == expected_country_2
def test_push_blob(vcr, sample_file, client):
    """Push the described sample file through the client and check the reply.

    The push runs inside the ``test_push_blob.yaml`` cassette context; the
    result must be non-None and carry a truthy ``fileExists`` entry.
    """
    described = describe(sample_file)

    with vcr('test_push_blob.yaml'):
        response = client.push_blob(described)

    assert response is not None
    assert response['fileExists']
def test_program_describe_infer_missing_values():
    """CLI ``--field-missing-values 1`` must match a Detector with ``["1"]``."""
    command = "describe data/table.csv --json --field-missing-values 1"
    result = runner.invoke(program, command)
    assert result.exit_code == 0

    # Parse the CLI output first, then build the API-side expectation.
    actual = json.loads(result.stdout)
    expected = describe(
        "data/table.csv",
        detector=Detector(field_missing_values=["1"]),
    )
    assert actual == expected
def test_program_describe_header_join():
    """CLI header-rows/header-join options must match the equivalent layout dict."""
    command = "describe data/table.csv --json --header-rows '1,2' --header-join ':'"
    result = runner.invoke(program, command)
    assert result.exit_code == 0

    # Parse the CLI output first, then build the API-side expectation.
    actual = json.loads(result.stdout)
    expected = describe(
        "data/table.csv",
        layout={"headerRows": [1, 2], "headerJoin": ":"},
    )
    assert actual == expected
Exemple #10
0
def test_describe_package_hashing():
    """Describe the chunk glob with ``hashing="sha256"`` and check each resource.

    Every resource must report the sha256 algorithm and the expected digest
    in its stats.
    """
    expected_digests = (
        ("chunk1", "3872c98bd72eb4a91ac666f7758cd83da904c61a35178ca1ce9e10d6b009cd21"),
        ("chunk2", "556e92cdacfc46c2338ab0b88daf9d560c6760eac2d4cb6f7df589c108fc07ce"),
    )
    package = describe("data/chunk*.csv", hashing="sha256")

    # Check the algorithm on all resources before any digest, so the first
    # failing assertion is the same as with sequential asserts.
    for name, _ in expected_digests:
        assert package.get_resource(name).hashing == "sha256"
    for name, digest in expected_digests:
        assert package.get_resource(name).stats["hash"] == digest
Exemple #11
0
def test_describe_resource_schema_infer_volume():
    """Describe the row-limit fixture with ``infer_volume=4`` and check the schema."""
    expected_schema = {
        "fields": [
            {"name": "id", "type": "integer"},
            {"name": "age", "type": "integer"},
            {"name": "name", "type": "string"},
        ],
    }
    resource = describe("data/table-infer-row-limit.csv", infer_volume=4)
    assert resource.schema == expected_schema
Exemple #12
0
def test_describe_resource_schema_with_missing_values_default():
    """With default options the ``id`` column of the fixture is typed as string."""
    expected_schema = {
        "fields": [
            {"name": "id", "type": "string"},
            {"name": "age", "type": "integer"},
            {"name": "name", "type": "string"},
        ],
    }
    resource = describe("data/table-infer-missing-values.csv")
    assert resource.schema == expected_schema
Exemple #13
0
def test_describe_resource_schema_xlsx_file_with_boolean_column_issue_203():
    """Describe the xlsx fixture and expect integer, string and boolean fields."""
    expected_schema = {
        "fields": [
            {"name": "number", "type": "integer"},
            {"name": "string", "type": "string"},
            {"name": "boolean", "type": "boolean"},
        ],
    }
    resource = describe("data/table-infer-boolean.xlsx")
    assert resource.schema == expected_schema
Exemple #14
0
def test_describe_resource_values_with_leading_zeros_issue_492():
    """The ``value`` column of the leading-zeros fixture is integer and reads as 1, 2, 3."""
    expected_schema = {"fields": [{"name": "value", "type": "integer"}]}
    expected_rows = [{"value": 1}, {"value": 2}, {"value": 3}]

    resource = describe("data/leading-zeros.csv")

    assert resource.schema == expected_schema
    assert resource.read_rows() == expected_rows
Exemple #15
0
def test_describe_whitespace_cells_issue_7():
    """Cells holding a single space are expected to give a string-typed column."""
    csv_text = "header1,header2\n1, \n2, \n3, \n"
    expected_schema = {
        "fields": [
            {"name": "header1", "type": "integer"},
            {"name": "header2", "type": "string"},
        ]
    }
    resource = describe(csv_text, scheme="text", format="csv")
    assert resource.schema == expected_schema
Exemple #16
0
def test_describe_resource_schema_utf8():
    """Describe the utf8 fixture and check the id/age/name schema."""
    expected_schema = {
        "fields": [
            {"name": "id", "type": "integer"},
            {"name": "age", "type": "integer"},
            {"name": "name", "type": "string"},
        ],
    }
    resource = describe("data/table-infer-utf8.csv")
    assert resource.schema == expected_schema
def test_validate_less_actual_fields_with_required_constraint_issue_950():
    """Validate a table against a schema that has an extra required field.

    A required field ``bad`` is appended to the schema described from
    ``data/table.csv``; the report is expected to list one missing label
    and a missing cell for rows 2 and 3, all at field position 3.
    """
    schema = describe("data/table.csv", type="schema")
    extra_field = Field(name="bad", constraints={"required": True})
    schema.add_field(extra_field)

    report = validate("data/table.csv", schema=schema)

    expected_errors = [
        [None, 3, "missing-label"],
        [2, 3, "missing-cell"],
        [3, 3, "missing-cell"],
    ]
    flattened = report.flatten(["rowPosition", "fieldPosition", "code"])
    assert flattened == expected_errors
Exemple #18
0
def test_describe_whitespace_cells_with_skip_initial_space_issue_7():
    """With ``skip_initial_space=True`` the second column is expected to be ``any``."""
    csv_text = "header1,header2\n1, \n2, \n3, \n"
    expected_schema = {
        "fields": [
            {"name": "header1", "type": "integer"},
            {"name": "header2", "type": "any"},
        ]
    }
    csv_dialect = CsvDialect(skip_initial_space=True)
    resource = describe(csv_text, scheme="text", format="csv", dialect=csv_dialect)
    assert resource.schema == expected_schema
def test_describe_package_with_incorrect_dialect_1126():
    """Describe ``country-2.csv`` as a package with a comma delimiter.

    The resulting schema is expected to hold a single string field.
    """
    expected_schema = {
        "fields": [{"type": "string", "name": "# Author: the scientist"}]
    }
    package = describe(
        "data/country-2.csv",
        dialect={"delimiter": ","},
        type="package",
    )
    country_2 = package.get_resource("country-2")
    assert country_2["schema"] == expected_schema
Exemple #20
0
def test_describe_resource_schema_with_missing_values_using_the_argument():
    """Passing ``infer_missing_values=["-"]`` is expected to type ``id`` as integer.

    The schema is also expected to record ``"-"`` under ``missingValues``.
    """
    expected_schema = {
        "fields": [
            {"name": "id", "type": "integer"},
            {"name": "age", "type": "integer"},
            {"name": "name", "type": "string"},
        ],
        "missingValues": ["-"],
    }
    resource = describe(
        "data/table-infer-missing-values.csv",
        infer_missing_values=["-"],
    )
    assert resource.schema == expected_schema
Exemple #21
0
def test_describe_resource_schema_expand():
    """With ``expand=True`` the schema is expected to carry explicit defaults.

    That covers ``format`` on every field, ``bareNumber`` on the integer
    fields, and a schema-level ``missingValues`` list.
    """
    expected_fields = [
        {"name": "id", "type": "integer", "format": "default", "bareNumber": True},
        {"name": "age", "type": "integer", "format": "default", "bareNumber": True},
        {"name": "name", "type": "string", "format": "default"},
    ]
    expected_schema = {"fields": expected_fields, "missingValues": [""]}

    resource = describe("data/table-infer.csv", expand=True)
    assert resource.schema == expected_schema
Exemple #22
0
def test_describe_non_tabular_html_issue_715():
    """Describe an HTML file and expect a plain ``data-resource`` descriptor."""
    expected = {
        "path": "data/text.html",
        "name": "text",
        "profile": "data-resource",
        "scheme": "file",
        "format": "html",
        "hashing": "md5",
        "encoding": "utf-8",
    }
    resource = describe("data/text.html")
    assert resource == expected
def test_describe_resource_schema_increase_limit_issue_212():
    """With ``infer_volume=200`` column ``b`` is expected to be typed as number."""
    expected_schema = {
        "fields": [
            {"name": "a", "type": "integer"},
            {"name": "b", "type": "number"},
        ],
    }
    resource = describe("data/table-infer-increase-limit.csv", infer_volume=200)
    assert resource.schema == expected_schema
def test_describe_package():
    """Describe the ``data/tables/chunk*.csv`` glob and compare the descriptor.

    Metadata validity is always checked; the full descriptor comparison
    only runs when ``IS_UNIX`` is truthy.
    """
    package = describe("data/tables/chunk*.csv")
    assert package.metadata_valid

    # The descriptor comparison is skipped when IS_UNIX is falsy.
    if not IS_UNIX:
        return

    chunk1 = {
        "path": "data/tables/chunk1.csv",
        "profile": "tabular-data-resource",
        "name": "chunk1",
        "scheme": "file",
        "format": "csv",
        "hashing": "md5",
        "encoding": "utf-8",
        "schema": {
            "fields": [
                {"name": "id", "type": "integer"},
                {"name": "name", "type": "string"},
            ]
        },
    }
    chunk2 = {
        "path": "data/tables/chunk2.csv",
        "profile": "tabular-data-resource",
        "name": "chunk2",
        "scheme": "file",
        "format": "csv",
        "hashing": "md5",
        "encoding": "utf-8",
        "schema": {
            "fields": [
                {"name": "id", "type": "integer"},
                {"name": "name", "type": "string"},
            ]
        },
    }
    expected = {"profile": "data-package", "resources": [chunk1, chunk2]}
    assert package == expected
Exemple #25
0
def test_describe_resource_schema_increase_limit_issue_212():
    """With a Detector of ``sample_size=200`` column ``b`` is expected to be number."""
    expected_schema = {
        "fields": [
            {"name": "a", "type": "integer"},
            {"name": "b", "type": "number"},
        ],
    }
    wide_detector = Detector(sample_size=200)
    resource = describe(
        "data/table-infer-increase-limit.csv",
        detector=wide_detector,
    )
    assert resource.schema == expected_schema
Exemple #26
0
def test_describe_non_tabular_resource_issue_641():
    """Describe a PDF with ``stats=True``; expect only hash and bytes in stats."""
    expected_stats = {
        "hash": "3a503daaa773a3ea32b1fedd9fece844",
        "bytes": 262443,
    }
    expected = {
        "path": "data/document.pdf",
        "name": "document",
        "profile": "data-resource",
        "scheme": "file",
        "format": "pdf",
        "hashing": "md5",
        "encoding": "utf-8",
        "stats": expected_stats,
    }
    resource = describe("data/document.pdf", stats=True)
    assert resource == expected
Exemple #27
0
def test_describe_blank_cells_issue_7():
    """A column of empty cells in a bytes CSV source is expected to be typed ``any``."""
    csv_bytes = b"header1,header2\n1,\n2,\n3,\n"
    expected_schema = {
        "fields": [
            {"name": "header1", "type": "integer"},
            {"name": "header2", "type": "any"},
        ]
    }
    resource = describe(csv_bytes, format="csv")
    assert resource.schema == expected_schema
Exemple #28
0
def test_describe_resource():
    """Describe ``data/table.csv`` and compare against the descriptor without stats."""
    expected_schema = {
        "fields": [
            {"name": "id", "type": "integer"},
            {"name": "name", "type": "string"},
        ]
    }
    expected = {
        "profile": "tabular-data-resource",
        "name": "table",
        "path": "data/table.csv",
        "scheme": "file",
        "format": "csv",
        "hashing": "md5",
        "encoding": "utf-8",
        "schema": expected_schema,
    }

    resource = describe("data/table.csv")

    assert resource.metadata_valid
    assert resource == expected
def test_describe_package_with_dialect_path_1126():
    """Describe ``country-2.csv`` as a package with the dialect given as a file path.

    With ``data/dialect.json`` supplied, the schema is expected to contain
    four typed fields.
    """
    expected_schema = {
        "fields": [
            {"type": "integer", "name": "id"},
            {"type": "integer", "name": "neighbor_id"},
            {"type": "string", "name": "name"},
            {"type": "integer", "name": "population"},
        ]
    }
    package = describe(
        "data/country-2.csv",
        dialect="data/dialect.json",
        type="package",
    )
    country_2 = package.get_resource("country-2")
    assert country_2["schema"] == expected_schema
Exemple #30
0
def test_describe_package_basepath():
    """Describe ``chunk*.csv`` with ``basepath="data"`` and check each resource.

    Each resource path is expected to be the bare file name, and each
    resource is expected to report ``data`` as its basepath.
    """
    package = describe("chunk*.csv", basepath="data")
    expected_paths = (("chunk1", "chunk1.csv"), ("chunk2", "chunk2.csv"))

    # Check every path before any basepath, so the first failing assertion
    # is the same as with sequential asserts.
    for name, path in expected_paths:
        assert package.get_resource(name).path == path
    for name, _ in expected_paths:
        assert package.get_resource(name).basepath == "data"