def test_transform_package():
    """Merging chunk2 into chunk1 and removing chunk2 leaves one resource with both rows."""
    source = describe("data/chunk*.csv")
    pipeline = [
        steps.resource_transform(
            name="chunk1",
            steps=[steps.table_merge(resource="chunk2")],
        ),
        steps.resource_remove(name="chunk2"),
    ]
    target = transform(source, steps=pipeline)
    expected_rows = [
        {"id": 1, "name": "english"},
        {"id": 2, "name": "中国人"},
    ]
    assert target.resource_names == ["chunk1"]
    assert target.get_resource("chunk1").read_rows() == expected_rows
def test_describe_with_stats():
    """Describing data/table.csv with ``stats=True`` yields a valid descriptor including stats."""
    expected_schema = {
        "fields": [
            {"name": "id", "type": "integer"},
            {"name": "name", "type": "string"},
        ]
    }
    expected_stats = {
        "hash": "6c2c61dd9b0e9c6876139a449ed87933",
        "bytes": 30,
        "fields": 2,
        "rows": 2,
    }
    resource = describe("data/table.csv", stats=True)
    assert resource.metadata_valid
    assert resource == {
        "profile": "tabular-data-resource",
        "name": "table",
        "path": "data/table.csv",
        "scheme": "file",
        "format": "csv",
        "hashing": "md5",
        "encoding": "utf-8",
        "schema": expected_schema,
        "stats": expected_stats,
    }
def test_describe_resource():
    """Describing data/table.csv yields a valid descriptor equal to the expected mapping."""
    described = describe("data/table.csv")
    assert described.metadata_valid
    expected_schema = {
        "fields": [
            {"name": "id", "type": "integer"},
            {"name": "name", "type": "string"},
        ]
    }
    expected_stats = {
        "hash": "6c2c61dd9b0e9c6876139a449ed87933",
        "bytes": 30,
        "fields": 2,
        "rows": 2,
    }
    expected = {
        "profile": "tabular-data-resource",
        "name": "table",
        "path": "data/table.csv",
        "scheme": "file",
        "format": "csv",
        "hashing": "md5",
        "encoding": "utf-8",
        "compression": "no",
        "compressionPath": "",
        "control": {"newline": ""},
        "dialect": {},
        "query": {},
        "schema": expected_schema,
        "stats": expected_stats,
    }
    assert described == expected
def test_describe_package():
    """Describing the data/chunk*.csv glob yields a valid package with two resources."""

    def id_name_schema():
        # Return a fresh mapping on every call so the expected resources
        # do not share nested objects.
        return {
            "fields": [
                {"name": "id", "type": "integer"},
                {"name": "name", "type": "string"},
            ]
        }

    package = describe("data/chunk*.csv")
    assert package.metadata_valid
    expected_chunk1 = {
        "path": "data/chunk1.csv",
        "profile": "tabular-data-resource",
        "name": "chunk1",
        "scheme": "file",
        "format": "csv",
        "hashing": "md5",
        "encoding": "utf-8",
        "innerpath": "",
        "compression": "",
        "control": {"newline": ""},
        "dialect": {},
        "layout": {},
        "schema": id_name_schema(),
        "stats": {
            "hash": "8fff9d97e5c0cb77b7c469ec37c8e766",
            "bytes": 18,
            "fields": 2,
            "rows": 1,
        },
    }
    expected_chunk2 = {
        "path": "data/chunk2.csv",
        "profile": "tabular-data-resource",
        "name": "chunk2",
        "scheme": "file",
        "format": "csv",
        "hashing": "md5",
        "encoding": "utf-8",
        "innerpath": "",
        "compression": "",
        "control": {"newline": ""},
        "dialect": {},
        "layout": {},
        "schema": id_name_schema(),
        "stats": {
            "hash": "ebfa07d04a148a92a18078f78468694d",
            "bytes": 20,
            "fields": 2,
            "rows": 1,
        },
    }
    assert package == {
        "profile": "data-package",
        "resources": [expected_chunk1, expected_chunk2],
    }
def fetch_and_read(resource_dict, site_url, api_key):
    """
    Describe the data file of a CKAN resource and report the outcome.

    The file located at ``resource_dict['path']`` is described as a
    resource, and a status dict is passed to ``aircan_status_update``
    together with ``site_url`` and ``api_key``.

    Args:
        resource_dict: Mapping providing at least ``'path'`` and
            ``'ckan_resource_id'``.
        site_url: Forwarded unchanged to ``aircan_status_update``.
        api_key: Forwarded unchanged to ``aircan_status_update``.

    Returns:
        dict: ``{'success': True, 'sucess': True, 'resource': <description>}``
        when describing and the status update both succeed, otherwise
        ``{'success': False, 'errors': [<exception>]}``.
    """
    logging.info('Fetching resource data from url')
    try:
        resource = describe(resource_dict['path'], type="resource")
        # The 'progress' status is only sent once describe() has returned.
        status_dict = {
            'res_id': resource_dict['ckan_resource_id'],
            'state': 'progress',
            'message': 'Fetching datafile from {0}.'.format(resource_dict['path']),
        }
        aircan_status_update(site_url, api_key, status_dict)
        return {
            # Correctly spelled key, consistent with the failure branch below.
            'success': True,
            # Legacy misspelled key kept so existing callers that read it
            # continue to work.
            'sucess': True,
            'resource': resource,
        }
    except Exception as err:
        # Any failure above (including the status update itself) is reported
        # as an error status and returned to the caller instead of raised.
        status_dict = {
            'res_id': resource_dict['ckan_resource_id'],
            'state': 'error',
            'message': 'Failed to fetch data file.',
            'error': str(err)
        }
        aircan_status_update(site_url, api_key, status_dict)
        return {"success": False, "errors": [err]}
def test_describe_package_with_glob_having_one_incorrect_dialect_1126():
    """With a comma dialect, country-1 gets four typed fields and country-2 a single string field."""
    comma_dialect = {"delimiter": ","}
    package = describe("data/country-*.csv", dialect=comma_dialect, type="package")
    first = package.get_resource("country-1")
    second = package.get_resource("country-2")
    expected_first = {
        "fields": [
            {"type": "integer", "name": "id"},
            {"type": "integer", "name": "neighbor_id"},
            {"type": "string", "name": "name"},
            {"type": "integer", "name": "population"},
        ]
    }
    expected_second = {
        "fields": [{"type": "string", "name": "# Author: the scientist"}]
    }
    assert first["schema"] == expected_first
    assert second["schema"] == expected_second
def test_push_blob(vcr, sample_file, client):
    """Pushing the described sample file returns a result whose 'fileExists' entry is truthy."""
    described = describe(sample_file)
    # NOTE(review): the assertions are kept inside the cassette context;
    # confirm this matches the intended scope.
    with vcr('test_push_blob.yaml'):
        response = client.push_blob(described)
        assert response is not None
        assert response['fileExists']
def test_program_describe_infer_missing_values():
    """The CLI JSON output with --field-missing-values matches describe() with an equivalent Detector."""
    command = "describe data/table.csv --json --field-missing-values 1"
    outcome = runner.invoke(program, command)
    assert outcome.exit_code == 0
    printed = json.loads(outcome.stdout)
    detector = Detector(field_missing_values=["1"])
    assert printed == describe("data/table.csv", detector=detector)
def test_program_describe_header_join():
    """The CLI JSON output with header rows/join options matches describe() with the same layout."""
    command = "describe data/table.csv --json --header-rows '1,2' --header-join ':'"
    outcome = runner.invoke(program, command)
    assert outcome.exit_code == 0
    printed = json.loads(outcome.stdout)
    layout = {"headerRows": [1, 2], "headerJoin": ":"}
    assert printed == describe("data/table.csv", layout=layout)
def test_describe_package_hashing():
    """Each chunk resource reports sha256 hashing and the expected sha256 digest."""
    package = describe("data/chunk*.csv", hashing="sha256")
    expected_digests = {
        "chunk1": "3872c98bd72eb4a91ac666f7758cd83da904c61a35178ca1ce9e10d6b009cd21",
        "chunk2": "556e92cdacfc46c2338ab0b88daf9d560c6760eac2d4cb6f7df589c108fc07ce",
    }
    # Check the hashing algorithm of both resources first, then their digests.
    for name in expected_digests:
        assert package.get_resource(name).hashing == "sha256"
    for name, digest in expected_digests.items():
        assert package.get_resource(name).stats["hash"] == digest
def test_describe_resource_schema_infer_volume():
    """With ``infer_volume=4`` the schema has integer id/age and string name."""
    described = describe("data/table-infer-row-limit.csv", infer_volume=4)
    expected_fields = [
        {"name": "id", "type": "integer"},
        {"name": "age", "type": "integer"},
        {"name": "name", "type": "string"},
    ]
    assert described.schema == {"fields": expected_fields}
def test_describe_resource_schema_with_missing_values_default():
    """Without extra options the id column of the missing-values table is described as string."""
    described = describe("data/table-infer-missing-values.csv")
    expected_fields = [
        {"name": "id", "type": "string"},
        {"name": "age", "type": "integer"},
        {"name": "name", "type": "string"},
    ]
    assert described.schema == {"fields": expected_fields}
def test_describe_resource_schema_xlsx_file_with_boolean_column_issue_203():
    """The xlsx table is described with integer, string and boolean columns."""
    described = describe("data/table-infer-boolean.xlsx")
    expected_fields = [
        {"name": "number", "type": "integer"},
        {"name": "string", "type": "string"},
        {"name": "boolean", "type": "boolean"},
    ]
    assert described.schema == {"fields": expected_fields}
def test_describe_resource_values_with_leading_zeros_issue_492():
    """The leading-zeros file is described as one integer column and reads as 1, 2, 3."""
    described = describe("data/leading-zeros.csv")
    expected_schema = {"fields": [{"name": "value", "type": "integer"}]}
    expected_rows = [{"value": number} for number in (1, 2, 3)]
    assert described.schema == expected_schema
    assert described.read_rows() == expected_rows
def test_describe_whitespace_cells_issue_7():
    """A column whose cells hold a single space is described as a string field."""
    csv_text = "header1,header2\n1, \n2, \n3, \n"
    described = describe(csv_text, scheme="text", format="csv")
    expected_fields = [
        {"name": "header1", "type": "integer"},
        {"name": "header2", "type": "string"},
    ]
    assert described.schema == {"fields": expected_fields}
def test_describe_resource_schema_utf8():
    """The utf8 table is described with integer id/age and string name."""
    described = describe("data/table-infer-utf8.csv")
    expected_fields = [
        {"name": "id", "type": "integer"},
        {"name": "age", "type": "integer"},
        {"name": "name", "type": "string"},
    ]
    assert described.schema == {"fields": expected_fields}
def test_validate_less_actual_fields_with_required_constraint_issue_950():
    """Adding a required field absent from the data reports a missing label and missing cells."""
    schema = describe("data/table.csv", type="schema")
    required_field = Field(name="bad", constraints={"required": True})
    schema.add_field(required_field)
    report = validate("data/table.csv", schema=schema)
    flattened = report.flatten(["rowPosition", "fieldPosition", "code"])
    assert flattened == [
        [None, 3, "missing-label"],
        [2, 3, "missing-cell"],
        [3, 3, "missing-cell"],
    ]
def test_describe_whitespace_cells_with_skip_initial_space_issue_7():
    """With ``skip_initial_space=True`` the space-only column is described as type 'any'."""
    csv_text = "header1,header2\n1, \n2, \n3, \n"
    skipping_dialect = CsvDialect(skip_initial_space=True)
    described = describe(
        csv_text,
        scheme="text",
        format="csv",
        dialect=skipping_dialect,
    )
    expected_fields = [
        {"name": "header1", "type": "integer"},
        {"name": "header2", "type": "any"},
    ]
    assert described.schema == {"fields": expected_fields}
def test_describe_package_with_incorrect_dialect_1126():
    """With a comma dialect, country-2 is described as a single string field."""
    comma_dialect = {"delimiter": ","}
    package = describe("data/country-2.csv", dialect=comma_dialect, type="package")
    expected_schema = {
        "fields": [{"type": "string", "name": "# Author: the scientist"}]
    }
    assert package.get_resource("country-2")["schema"] == expected_schema
def test_describe_resource_schema_with_missing_values_using_the_argument():
    """Passing ``infer_missing_values=["-"]`` gives an integer id and records the missing values."""
    described = describe(
        "data/table-infer-missing-values.csv",
        infer_missing_values=["-"],
    )
    expected_fields = [
        {"name": "id", "type": "integer"},
        {"name": "age", "type": "integer"},
        {"name": "name", "type": "string"},
    ]
    assert described.schema == {
        "fields": expected_fields,
        "missingValues": ["-"],
    }
def test_describe_resource_schema_expand():
    """With ``expand=True`` the schema carries format, bareNumber and missingValues entries."""
    described = describe("data/table-infer.csv", expand=True)
    expected_fields = [
        {"name": "id", "type": "integer", "format": "default", "bareNumber": True},
        {"name": "age", "type": "integer", "format": "default", "bareNumber": True},
        {"name": "name", "type": "string", "format": "default"},
    ]
    assert described.schema == {
        "fields": expected_fields,
        "missingValues": [""],
    }
def test_describe_non_tabular_html_issue_715():
    """An HTML file is described with the 'data-resource' profile and no schema."""
    described = describe("data/text.html")
    expected = {
        "path": "data/text.html",
        "name": "text",
        "profile": "data-resource",
        "scheme": "file",
        "format": "html",
        "hashing": "md5",
        "encoding": "utf-8",
    }
    assert described == expected
def test_describe_resource_schema_increase_limit_issue_212():
    """With ``infer_volume=200`` column a is integer and column b is number."""
    described = describe("data/table-infer-increase-limit.csv", infer_volume=200)
    expected_fields = [
        {"name": "a", "type": "integer"},
        {"name": "b", "type": "number"},
    ]
    assert described.schema == {"fields": expected_fields}
def test_describe_package():
    """Describing data/tables/chunk*.csv yields a valid package; its content is compared when IS_UNIX."""

    def id_name_schema():
        # Return a fresh mapping on every call so the expected resources
        # do not share nested objects.
        return {
            "fields": [
                {"name": "id", "type": "integer"},
                {"name": "name", "type": "string"},
            ]
        }

    package = describe("data/tables/chunk*.csv")
    assert package.metadata_valid
    # The full descriptor comparison is only performed when IS_UNIX is truthy.
    if not IS_UNIX:
        return
    expected_chunk1 = {
        "path": "data/tables/chunk1.csv",
        "profile": "tabular-data-resource",
        "name": "chunk1",
        "scheme": "file",
        "format": "csv",
        "hashing": "md5",
        "encoding": "utf-8",
        "schema": id_name_schema(),
    }
    expected_chunk2 = {
        "path": "data/tables/chunk2.csv",
        "profile": "tabular-data-resource",
        "name": "chunk2",
        "scheme": "file",
        "format": "csv",
        "hashing": "md5",
        "encoding": "utf-8",
        "schema": id_name_schema(),
    }
    assert package == {
        "profile": "data-package",
        "resources": [expected_chunk1, expected_chunk2],
    }
def test_describe_resource_schema_increase_limit_issue_212():
    """With a Detector of ``sample_size=200`` column a is integer and column b is number."""
    wide_detector = Detector(sample_size=200)
    described = describe(
        "data/table-infer-increase-limit.csv",
        detector=wide_detector,
    )
    expected_fields = [
        {"name": "a", "type": "integer"},
        {"name": "b", "type": "number"},
    ]
    assert described.schema == {"fields": expected_fields}
def test_describe_non_tabular_resource_issue_641():
    """A PDF described with ``stats=True`` has the 'data-resource' profile plus hash and byte count."""
    described = describe("data/document.pdf", stats=True)
    expected_stats = {
        "hash": "3a503daaa773a3ea32b1fedd9fece844",
        "bytes": 262443,
    }
    expected = {
        "path": "data/document.pdf",
        "name": "document",
        "profile": "data-resource",
        "scheme": "file",
        "format": "pdf",
        "hashing": "md5",
        "encoding": "utf-8",
        "stats": expected_stats,
    }
    assert described == expected
def test_describe_blank_cells_issue_7():
    """A column of empty cells in CSV bytes is described as type 'any'."""
    csv_bytes = b"header1,header2\n1,\n2,\n3,\n"
    described = describe(csv_bytes, format="csv")
    expected_fields = [
        {"name": "header1", "type": "integer"},
        {"name": "header2", "type": "any"},
    ]
    assert described.schema == {"fields": expected_fields}
def test_describe_resource():
    """Describing data/table.csv yields a valid descriptor with the inferred schema."""
    described = describe("data/table.csv")
    assert described.metadata_valid
    expected_schema = {
        "fields": [
            {"name": "id", "type": "integer"},
            {"name": "name", "type": "string"},
        ]
    }
    expected = {
        "profile": "tabular-data-resource",
        "name": "table",
        "path": "data/table.csv",
        "scheme": "file",
        "format": "csv",
        "hashing": "md5",
        "encoding": "utf-8",
        "schema": expected_schema,
    }
    assert described == expected
def test_describe_package_with_dialect_path_1126():
    """With the dialect given as a JSON path, country-2 is described with four typed fields."""
    package = describe(
        "data/country-2.csv",
        dialect="data/dialect.json",
        type="package",
    )
    expected_schema = {
        "fields": [
            {"type": "integer", "name": "id"},
            {"type": "integer", "name": "neighbor_id"},
            {"type": "string", "name": "name"},
            {"type": "integer", "name": "population"},
        ]
    }
    assert package.get_resource("country-2")["schema"] == expected_schema
def test_describe_package_basepath():
    """With ``basepath="data"`` each resource keeps a relative path and reports that basepath."""
    package = describe("chunk*.csv", basepath="data")
    expected_paths = {
        "chunk1": "chunk1.csv",
        "chunk2": "chunk2.csv",
    }
    # Check both paths first, then both basepaths.
    for name, relative_path in expected_paths.items():
        assert package.get_resource(name).path == relative_path
    for name in expected_paths:
        assert package.get_resource(name).basepath == "data"