def test_xlsx_parser_merged_cells_boolean():
    """Merged boolean cells yield the value once; merged-over cells are None."""
    # NOTE(review): reads an .xls file despite "xlsx" in the test name — confirm intent.
    no_header = Layout(header=False)
    with Resource("data/merged-cells-boolean.xls", layout=no_header) as resource:
        expected = [
            {"field1": True, "field2": None},
            {"field1": None, "field2": None},
            {"field1": None, "field2": None},
        ]
        assert resource.read_rows() == expected
def test_ods_parser_write(tmpdir):
    """Round-trip a CSV resource through an ODS file."""
    # NOTE: ezodf writer creates more cells than we ask (remove limits)
    csv_source = Resource("data/table.csv")
    ods_path = str(tmpdir.join("table.ods"))
    target = Resource(ods_path, layout=Layout(limit_fields=2, limit_rows=2))
    csv_source.write(target)
    with target:
        assert target.header == ["id", "name"]
        assert target.read_rows() == [
            {"id": 1, "name": "english"},
            {"id": 2, "name": "中国人"},
        ]
def test_sql_parser_headers_false(database_url):
    """With header disabled, the stored header row is read back as data."""
    sql_dialect = SqlDialect(table="table")
    no_header = Layout(header=False)
    with Resource(database_url, dialect=sql_dialect, layout=no_header) as resource:
        assert resource.header == ["id", "name"]
        rows = resource.read_rows()
        assert rows == [
            {"id": None, "name": "name"},
            {"id": 1, "name": "english"},
            {"id": 2, "name": "中国人"},
        ]
def test_xls_parser_merged_cells_fill():
    """fill_merged_cells duplicates the merged value into every spanned cell."""
    options = dict(
        dialect=ExcelDialect(fill_merged_cells=True),
        layout=Layout(header=False),
    )
    with Resource("data/merged-cells.xls", **options) as resource:
        filled_row = {"field1": "data", "field2": "data"}
        assert resource.read_rows() == [filled_row, filled_row, filled_row]
def test_resource_layout_header_xlsx_multiline():
    """Multi-row merged-cell headers are joined into single labels."""
    options = dict(
        dialect=ExcelDialect(fill_merged_cells=True),
        layout=Layout(header_rows=[1, 2, 3, 4, 5]),
    )
    with Resource("data/multiline-headers.xlsx", **options) as resource:
        header = resource.header
        assert header == [
            "Region",
            "Caloric contribution (%)",
            "Cumulative impact of changes on cost of food basket from previous quarter",
            "Cumulative impact of changes on cost of food basket from baseline (%)",
        ]
        assert resource.read_rows() == [dict(zip(header, ["A", "B", "C", "D"]))]
def test_resource_layout_header_inline_keyed_headers_is_none():
    """Keyed inline data with header=False exposes the keys as a data row."""
    records = [{"id": "1", "name": "english"}, {"id": "2", "name": "中国人"}]
    with Resource(records, layout=Layout(header=False)) as resource:
        assert resource.labels == []
        assert resource.header == ["field1", "field2"]
        assert resource.read_rows() == [
            {"field1": "id", "field2": "name"},
            {"field1": "1", "field2": "english"},
            {"field1": "2", "field2": "中国人"},
        ]
def test_resource_layout_skip_rows_regex():
    """skip_rows accepts literal strings and <regex>-prefixed patterns."""
    rows = [
        ["# comment"],
        ["name", "order"],
        ["# cat"],
        ["# dog"],
        ["John", 1],
        ["Alex", 2],
    ]
    skipping = Layout(skip_rows=["# comment", r"<regex># (cat|dog)"])
    with Resource(rows, layout=skipping) as resource:
        assert resource.header == ["name", "order"]
        assert resource.read_rows() == [
            {"name": "John", "order": 1},
            {"name": "Alex", "order": 2},
        ]
def test_xlsx_parser_preserve_formatting():
    """preserve_formatting returns cells as their displayed strings."""
    options = dict(
        dialect=ExcelDialect(preserve_formatting=True),
        layout=Layout(header_rows=[1]),
        infer_type="any",
    )
    with Resource("data/preserve-formatting.xlsx", **options) as resource:
        expected_row = {
            # general
            "empty": None,
            # numeric
            "0": "1001",
            "0.00": "1000.56",
            "0.0000": "1000.5577",
            "0.00000": "1000.55770",
            "0.0000#": "1000.5577",
            # temporal
            "m/d/yy": "5/20/40",
            "d-mmm": "20-May",
            "mm/dd/yy": "05/20/40",
            "mmddyy": "052040",
            "mmddyyam/pmdd": "052040AM20",
        }
        assert resource.read_rows() == [expected_row]
def test_resource_layout_skip_rows_preset():
    """The <blank> preset drops only fully-blank rows."""
    rows = [
        ["name", "order"],
        ["", ""],
        [],
        ["Ray", 0],
        ["John", 1],
        ["Alex", 2],
        ["", 3],
        [None, 4],
        ["", None],
    ]
    with Resource(rows, layout=Layout(skip_rows=["<blank>"])) as resource:
        assert resource.header == ["name", "order"]
        assert resource.read_rows() == [
            {"name": "Ray", "order": 0},
            {"name": "John", "order": 1},
            {"name": "Alex", "order": 2},
            {"name": None, "order": 3},
            {"name": None, "order": 4},
        ]
def test_validate_order_fields_issue_313():
    """schema_sync should align a provided schema with the source fields (issue #313)."""
    schema = {
        "fields": [
            {"name": "Column_1", "type": "string"},
            {"name": "Column_2", "type": "string", "constraints": {"required": True}},
            {"name": "Column_3", "type": "string"},
            {"name": "Column_4", "type": "string"},
            {"name": "Column_5", "type": "string"},
        ]
    }
    resource = Resource(
        "data/issue-313.xlsx",
        layout=Layout(pick_fields=[1, 2, 3, 4, 5]),
        schema=schema,
        detector=Detector(schema_sync=True),
    )
    assert resource.validate().valid
def test_validate_layout_number():
    """The header can be taken from an arbitrary row via header_rows."""
    report = validate("data/matrix.csv", layout=Layout(header_rows=[2]))
    assert report.task.resource.header == ["11", "12", "13", "14"]
    assert report.valid
def test_package_resources_respect_layout_set_after_creation_issue_503():
    """A layout assigned after package creation must be honored (issue #503)."""
    pkg = Package(resources=[Resource(path="data/table.csv")])
    res = pkg.get_resource("table")
    res.layout = Layout(limit_rows=1)
    assert res.read_rows() == [{"id": 1, "name": "english"}]
    assert res.header == ["id", "name"]
def test_validate_offset_rows():
    """offset_rows skips leading data rows after the header."""
    report = validate("data/matrix.csv", layout=Layout(offset_rows=3))
    resource = report.task.resource
    assert resource.header == ["f1", "f2", "f3", "f4"]
    assert resource.stats["rows"] == 1
    assert report.task.valid
def test_validate_layout_structure_errors_with_limit_rows():
    """Structure errors within the limited window are still reported."""
    report = validate("data/structure-errors.csv", layout=Layout(limit_rows=3))
    flattened = report.flatten(["rowPosition", "fieldPosition", "code"])
    assert flattened == [[4, None, "blank-row"]]
def test_validate_layout_limit_and_offset_rows():
    """limit_rows and offset_rows combine: skip one row, then keep two."""
    report = validate("data/matrix.csv", layout=Layout(limit_rows=2, offset_rows=1))
    resource = report.task.resource
    assert resource.header == ["f1", "f2", "f3", "f4"]
    assert resource.stats["rows"] == 2
    assert report.task.valid
def test_validate_layout_skip_rows_and_fields():
    """Rows and fields can be skipped by position or by value/name."""
    layout = Layout(skip_rows=[2, "41"], skip_fields=[1, "f4"])
    report = validate("data/matrix.csv", layout=layout)
    resource = report.task.resource
    assert resource.header == ["f2", "f3"]
    assert resource.stats["rows"] == 2
    assert report.task.valid
def test_validate_layout_skip_rows_blank():
    """The <blank> preset makes blank rows validate cleanly."""
    report = validate("data/blank-rows.csv", layout=Layout(skip_rows=["<blank>"]))
    resource = report.task.resource
    assert resource.header == ["id", "name", "age"]
    assert resource.stats["rows"] == 2
    assert report.task.valid
def test_validate_layout_skip_rows_regex():
    """A <regex> skip pattern drops every matching row."""
    report = validate("data/matrix.csv", layout=Layout(skip_rows=["<regex>[14]1"]))
    resource = report.task.resource
    assert resource.header == ["f1", "f2", "f3", "f4"]
    assert resource.stats["rows"] == 2
    assert report.task.valid
def test_validate_layout_pick_rows():
    """pick_rows keeps only the rows matching by position or value."""
    report = validate("data/matrix.csv", layout=Layout(pick_rows=[1, 3, "31"]))
    resource = report.task.resource
    assert resource.header == ["f1", "f2", "f3", "f4"]
    assert resource.stats["rows"] == 2
    assert report.task.valid
def load_resource_via_api(resource_dict, ckan_api_key, ckan_site_url):
    """Stream a tabular resource into the CKAN DataStore in fixed-size chunks.

    Reads ``resource_dict['path']`` with frictionless, pushing up to 16384
    rows per ``datastore_upsert`` request until the source is exhausted.
    Progress, completion, and failure are reported to CKAN via
    ``aircan_status_update``.

    Returns:
        dict: ``{'success': True}`` once every row was pushed;
        ``{'success': False}`` after any exception has been reported.
    """
    logging.info("Loading resource via API lib")
    try:
        offset_rows = 0
        row_chunk = 16384  # rows read and pushed per datastore_upsert request
        # Push data until records is empty
        while True:
            # First pass reads from the start; later passes skip the rows
            # already pushed so far.
            if offset_rows == 0:
                layout = Layout(limit_rows=row_chunk)
            else:
                layout = Layout(limit_rows=row_chunk, offset_rows=offset_rows)
            with Resource(resource_dict['path'], layout=layout) as resource:
                records = [
                    row.to_dict(json=True)
                    for row in resource.row_stream
                ]
                if not records:
                    # Source exhausted: report completion and stop looping.
                    status_dict = {
                        'res_id': resource_dict['ckan_resource_id'],
                        'state': 'complete',
                        'message': 'Successfully pushed {0} entries to "{1}"'.format(
                            offset_rows, resource_dict['ckan_resource_id'])
                    }
                    aircan_status_update(ckan_site_url, ckan_api_key, status_dict)
                    return {'success': True}
                else:
                    offset_rows += len(records)
                    payload = {
                        'resource_id': resource_dict['ckan_resource_id'],
                        'force': True,
                        'records': records,
                        'method': 'insert'
                    }
                    url = urljoin(ckan_site_url, '/api/3/action/datastore_upsert')
                    response = requests.post(
                        url,
                        data=json.dumps(payload, cls=DatastoreEncoder),
                        headers={
                            'Content-Type': 'application/json',
                            'Authorization': ckan_api_key
                        })
                    # Raises requests.HTTPError for 4xx/5xx responses; handled
                    # by the outer except below.
                    response.raise_for_status()
                    if response.status_code == 200:
                        # Per-chunk progress update (state 'complete' mirrors
                        # the final message format).
                        status_dict = {
                            'res_id': resource_dict['ckan_resource_id'],
                            'state': 'complete',
                            'message': 'Pushed {0} entries of records.'.format(
                                offset_rows)
                        }
                        aircan_status_update(ckan_site_url, ckan_api_key, status_dict)
                    else:
                        # NOTE(review): only reachable for non-200 success
                        # codes (e.g. 201), since raise_for_status() already
                        # raised on error statuses — confirm this is intended.
                        raise requests.HTTPError(
                            'Failed to make request on CKAN API.')
    except Exception as err:
        # Boundary handler: report the failure to CKAN and return a failure
        # marker instead of propagating the exception to the caller.
        status_dict = {
            'res_id': resource_dict['ckan_resource_id'],
            'state': 'error',
            'message': 'Failed to push data into datastore DB.',
            'error': str(err)
        }
        aircan_status_update(ckan_site_url, ckan_api_key, status_dict)
        return {"success": False}
def test_validate_layout_list_of_numbers_and_headers_join():
    """Multiple header rows are joined with the given separator."""
    joined = Layout(header_rows=[2, 3, 4], header_join=".")
    report = validate("data/matrix.csv", layout=joined)
    assert report.task.resource.header == [
        "11.21.31",
        "12.22.32",
        "13.23.33",
        "14.24.34",
    ]
    assert report.valid
def test_validate_layout_list_of_numbers():
    """Multiple header rows default to space-joined labels."""
    report = validate("data/matrix.csv", layout=Layout(header_rows=[2, 3, 4]))
    assert report.task.resource.header == [
        "11 21 31",
        "12 22 32",
        "13 23 33",
        "14 24 34",
    ]
    assert report.valid
def test_resource_stats_rows_significant():
    """Row-count stats stay exact on a large (1MB) file."""
    with Resource("data/table-1MB.csv", layout=Layout(header=False)) as resource:
        # Consume the whole stream so stats are populated.
        print(resource.read_rows())
        assert resource.stats["rows"] == 10000
def test_validate_layout_pick_fields_regex():
    """A <regex> pick pattern keeps only matching field names."""
    report = validate("data/matrix.csv", layout=Layout(pick_fields=["<regex>f[23]"]))
    resource = report.task.resource
    assert resource.header == ["f2", "f3"]
    assert resource.stats["rows"] == 4
    assert report.task.valid
def test_validate_layout_offset_fields():
    """offset_fields drops the given number of leading columns."""
    report = validate("data/matrix.csv", layout=Layout(offset_fields=3))
    resource = report.task.resource
    assert resource.header == ["f4"]
    assert resource.stats["rows"] == 4
    assert report.task.valid
def test_resource_layout_respect_set_after_creation_issue_503():
    """A layout assigned after resource creation must be honored (issue #503)."""
    res = Resource(path="data/table.csv")
    res.layout = Layout(limit_rows=1)
    assert res.read_rows() == [{"id": 1, "name": "english"}]
    assert res.header == ["id", "name"]
def test_describe_resource_schema_check_type_boolean_string_tie():
    """A boolean/string tie during type inference resolves to string."""
    resource = describe(
        [["f"], ["stringish"]],
        layout=Layout(header=False),
        detector=Detector(field_names=["field"]),
    )
    assert resource.schema.get_field("field").type == "string"