def test_xlsx_parser_merged_cells_boolean():
    """Merged boolean cells: only the top-left cell keeps its value, the rest are None."""
    # NOTE(review): fixture is .xls although the test name says xlsx — confirm intent
    path = "data/merged-cells-boolean.xls"
    layout = Layout(header=False)
    with Resource(path, layout=layout) as resource:
        assert resource.read_rows() == [
            {"field1": True, "field2": None},
            {"field1": None, "field2": None},
            {"field1": None, "field2": None},
        ]
def test_csv_parser_escaping():
    """A backslash escape_char lets embedded double quotes survive CSV parsing."""
    dialect = CsvDialect(escape_char="\\")
    with Resource("data/escaping.csv", dialect=dialect) as resource:
        assert resource.header == ["ID", "Test"]
        assert resource.read_rows() == [
            {"ID": 1, "Test": "Test line 1"},
            {"ID": 2, "Test": 'Test " line 2'},
            {"ID": 3, "Test": 'Test " line 3'},
        ]
def test_inline_parser_from_generator_not_callable():
    """An already-instantiated generator (not the generator function) works as inline data."""

    def generator():
        yield ["id", "name"]
        yield ["1", "english"]
        yield ["2", "中国人"]

    # Pass the generator object itself, not the callable
    with Resource(generator()) as resource:
        assert resource.header == ["id", "name"]
        assert resource.read_rows() == [
            {"id": 1, "name": "english"},
            {"id": 2, "name": "中国人"},
        ]
def test_xlsx_parser_preserve_formatting_percentage():
    """With preserve_formatting, percentage cells are read as formatted strings."""
    dialect = ExcelDialect(preserve_formatting=True)
    path = "data/preserve-formatting-percentage.xlsx"
    with Resource(path, dialect=dialect) as resource:
        assert resource.read_rows() == [
            {"col1": 123, "col2": "52.00%"},
            {"col1": 456, "col2": "30.00%"},
            {"col1": 789, "col2": "6.00%"},
        ]
def test_step_row_sort_with_reverse_in_desriptor_issue_996():
    """row_sort accepts a descriptor dict with a "reverse" flag (issue #996)."""
    # NOTE(review): "desriptor" typo kept in the name — pytest discovers tests by name
    source = Resource("data/transform.csv")
    target = transform(
        source,
        steps=[
            steps.row_sort({"fieldNames": ["id"], "reverse": True}),
        ],
    )
    assert target.schema == {
        "fields": [
            {"name": "id", "type": "integer"},
            {"name": "name", "type": "string"},
            {"name": "population", "type": "integer"},
        ]
    }
    # Rows come back in descending id order
    assert target.read_rows() == [
        {"id": 3, "name": "spain", "population": 47},
        {"id": 2, "name": "france", "population": 66},
        {"id": 1, "name": "germany", "population": 83},
    ]
def test_step_table_recast():
    """table_melt followed by table_recast round-trips back to the original table."""
    source = Resource("data/transform.csv")
    target = transform(
        source,
        steps=[
            steps.table_normalize(),
            steps.table_melt(field_name="id"),
            steps.table_recast(field_name="id"),
        ],
    )
    assert target.schema == {
        "fields": [
            {"name": "id", "type": "integer"},
            {"name": "name", "type": "string"},
            {"name": "population", "type": "integer"},
        ]
    }
    assert target.read_rows() == [
        {"id": 1, "name": "germany", "population": 83},
        {"id": 2, "name": "france", "population": 66},
        {"id": 3, "name": "spain", "population": 47},
    ]
def test_step_cell_fill_direction_left():
    """cell_fill(direction="left") copies a value into the None cell from its right neighbor."""
    source = Resource(path="data/transform.csv")
    target = transform(
        source,
        steps=[
            # Cast to strings so a filled value stays type-valid in any column
            steps.field_update(name="id", type="string"),
            steps.field_update(name="population", type="string"),
            # Blank out "france", then fill leftward
            steps.cell_replace(pattern="france", replace=None),
            steps.cell_fill(direction="left"),
        ],
    )
    assert target.schema == {
        "fields": [
            {"name": "id", "type": "string"},
            {"name": "name", "type": "string"},
            {"name": "population", "type": "string"},
        ]
    }
    assert target.read_rows() == [
        {"id": "1", "name": "germany", "population": "83"},
        {"id": "2", "name": "66", "population": "66"},
        {"id": "3", "name": "spain", "population": "47"},
    ]
def test_step_cell_replace_using_regex():
    """A "<regex>" prefixed pattern is applied as a regular expression to one field."""
    source = Resource(path="data/transform.csv")
    target = transform(
        source,
        steps=[
            steps.cell_replace(pattern="<regex>.*r.*", replace="center", field_name="name"),
        ],
    )
    assert target.schema == {
        "fields": [
            {"name": "id", "type": "integer"},
            {"name": "name", "type": "string"},
            {"name": "population", "type": "integer"},
        ]
    }
    # "germany" and "france" contain "r" and are replaced; "spain" is not
    assert target.read_rows() == [
        {"id": 1, "name": "center", "population": 83},
        {"id": 2, "name": "center", "population": 66},
        {"id": 3, "name": "spain", "population": 47},
    ]
def test_step_cell_replace_with_field_name():
    """cell_replace scoped via field_name leaves other fields untouched."""
    source = Resource(path="data/transform.csv")
    target = transform(
        source,
        steps=[
            # Pattern "france" can never match in the "id" field, so nothing changes
            steps.cell_replace(pattern="france", replace="FRANCE", field_name="id"),
        ],
    )
    assert target.schema == {
        "fields": [
            {"name": "id", "type": "integer"},
            {"name": "name", "type": "string"},
            {"name": "population", "type": "integer"},
        ]
    }
    assert target.read_rows() == [
        {"id": 1, "name": "germany", "population": 83},
        {"id": 2, "name": "france", "population": 66},
        {"id": 3, "name": "spain", "population": 47},
    ]
def test_validate_custom_check():
    """A user-defined Check subclass participates in resource validation."""

    # Create check
    class custom(Check):
        # Flag every row as blank to prove the per-row hook is invoked
        def validate_row(self, row):
            yield errors.BlankRowError(
                note="",
                cells=list(map(str, row.values())),
                row_number=row.row_number,
                row_position=row.row_position,
            )

    # Validate resource
    report = Resource("data/table.csv").validate(checks=[custom()])
    assert report.flatten(["rowPosition", "fieldPosition", "code"]) == [
        [2, None, "blank-row"],
        [3, None, "blank-row"],
    ]
def test_resource_infer_source_non_tabular():
    """infer() on a non-tabular file fills in file-level metadata and zero rows."""
    resource = Resource(path="data/text.txt")
    resource.infer()
    # Removed leftover debug print of resource.metadata_errors
    assert resource.metadata_valid
    assert resource == {
        "name": "text",
        "path": "data/text.txt",
        "hash": "e1cbb0c3879af8347246f12c559a86b5",
        "bytes": 5,
        "rows": 0,
        "profile": "data-resource",
        "scheme": "file",
        "format": "txt",
        "hashing": "md5",
        "encoding": "utf-8",
        "compression": "no",
        "compressionPath": "",
    }
def test_step_cell_convert():
    """cell_convert with a constant value overwrites every cell in the table."""
    source = Resource(path="data/transform.csv")
    target = transform(
        source,
        steps=[
            # Cast numeric fields to string so "n/a" is valid everywhere
            steps.field_update(name="id", type="string"),
            steps.field_update(name="population", type="string"),
            steps.cell_convert(value="n/a"),
        ],
    )
    assert target.schema == {
        "fields": [
            {"name": "id", "type": "string"},
            {"name": "name", "type": "string"},
            {"name": "population", "type": "string"},
        ]
    }
    expected_row = {"id": "n/a", "name": "n/a", "population": "n/a"}
    assert target.read_rows() == [expected_row, expected_row, expected_row]
def test_step_field_remove():
    """field_remove drops the named fields from schema and rows."""
    source = Resource(path="data/transform.csv")
    target = transform(
        source,
        steps=[
            steps.field_remove(names=["id"]),
        ],
    )
    assert target.schema == {
        "fields": [
            {"name": "name", "type": "string"},
            {"name": "population", "type": "integer"},
        ]
    }
    assert target.read_rows() == [
        {"name": "germany", "population": 83},
        {"name": "france", "population": 66},
        {"name": "spain", "population": 47},
    ]
def test_step_field_filter():
    """field_filter keeps only the named fields, discarding the rest."""
    source = Resource(path="data/transform.csv")
    target = transform(
        source,
        steps=[
            steps.field_filter(names=["id", "name"]),
        ],
    )
    assert target.schema == {
        "fields": [
            {"name": "id", "type": "integer"},
            {"name": "name", "type": "string"},
        ]
    }
    assert target.read_rows() == [
        {"id": 1, "name": "germany"},
        {"id": 2, "name": "france"},
        {"id": 3, "name": "spain"},
    ]
def test_table_format_tsv():
    """TSV files are detected with a tab delimiter; "\\N" maps to a missing value."""
    detector = Detector(schema_patch={"missingValues": ["\\N"]})
    with Resource("data/table.tsv", detector=detector) as resource:
        assert resource.dialect == {"delimiter": "\t"}
        assert resource.header == ["id", "name"]
        assert resource.read_rows() == [
            {"id": 1, "name": "english"},
            {"id": 2, "name": "中国人"},
            {"id": 3, "name": None},
        ]
def test_xls_parser_merged_cells_fill():
    """fill_merged_cells duplicates the merged value into every covered cell."""
    path = "data/merged-cells.xls"
    dialect = ExcelDialect(fill_merged_cells=True)
    layout = Layout(header=False)
    with Resource(path, dialect=dialect, layout=layout) as resource:
        filled_row = {"field1": "data", "field2": "data"}
        assert resource.read_rows() == [filled_row, filled_row, filled_row]
def test_analyze_resource_detailed_numeric_descriptive_statistics_with_missingValues():
    """Detailed analysis reports descriptive statistics for a numeric field with missing values."""
    resource = Resource({"path": "data/analysis-data.csv"})
    analysis = resource.analyze(detailed=True)
    stats = analysis["fieldStats"]["average_grades"]
    assert stats["bounds"] == [81, 96]
    assert stats["max"] == 10000.0
    assert stats["mean"] == 1503.28
    assert stats["median"] == 86.91
    assert stats["min"] == 84.65
    assert stats["missingValues"] == 2
    assert stats["mode"] == 86.79
    assert stats["quantiles"] == [86.79, 86.91, 90.39]
    assert round(stats["stdev"]) == 3747
    assert stats["uniqueValues"] == 6
    assert round(stats["variance"]) == 14037774
    # The single 10000.0 value lies outside the bounds and is flagged
    assert stats["outliers"] == [10000.0]
def test_resource_source_multipart():
    """A list-valued path makes the resource multipart; chunks are read as one table."""
    descriptor = {
        "path": ["chunk1.csv", "chunk2.csv"],
        "schema": "resource-schema.json",
    }
    resource = Resource(descriptor, basepath="data")
    assert resource.inline is False
    assert resource.multipart is True
    assert resource.tabular is True
    assert resource.read_rows() == [
        {"id": 1, "name": "english"},
        {"id": 2, "name": "中国人"},
    ]
def test_sql_parser_headers_false():
    """With header=False the SQL column names still label fields; the first row is data."""
    dialect = SqlDialect(table="table")
    layout = Layout(header=False)
    with Resource(database_url, dialect=dialect, layout=layout) as resource:
        assert resource.header == ["id", "name"]
        assert resource.read_rows() == [
            {"id": None, "name": "name"},
            {"id": 1, "name": "english"},
            {"id": 2, "name": "中国人"},
        ]
def test_step_cell_interpolate():
    """cell_interpolate formats every cell through a %s template."""
    source = Resource(path="data/transform.csv")
    target = transform(
        source,
        steps=[
            # String-typed fields so the interpolated text is schema-valid
            steps.field_update(name="id", type="string"),
            steps.field_update(name="population", type="string"),
            steps.cell_interpolate(template="Prefix: %s"),
        ],
    )
    assert target.schema == {
        "fields": [
            {"name": "id", "type": "string"},
            {"name": "name", "type": "string"},
            {"name": "population", "type": "string"},
        ]
    }
    assert target.read_rows() == [
        {"id": "Prefix: 1", "name": "Prefix: germany", "population": "Prefix: 83"},
        {"id": "Prefix: 2", "name": "Prefix: france", "population": "Prefix: 66"},
        {"id": "Prefix: 3", "name": "Prefix: spain", "population": "Prefix: 47"},
    ]
def test_step_row_subset_distinct_with_duplicates():
    """After forcing duplicate ids, distinct on "id" keeps only the first row."""
    source = Resource("data/transform.csv")
    target = transform(
        source,
        steps=[
            # Every row now has id == 1, so they are duplicates by "id"
            steps.field_update(name="id", value=1),
            steps.row_subset(subset="distinct", field_name="id"),
        ],
    )
    assert target.schema == {
        "fields": [
            {"name": "id", "type": "integer"},
            {"name": "name", "type": "string"},
            {"name": "population", "type": "integer"},
        ]
    }
    assert target.read_rows() == [
        {"id": 1, "name": "germany", "population": 83},
    ]
def test_step_table_melt_with_to_field_names():
    """table_melt lets to_field_names rename the variable/value output columns."""
    source = Resource("data/transform.csv")
    target = transform(
        source,
        steps=[
            steps.table_normalize(),
            steps.table_melt(
                field_name="name",
                variables=["population"],
                to_field_names=["key", "val"],
            ),
        ],
    )
    # Melted columns carry no declared type
    assert target.schema == {
        "fields": [
            {"name": "name", "type": "string"},
            {"name": "key"},
            {"name": "val"},
        ]
    }
    assert target.read_rows() == [
        {"name": "germany", "key": "population", "val": 83},
        {"name": "france", "key": "population", "val": 66},
        {"name": "spain", "key": "population", "val": 47},
    ]
def test_step_field_pack_object_907():
    """field_pack with field_type="object" packs fields into a dict, preserving originals (issue #907)."""
    source = Resource("data/transform.csv")
    target = transform(
        source,
        steps=[
            steps.field_pack(
                name="details",
                from_names=["name", "population"],
                field_type="object",
                preserve=True,
            )
        ],
    )
    assert target.schema == {
        "fields": [
            {"name": "id", "type": "integer"},
            {"name": "name", "type": "string"},
            {"name": "population", "type": "integer"},
            {"name": "details", "type": "object"},
        ]
    }
    # Packed values are stringified inside the object
    assert target.read_rows()[0] == {
        "id": 1,
        "name": "germany",
        "population": 83,
        "details": {"name": "germany", "population": "83"},
    }
def test_step_row_filter_petl_selectisinstance():
    """row_filter with an isinstance predicate keeps all rows whose id is an int."""
    source = Resource("data/transform.csv")
    target = transform(
        source,
        steps=[
            steps.table_normalize(),
            steps.row_filter(function=lambda row: isinstance(row["id"], int)),
        ],
    )
    assert target.schema == {
        "fields": [
            {"name": "id", "type": "integer"},
            {"name": "name", "type": "string"},
            {"name": "population", "type": "integer"},
        ]
    }
    # All ids are ints after normalization, so every row passes
    assert target.read_rows() == [
        {"id": 1, "name": "germany", "population": 83},
        {"id": 2, "name": "france", "population": 66},
        {"id": 3, "name": "spain", "population": 47},
    ]
def test_step_row_split():
    """row_split duplicates each row once per fragment of the split field."""
    source = Resource(path="data/transform.csv")
    source.infer()
    target = transform(
        source,
        steps=[
            steps.row_split(field_name="name", pattern="a"),
        ],
    )
    # Splitting does not alter the schema
    assert target.schema == source.schema
    assert target.read_rows() == [
        {"id": 1, "name": "germ", "population": 83},
        {"id": 1, "name": "ny", "population": 83},
        {"id": 2, "name": "fr", "population": 66},
        {"id": 2, "name": "nce", "population": 66},
        {"id": 3, "name": "sp", "population": 47},
        {"id": 3, "name": "in", "population": 47},
    ]
def test_step_table_intersect_from_dict():
    """table_intersect accepts a plain dict descriptor and keeps only matching rows."""
    source = Resource("data/transform.csv")
    other = dict(data=[
        ["id", "name", "population"],
        [1, "germany", 83],
        [2, "france", 50],
        [3, "spain", 47],
    ])
    target = transform(
        source,
        steps=[
            steps.table_normalize(),
            steps.table_intersect(resource=other),
        ],
    )
    assert target.schema == {
        "fields": [
            {"name": "id", "type": "integer"},
            {"name": "name", "type": "string"},
            {"name": "population", "type": "integer"},
        ]
    }
    # France differs in population (66 vs 50), so it is excluded
    assert target.read_rows() == [
        {"id": 1, "name": "germany", "population": 83},
        {"id": 3, "name": "spain", "population": 47},
    ]
def test_resource_layout_header_xlsx_multiline():
    """Multiple header_rows plus fill_merged_cells join multi-line xlsx headers into labels."""
    path = "data/multiline-headers.xlsx"
    dialect = ExcelDialect(fill_merged_cells=True)
    layout = Layout(header_rows=[1, 2, 3, 4, 5])
    with Resource(path, dialect=dialect, layout=layout) as resource:
        header = resource.header
        assert header == [
            "Region",
            "Caloric contribution (%)",
            "Cumulative impact of changes on cost of food basket from previous quarter",
            "Cumulative impact of changes on cost of food basket from baseline (%)",
        ]
        assert resource.read_rows() == [
            {header[0]: "A", header[1]: "B", header[2]: "C", header[3]: "D"},
        ]
def test_resource_layout_header_inline_keyed_headers_is_none():
    """With header=False, keyed inline data gets synthetic field names and keys become a data row."""
    data = [{"id": "1", "name": "english"}, {"id": "2", "name": "中国人"}]
    layout = Layout(header=False)
    with Resource(data, layout=layout) as resource:
        assert resource.labels == []
        assert resource.header == ["field1", "field2"]
        # The original dict keys surface as the first data row
        assert resource.read_rows() == [
            {"field1": "id", "field2": "name"},
            {"field1": "1", "field2": "english"},
            {"field1": "2", "field2": "中国人"},
        ]
def test_resource_open_read_rows():
    """Reading rows from an open resource exposes header and per-row metadata."""
    with Resource("data/table.csv") as resource:
        headers = resource.header
        row1, row2 = resource.read_rows()

        # Header metadata
        assert headers == ["id", "name"]
        assert headers.field_positions == [1, 2]
        assert headers.errors == []
        assert headers.valid is True

        # First data row (position 2 in the file, number 1 among data rows)
        assert row1.to_dict() == {"id": 1, "name": "english"}
        assert row1.field_positions == [1, 2]
        assert row1.row_position == 2
        assert row1.row_number == 1
        assert row1.errors == []
        assert row1.valid is True

        # Second data row
        assert row2.to_dict() == {"id": 2, "name": "中国人"}
        assert row2.field_positions == [1, 2]
        assert row2.row_position == 3
        assert row2.row_number == 2
        assert row2.errors == []
        assert row2.valid is True
def test_step_row_subset_distinct():
    """distinct on "id" is a no-op when every id is already unique."""
    source = Resource("data/transform.csv")
    target = transform(
        source,
        steps=[
            steps.row_subset(subset="distinct", field_name="id"),
        ],
    )
    assert target.schema == {
        "fields": [
            {"name": "id", "type": "integer"},
            {"name": "name", "type": "string"},
            {"name": "population", "type": "integer"},
        ]
    }
    assert target.read_rows() == [
        {"id": 1, "name": "germany", "population": 83},
        {"id": 2, "name": "france", "population": 66},
        {"id": 3, "name": "spain", "population": 47},
    ]