def test_transform_rename_move_field_issue_953():
    """Renaming a field and then moving it updates both the schema and the rows."""
    result = transform(
        data=[
            {"id": 1, "name": "germany", "population": 83},
            {"id": 2, "name": "france", "population": 66},
            {"id": 3, "name": "spain", "population": 47},
        ],
        steps=[
            steps.table_normalize(),
            steps.field_update(name="name", new_name="country"),
            steps.field_move(name="country", position=3),
        ],
    )
    expected_fields = [
        {"name": "id", "type": "integer"},
        {"name": "population", "type": "integer"},
        {"name": "country", "type": "string"},
    ]
    assert result.schema == {"fields": expected_fields}
    assert result.read_rows() == [
        {"id": 1, "population": 83, "country": "germany"},
        {"id": 2, "population": 66, "country": "france"},
        {"id": 3, "population": 47, "country": "spain"},
    ]
def test_step_resource_add():
    """Removing a resource and re-adding it by path keeps the package intact."""
    package = Package("data/package/datapackage.json")
    package.infer()
    result = transform(
        package,
        steps=[
            steps.resource_remove(name="data2"),
            steps.resource_add(name="data2", path="data2.csv"),
        ],
    )
    assert result.resource_names == ["data", "data2"]
    rows = result.get_resource("data2").read_rows()
    assert rows == [
        {"parent": "A3001", "comment": "comment1"},
        {"parent": "A3001", "comment": "comment2"},
        {"parent": "A5032", "comment": "comment3"},
    ]
def test_step_table_transpose():
    """Transposing swaps the data's rows and columns."""
    resource = Resource(path="data/transform.csv")
    result = transform(
        resource,
        steps=[
            steps.table_normalize(),
            steps.table_transpose(),
        ],
    )
    assert result.schema == {
        "fields": [
            {"name": "name", "type": "string"},
            {"name": "germany", "type": "integer"},
            {"name": "france", "type": "integer"},
            {"name": "spain", "type": "integer"},
        ]
    }
    assert result.read_rows() == [
        {"name": "population", "germany": 83, "france": 66, "spain": 47},
    ]
def test_step_row_filter_petl_selectrangeclosed():
    """An open-interval formula keeps only rows strictly inside the range."""
    result = transform(
        Resource("data/transform.csv"),
        steps=[
            steps.table_normalize(),
            steps.row_filter(formula="1 < id < 3"),
        ],
    )
    assert result.schema == {
        "fields": [
            {"name": "id", "type": "integer"},
            {"name": "name", "type": "string"},
            {"name": "population", "type": "integer"},
        ]
    }
    assert result.read_rows() == [
        {"id": 2, "name": "france", "population": 66},
    ]
def test_step_row_filter_petl_selectnone():
    """Filtering on `id is None` matches nothing when every id is populated."""
    result = transform(
        Resource("data/transform.csv"),
        steps=[
            steps.row_filter(formula="id is None"),
        ],
    )
    assert result.schema == {
        "fields": [
            {"name": "id", "type": "integer"},
            {"name": "name", "type": "string"},
            {"name": "population", "type": "integer"},
        ]
    }
    assert result.read_rows() == []
def test_step_row_filter_petl_rowlenselect():
    """Filtering by a callable on row length keeps every 3-column row.

    Fixed: the callable was passed via the deprecated ``predicat=`` keyword;
    the sibling callable-filter test in this file uses ``function=``, which is
    the current frictionless ``row_filter`` parameter name — use it here too
    for consistency.
    """
    source = Resource(path="data/transform.csv")
    source.infer()
    target = transform(
        source,
        steps=[
            # Every row in the fixture has exactly 3 cells, so all pass.
            steps.row_filter(function=lambda row: len(row) == 3),
        ],
    )
    assert target.schema == source.schema
    assert target.read_rows() == [
        {"id": 1, "name": "germany", "population": 83},
        {"id": 2, "name": "france", "population": 66},
        {"id": 3, "name": "spain", "population": 47},
    ]
def test_step_row_subset_duplicates_with_name():
    """After forcing every id to 1, the duplicates subset keeps all rows."""
    source = Resource(path="data/transform.csv")
    source.infer()
    result = transform(
        source,
        steps=[
            steps.field_update(name="id", value=1),
            steps.row_subset(subset="duplicates", field_name="id"),
        ],
    )
    assert result.schema == source.schema
    assert result.read_rows() == [
        {"id": 1, "name": "germany", "population": 83},
        {"id": 1, "name": "france", "population": 66},
        {"id": 1, "name": "spain", "population": 47},
    ]
def test_step_row_subset_unique_with_name():
    """After forcing every id to 1, no row is unique on that field."""
    result = transform(
        Resource("data/transform.csv"),
        steps=[
            steps.field_update(name="id", value=1),
            steps.row_subset(subset="unique", field_name="id"),
        ],
    )
    assert result.schema == {
        "fields": [
            {"name": "id", "type": "integer"},
            {"name": "name", "type": "string"},
            {"name": "population", "type": "integer"},
        ]
    }
    assert result.read_rows() == []
def test_step_table_diff_with_ignore_order():
    """table_diff with ignore_order keeps only rows absent from the other table."""
    source = Resource(path="data/transform.csv")
    source.infer(only_sample=True)
    other = Resource(data=[
        ["name", "id", "population"],
        ["germany", 1, 83],
        ["france", 2, 50],
        ["spain", 3, 47],
    ])
    result = transform(
        source,
        steps=[
            steps.table_normalize(),
            steps.table_diff(resource=other, ignore_order=True),
        ],
    )
    assert result.schema == source.schema
    # Only france differs (population 66 vs 50 in the comparison table).
    assert result.read_rows() == [
        {"id": 2, "name": "france", "population": 66},
    ]
def test_step_row_subset_distinct_with_duplicates():
    """With all ids forced to 1, distinct-on-id keeps only the first row."""
    result = transform(
        Resource("data/transform.csv"),
        steps=[
            steps.field_update(name="id", value=1),
            steps.row_subset(subset="distinct", field_name="id"),
        ],
    )
    assert result.schema == {
        "fields": [
            {"name": "id", "type": "integer"},
            {"name": "name", "type": "string"},
            {"name": "population", "type": "integer"},
        ]
    }
    assert result.read_rows() == [
        {"id": 1, "name": "germany", "population": 83},
    ]
def test_step_row_subset_duplicates():
    """The duplicates subset is empty when every row is distinct."""
    result = transform(
        Resource("data/transform.csv"),
        steps=[
            steps.row_subset(subset="duplicates"),
        ],
    )
    assert result.schema == {
        "fields": [
            {"name": "id", "type": "integer"},
            {"name": "name", "type": "string"},
            {"name": "population", "type": "integer"},
        ]
    }
    assert result.read_rows() == []
def test_step_row_subset_conflicts_from_descriptor_issue_996():
    """row_subset also accepts a raw descriptor dict (camelCase keys)."""
    descriptor = {"subset": "conflicts", "fieldName": "id"}
    result = transform(
        Resource("data/transform.csv"),
        steps=[
            steps.row_subset(descriptor),
        ],
    )
    assert result.schema == {
        "fields": [
            {"name": "id", "type": "integer"},
            {"name": "name", "type": "string"},
            {"name": "population", "type": "integer"},
        ]
    }
    assert result.read_rows() == []
def test_step_row_slice_with_start_and_step():
    """Slicing rows [1:3:2] yields only the second row of the fixture."""
    result = transform(
        Resource("data/transform.csv"),
        steps=[
            steps.row_slice(start=1, stop=3, step=2),
        ],
    )
    assert result.schema == {
        "fields": [
            {"name": "id", "type": "integer"},
            {"name": "name", "type": "string"},
            {"name": "population", "type": "integer"},
        ]
    }
    assert result.read_rows() == [
        {"id": 2, "name": "france", "population": 66},
    ]
def test_transform_package():
    """Merging chunk2 into chunk1 and dropping chunk2 leaves one combined resource."""
    package = describe("data/chunk*.csv")
    result = transform(
        package,
        steps=[
            steps.resource_transform(
                name="chunk1",
                steps=[steps.table_merge(resource="chunk2")],
            ),
            steps.resource_remove(name="chunk2"),
        ],
    )
    assert result.resource_names == ["chunk1"]
    assert result.get_resource("chunk1").read_rows() == [
        {"id": 1, "name": "english"},
        {"id": 2, "name": "中国人"},
    ]
def test_step_row_ungroup_max():
    """Ungrouping by name with selection=max keeps each group's max-population row."""
    source = Resource(path="data/transform-groups.csv")
    source.infer()
    result = transform(
        source,
        steps=[
            steps.row_ungroup(
                group_name="name",
                selection="max",
                value_name="population",
            ),
        ],
    )
    assert result.schema == source.schema
    assert result.read_rows() == [
        {"id": 3, "name": "france", "population": 66, "year": 2020},
        {"id": 1, "name": "germany", "population": 83, "year": 2020},
        {"id": 5, "name": "spain", "population": 47, "year": 2020},
    ]
def test_step_table_intersect_with_use_hash():
    """Hash-based intersect keeps only rows present in both tables."""
    source = Resource(path="data/transform.csv")
    source.infer(only_sample=True)
    other = Resource(data=[
        ["id", "name", "population"],
        [1, "germany", 83],
        [2, "france", 50],
        [3, "spain", 47],
    ])
    result = transform(
        source,
        steps=[
            steps.table_normalize(),
            steps.table_intersect(resource=other, use_hash=True),
        ],
    )
    assert result.schema == source.schema
    # france differs in population (66 vs 50), so it is excluded.
    assert result.read_rows() == [
        {"id": 1, "name": "germany", "population": 83},
        {"id": 3, "name": "spain", "population": 47},
    ]
def test_step_row_filter_petl_selectrangeopen():
    """A closed-interval formula keeps all rows with 1 <= id <= 3.

    Fixed: the filter was passed as ``predicat="<formula>..."`` — the
    deprecated pre-v4 frictionless string form. The sibling formula-based
    tests in this file use the ``formula=`` keyword directly, which is the
    current ``row_filter`` API; use it here too for consistency.
    """
    source = Resource(path="data/transform.csv")
    source.infer()
    target = transform(
        source,
        steps=[
            steps.table_normalize(),
            steps.row_filter(formula="1 <= id <= 3"),
        ],
    )
    assert target.schema == source.schema
    assert target.read_rows() == [
        {"id": 1, "name": "germany", "population": 83},
        {"id": 2, "name": "france", "population": 66},
        {"id": 3, "name": "spain", "population": 47},
    ]
def test_step_table_aggregate():
    """Aggregating population per name produces one summed row per group."""
    result = transform(
        Resource(path="data/transform-groups.csv"),
        steps=[
            steps.table_normalize(),
            steps.table_aggregate(
                group_name="name",
                aggregation={"sum": ("population", sum)},
            ),
        ],
    )
    assert result.schema == {
        "fields": [
            {"name": "name", "type": "string"},
            {"name": "sum"},
        ]
    }
    assert result.read_rows() == [
        {"name": "france", "sum": 120},
        {"name": "germany", "sum": 160},
        {"name": "spain", "sum": 80},
    ]
def test_step_row_sort_with_reverse():
    """Sorting on id with reverse=True yields rows in descending id order."""
    source = Resource(path="data/transform.csv")
    source.infer()
    result = transform(
        source,
        steps=[
            steps.row_sort(field_names=["id"], reverse=True),
        ],
    )
    assert result.schema == source.schema
    assert result.read_rows() == [
        {"id": 3, "name": "spain", "population": 47},
        {"id": 2, "name": "france", "population": 66},
        {"id": 1, "name": "germany", "population": 83},
    ]
def test_step_row_search():
    """Regex search across rows keeps only rows with a cell matching the pattern."""
    result = transform(
        Resource("data/transform.csv"),
        steps=[
            steps.row_search(regex=r"^f.*"),
        ],
    )
    assert result.schema == {
        "fields": [
            {"name": "id", "type": "integer"},
            {"name": "name", "type": "string"},
            {"name": "population", "type": "integer"},
        ]
    }
    assert result.read_rows() == [
        {"id": 2, "name": "france", "population": 66},
    ]
def test_step_row_subset_unique():
    """The unique subset keeps all rows when none repeat."""
    source = Resource(path="data/transform.csv")
    source.infer()
    result = transform(
        source,
        steps=[
            steps.row_subset(subset="unique"),
        ],
    )
    assert result.schema == source.schema
    assert result.read_rows() == [
        {"id": 1, "name": "germany", "population": 83},
        {"id": 2, "name": "france", "population": 66},
        {"id": 3, "name": "spain", "population": 47},
    ]
def test_step_field_filter():
    """field_filter projects the table down to the named fields."""
    result = transform(
        Resource(path="data/transform.csv"),
        steps=[
            steps.field_filter(names=["id", "name"]),
        ],
    )
    assert result.schema == {
        "fields": [
            {"name": "id", "type": "integer"},
            {"name": "name", "type": "string"},
        ]
    }
    assert result.read_rows() == [
        {"id": 1, "name": "germany"},
        {"id": 2, "name": "france"},
        {"id": 3, "name": "spain"},
    ]
def test_step_row_filter_petl_selectisfalse():
    """A falsiness filter on id drops every row (all ids are truthy)."""
    result = transform(
        Resource("data/transform.csv"),
        steps=[
            steps.row_filter(function=lambda row: not bool(row["id"])),
        ],
    )
    assert result.schema == {
        "fields": [
            {"name": "id", "type": "integer"},
            {"name": "name", "type": "string"},
            {"name": "population", "type": "integer"},
        ]
    }
    assert result.read_rows() == []
def test_step_field_remove():
    """field_remove drops the named field from both schema and rows."""
    result = transform(
        Resource(path="data/transform.csv"),
        steps=[
            steps.field_remove(names=["id"]),
        ],
    )
    assert result.schema == {
        "fields": [
            {"name": "name", "type": "string"},
            {"name": "population", "type": "integer"},
        ]
    }
    assert result.read_rows() == [
        {"name": "germany", "population": 83},
        {"name": "france", "population": 66},
        {"name": "spain", "population": 47},
    ]
def test_step_row_filter_petl_selectop():
    """An equality formula keeps only the matching row."""
    result = transform(
        Resource("data/transform.csv"),
        steps=[
            steps.table_normalize(),
            steps.row_filter(formula="id == 1"),
        ],
    )
    assert result.schema == {
        "fields": [
            {"name": "id", "type": "integer"},
            {"name": "name", "type": "string"},
            {"name": "population", "type": "integer"},
        ]
    }
    assert result.read_rows() == [
        {"id": 1, "name": "germany", "population": 83},
    ]
def test_step_table_join_mode_outer():
    """An outer join keeps unmatched rows from both sides, padding with None."""
    notes = Resource(data=[["id", "note"], [1, "beer"], [4, "rum"]])
    result = transform(
        Resource("data/transform.csv"),
        steps=[
            steps.table_normalize(),
            steps.table_join(resource=notes, field_name="id", mode="outer"),
        ],
    )
    assert result.schema == {
        "fields": [
            {"name": "id", "type": "integer"},
            {"name": "name", "type": "string"},
            {"name": "population", "type": "integer"},
            {"name": "note", "type": "string"},
        ]
    }
    assert result.read_rows() == [
        {"id": 1, "name": "germany", "population": 83, "note": "beer"},
        {"id": 2, "name": "france", "population": 66, "note": None},
        {"id": 3, "name": "spain", "population": 47, "note": None},
        {"id": 4, "name": None, "population": None, "note": "rum"},
    ]
def test_step_table_recast():
    """table_recast inverts table_melt, restoring the original table."""
    source = Resource(path="data/transform.csv")
    source.infer()
    result = transform(
        source,
        steps=[
            steps.table_normalize(),
            steps.table_melt(field_name="id"),
            steps.table_recast(field_name="id"),
        ],
    )
    assert result.schema == source.schema
    assert result.read_rows() == [
        {"id": 1, "name": "germany", "population": 83},
        {"id": 2, "name": "france", "population": 66},
        {"id": 3, "name": "spain", "population": 47},
    ]
def test_step_row_ungroup_last():
    """Ungrouping by name with selection=last keeps each group's final row."""
    source = Resource(path="data/transform-groups.csv")
    source.infer()
    result = transform(
        source,
        steps=[
            steps.row_ungroup(group_name="name", selection="last"),
        ],
    )
    assert result.schema == source.schema
    assert result.read_rows() == [
        {"id": 4, "name": "france", "population": 54, "year": 1920},
        {"id": 2, "name": "germany", "population": 77, "year": 1920},
        {"id": 6, "name": "spain", "population": 33, "year": 1920},
    ]
def test_step_table_diff_with_ignore_order_from_string_data():
    """table_diff with ignore_order against uncast (string-cell) comparison data.

    Fixed: this test had the same name as an earlier test in the file
    (``test_step_table_diff_with_ignore_order``), so that earlier definition
    was silently shadowed and never collected by pytest. Renamed to reflect
    what distinguishes it: the comparison table's cells are strings and no
    ``table_normalize`` step is applied.
    """
    source = Resource(path="data/transform.csv")
    source.infer()
    target = transform(
        source,
        steps=[
            steps.table_diff(
                resource=Resource(data=[
                    ["name", "id", "population"],
                    ["germany", "1", "83"],
                    ["france", "2", "50"],
                    ["spain", "3", "47"],
                ]),
                ignore_order=True,
            ),
        ],
    )
    assert target.schema == source.schema
    # Only france differs (population 66 vs "50" in the comparison data).
    assert target.read_rows() == [
        {"id": 2, "name": "france", "population": 66},
    ]
def test_step_row_subset_distinct():
    """Distinct-on-id keeps every row when all ids already differ."""
    source = Resource(path="data/transform.csv")
    source.infer(only_sample=True)
    result = transform(
        source,
        steps=[
            steps.row_subset(subset="distinct", field_name="id"),
        ],
    )
    assert result.schema == source.schema
    assert result.read_rows() == [
        {"id": 1, "name": "germany", "population": 83},
        {"id": 2, "name": "france", "population": 66},
        {"id": 3, "name": "spain", "population": 47},
    ]