Beispiel #1
0
def test_delete_correct_rows_from_parquet_table_with_both_simple_and_composite_types(
):
    """Combine a simple and a composite matcher in one delete call.

    The simple matcher targets ``customer_id`` 12345 and the composite
    matcher targets the ("jane", "doe") name pair, so the test expects two
    rows to be reported as deleted and only customer 34567 to remain.
    """
    simple_match = {
        "Column": "customer_id",
        "MatchIds": [12345],
        "Type": "Simple",
    }
    composite_match = {
        "Columns": ["first_name", "last_name"],
        "MatchIds": [["jane", "doe"]],
        "Type": "Composite",
    }
    frame = pd.DataFrame(
        {
            "customer_id": [12345, 23456, 34567],
            "first_name": ["john", "jane", "matteo"],
            "last_name": ["doe", "doe", "hey"],
        }
    )
    source = pa.Table.from_pandas(frame)

    remaining, deleted_rows = delete_from_table(
        source, [simple_match, composite_match]
    )

    res = remaining.to_pandas()
    assert len(res) == 1
    assert deleted_rows == 2
    assert res["customer_id"].values[0] == 34567
Beispiel #2
0
def test_delete_correct_rows_from_parquet_table_with_complex_composite_types():
    """Composite matcher addressing nested struct fields via dotted paths.

    Two of the three match tuples correspond to rows in the data; the third
    ("Matteo", "Doe") pairs a first name with a last name that do not occur
    together, and the test expects the ("Matteo", "Hey") row to survive.
    """
    people = [("John", "Doe"), ("Jane", "Doe"), ("Matteo", "Hey")]
    frame = pd.DataFrame(
        {
            "customer_id": [12345, 23456, 34567],
            "details": [
                {"first_name": first, "last_name": last}
                for first, last in people
            ],
        }
    )
    match_spec = [
        {
            "Columns": ["details.first_name", "details.last_name"],
            "MatchIds": [["John", "Doe"], ["Jane", "Doe"], ["Matteo", "Doe"]],
            "Type": "Composite",
        }
    ]
    source = pa.Table.from_pandas(frame)

    remaining, deleted_rows = delete_from_table(source, match_spec)

    res = remaining.to_pandas()
    assert len(res) == 1
    assert deleted_rows == 2
    assert res["customer_id"].values[0] == 34567
Beispiel #3
0
def test_it_handles_data_with_pandas_indexes():
    """Delete from a table built from a DataFrame with a custom index.

    The frame is indexed by the labels "a", "b", "c"; the test expects the
    surviving row to keep its label in the ``__index_level_0__`` column of
    the resulting table.
    """
    customer_ids = ("12345", "23456", "34567")
    records = [{"customer_id": cid} for cid in customer_ids]
    frame = pd.DataFrame(records, index=list("abc"))
    match_spec = [
        {
            "Column": "customer_id",
            "MatchIds": ["12345", "23456"],
            "Type": "Simple",
        }
    ]
    source = pa.Table.from_pandas(frame)

    remaining, deleted_rows = delete_from_table(source, match_spec)

    res = remaining.to_pandas()
    assert len(res) == 1
    assert deleted_rows == 2
    expected = {"customer_id": ["34567"], "__index_level_0__": ["c"]}
    assert remaining.to_pydict() == expected
Beispiel #4
0
def test_handles_lower_cased_column_names():
    """Match a nested column whose identifier is supplied in lower case.

    The data uses the mixed-case path ``userData.customerId`` while the
    matcher names it ``userdata.customerid``; the test expects the two
    listed ids to be deleted and the original field casing to be kept in
    the remaining row.
    """
    customer_ids = ("12345", "23456", "34567")
    frame = pd.DataFrame(
        [{"userData": {"customerId": cid}} for cid in customer_ids]
    )
    match_spec = [
        {
            "Column": "userdata.customerid",
            "MatchIds": ["12345", "23456"],
            "Type": "Simple",
        }
    ]
    source = pa.Table.from_pandas(frame)

    remaining, deleted_rows = delete_from_table(source, match_spec)

    res = remaining.to_pandas()
    assert len(res) == 1
    assert deleted_rows == 2
    assert remaining.to_pydict() == {"userData": [{"customerId": "34567"}]}
def test_delete_correct_rows_from_parquet_table_with_complex_types():
    """Delete rows by a field nested inside a struct column.

    Uses the three-argument call form, passing the schema derived from the
    DataFrame alongside the table. Rows whose ``user_info.name`` is "matteo"
    or "chris" are targeted, so the test expects only "nick" to remain.
    """
    users = [
        {"name": "matteo", "email": "*****@*****.**"},
        {"name": "nick", "email": "*****@*****.**"},
        {"name": "chris", "email": "*****@*****.**"},
    ]
    frame = pd.DataFrame(
        {
            "customer_id": [12345, 23456, 34567],
            "user_info": users,
        }
    )
    match_spec = [{"Column": "user_info.name", "MatchIds": ["matteo", "chris"]}]
    source = pa.Table.from_pandas(frame)
    schema = pa.Schema.from_pandas(frame)

    remaining, deleted_rows = delete_from_table(source, match_spec, schema)

    res = remaining.to_pandas()
    assert len(res) == 1
    assert deleted_rows == 2
    assert res["customer_id"].values[0] == 23456
    # The struct column is expected back as a nested mapping, i.e. not
    # flattened into separate columns.
    expected_user = {"name": "nick", "email": "*****@*****.**"}
    assert res["user_info"].values[0] == expected_user
def test_delete_correct_rows_from_table():
    """Delete two of three rows from a flat, single-column table.

    Uses the three-argument call form, passing the schema derived from the
    DataFrame alongside the table; the test expects only customer "34567"
    to remain.
    """
    customer_ids = ("12345", "23456", "34567")
    frame = pd.DataFrame([{"customer_id": cid} for cid in customer_ids])
    match_spec = [{"Column": "customer_id", "MatchIds": ["12345", "23456"]}]
    source = pa.Table.from_pandas(frame)
    schema = pa.Schema.from_pandas(frame)

    remaining, deleted_rows = delete_from_table(source, match_spec, schema)

    res = remaining.to_pandas()
    assert len(res) == 1
    assert deleted_rows == 2
    assert remaining.to_pydict() == {"customer_id": ["34567"]}