# Assumed imports for these tests, following tamr-toolbox's module layout.
# CONFIG and the ID constants (INPUT_DATASET_ID, UNSTREAMABLE_DATASET_ID,
# SM_DATASET_ID, GR_DATASET_ID, CATEGORIZATION_PROJECT_ID) are expected to be
# defined by the test suite's shared configuration.
import math

import pytest

from tamr_toolbox import utils
from tamr_toolbox.data_io import dataframe
from tamr_toolbox.project import categorization
from tamr_toolbox.project.categorization import metrics


def test_create_dataframe_unstreamable():
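    """from_dataset should raise RuntimeError when the dataset cannot be streamed
    and allow_dataset_refresh is not set."""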
    client = utils.client.create(**CONFIG["toolbox_test_instance"])
    input_dataset = client.datasets.by_resource_id(INPUT_DATASET_ID)
    df_input = dataframe.from_dataset(input_dataset)
    input_dataset.upsert_from_dataframe(
        df_input.head(1),
        primary_key_name=input_dataset.key_attribute_names[0],
    )
    dataset = client.datasets.by_resource_id(UNSTREAMABLE_DATASET_ID)
    with pytest.raises(RuntimeError):
        dataframe.from_dataset(dataset)


def test_create_dataframe_refresh():
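    """from_dataset with allow_dataset_refresh=True should refresh an unstreamable
    dataset and return its records."""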
    client = utils.client.create(**CONFIG["toolbox_test_instance"])
    input_dataset = client.datasets.by_resource_id(INPUT_DATASET_ID)
    df_input = dataframe.from_dataset(input_dataset)
    input_dataset.upsert_from_dataframe(
        df_input.head(1),
        primary_key_name=input_dataset.key_attribute_names[0],
    )
    dataset = client.datasets.by_resource_id(UNSTREAMABLE_DATASET_ID)
    df = dataframe.from_dataset(dataset, allow_dataset_refresh=True)
    df = df.set_index("tamr_id")

    assert df.shape == (9, 8)
    assert df.loc["-1366726601913727714", "first_name"] == ["Jeff"]


def test_get_tier_confidence_unstreamable():
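    """get_tier_confidence should raise RuntimeError while the dataset is unstreamable;
    after re-running the categorization project, tier=-1 returns leaf-node confidences."""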
    client = utils.client.create(**CONFIG["toolbox_test_instance"])
    dataset = client.datasets.by_resource_id(INPUT_DATASET_ID)
    df_input = dataframe.from_dataset(dataset)
    dataset.upsert_records(df_input.head(1).to_dict(orient="records"),
                           primary_key_name=dataset.key_attribute_names[0])
    project = client.projects.by_resource_id(CATEGORIZATION_PROJECT_ID)

    with pytest.raises(RuntimeError):
        metrics.get_tier_confidence(project)

    # revert to the original state
    all_ops = categorization.jobs.run(project)

    for op in all_ops:
        assert op.succeeded()

    leaf_confidence_dict = metrics.get_tier_confidence(project, tier=-1)

    assert len(leaf_confidence_dict) == 6
    assert math.isclose(leaf_confidence_dict["Dairy|Cheese"],
                        0.77,
                        rel_tol=0.01)
    assert math.isclose(leaf_confidence_dict["Dairy|Milk"], 0.92, rel_tol=0.01)
    assert math.isclose(leaf_confidence_dict["Meat|Beef"], 0.81, rel_tol=0.01)
    assert math.isclose(leaf_confidence_dict["Meat|Chicken|bone-in"],
                        0.53,
                        rel_tol=0.01)
    assert leaf_confidence_dict["Meat|Chicken|boneless"] is None
    assert leaf_confidence_dict["Vegetables"] is None


def test_create_dataframe_nrows():
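    """nrows should limit the number of records pulled into the DataFrame."""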
    client = utils.client.create(**CONFIG["toolbox_test_instance"])
    dataset = client.datasets.by_resource_id(SM_DATASET_ID)
    df = dataframe.from_dataset(dataset, nrows=5)
    df = df.set_index("tamr_id")

    assert df.shape == (5, 8)


def test_create_dataframe():
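    """from_dataset should return every record, with multi-value attributes as lists."""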
    client = utils.client.create(**CONFIG["toolbox_test_instance"])
    dataset = client.datasets.by_resource_id(SM_DATASET_ID)
    df = dataframe.from_dataset(dataset)
    df = df.set_index("tamr_id")

    assert df.shape == (18, 8)
    assert df.loc["-8652805551987624164", "all_names"] == ["Tuck", "Tucker"]
    assert df.loc["-8652805551987624164", "first_name"] == ["Tucker"]
    assert df.loc["-8652805551987624164", "ssn"] == [""]


def test_create_dataframe_columns():
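    """columns should restrict the DataFrame to the requested attributes."""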
    client = utils.client.create(**CONFIG["toolbox_test_instance"])
    dataset = client.datasets.by_resource_id(SM_DATASET_ID)
    df = dataframe.from_dataset(dataset,
                                columns=["tamr_id", "last_name", "first_name"])
    df = df.set_index("tamr_id")

    assert df.shape == (18, 2)
    assert list(df.columns) == ["last_name", "first_name"]
    assert df.loc["-8652805551987624164", "first_name"] == ["Tucker"]


def test_create_dataframe_flattened():
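    """flatten_delimiter should join arrays of strings into delimited strings while
    leaving non-string arrays as lists."""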
    client = utils.client.create(**CONFIG["toolbox_test_instance"])
    dataset = client.datasets.by_resource_id(GR_DATASET_ID)
    df = dataframe.from_dataset(dataset, flatten_delimiter="||")
    df = df.set_index("persistentId")

    assert df.shape == (8, 9)
    assert df.loc["218c3f66-b240-3b08-b688-2c8d0506f12f",
                  "all_first_names"] == "Rob||Robert"
    assert df.loc["218c3f66-b240-3b08-b688-2c8d0506f12f",
                  "name_lengths"] == [3, 6]
    assert df.loc["218c3f66-b240-3b08-b688-2c8d0506f12f", "ssn"] == "123"
    assert df.loc["218c3f66-b240-3b08-b688-2c8d0506f12f", "Cluster Size"] == 2


def test_get_tier_confidence_refresh():
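    """get_tier_confidence with allow_dataset_refresh=True should refresh the dataset
    and return tier-1 confidences."""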
    client = utils.client.create(**CONFIG["toolbox_test_instance"])
    dataset = client.datasets.by_resource_id(INPUT_DATASET_ID)
    df_input = dataframe.from_dataset(dataset)
    dataset.upsert_records(df_input.head(1).to_dict(orient="records"),
                           primary_key_name=dataset.key_attribute_names[0])
    project = client.projects.by_resource_id(CATEGORIZATION_PROJECT_ID)
    tier1_confidence_dict = metrics.get_tier_confidence(
        project, tier=1, allow_dataset_refresh=True)

    assert len(tier1_confidence_dict) == 3
    assert math.isclose(tier1_confidence_dict["Dairy"], 0.81, rel_tol=0.01)
    assert math.isclose(tier1_confidence_dict["Meat"], 0.64, rel_tol=0.01)
    assert tier1_confidence_dict["Vegetables"] is None


def test_create_dataframe_then_flatten():
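    """dataframe.flatten should flatten every column by default, or only the columns
    passed via the columns argument."""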
    client = utils.client.create(**CONFIG["toolbox_test_instance"])
    dataset = client.datasets.by_resource_id(SM_DATASET_ID)
    df = dataframe.from_dataset(dataset)
    df1 = dataframe.flatten(df, delimiter="||")
    df1 = df1.set_index("tamr_id")

    assert df1.shape == (18, 8)
    assert df1.loc["-8652805551987624164", "all_names"] == "Tuck||Tucker"
    assert df1.loc["-8652805551987624164", "first_name"] == "Tucker"
    assert df1.loc["-8652805551987624164", "ssn"] == ""

    df2 = dataframe.flatten(df, delimiter="||", columns=["first_name", "ssn"])
    df2 = df2.set_index("tamr_id")

    assert df2.shape == (18, 8)
    assert df2.loc["-8652805551987624164", "all_names"] == ["Tuck", "Tucker"]
    assert df2.loc["-8652805551987624164", "first_name"] == "Tucker"
    assert df2.loc["-8652805551987624164", "ssn"] == ""


def test_create_dataframe_wrong_columns():
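    """Requesting a column that does not exist in the dataset should raise ValueError."""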
    client = utils.client.create(**CONFIG["toolbox_test_instance"])
    dataset = client.datasets.by_resource_id(SM_DATASET_ID)
    with pytest.raises(ValueError):
        dataframe.from_dataset(dataset, columns=["tamr_id", "middle_initial"])


def test_create_dataframe_force_flatten_no_delimiter():
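    """force_flatten=True without a flatten_delimiter should raise ValueError."""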
    client = utils.client.create(**CONFIG["toolbox_test_instance"])
    dataset = client.datasets.by_resource_id(GR_DATASET_ID)
    with pytest.raises(ValueError):
        dataframe.from_dataset(dataset, force_flatten=True)