def test_create_dataframe_unstreamable():
    """Expect ``RuntimeError`` when reading the unstreamable dataset.

    The first row of the input dataset is upserted back into it before the
    read; presumably this is what leaves the downstream dataset stale
    (TODO confirm against the test instance setup).
    """
    tamr = utils.client.create(**CONFIG["toolbox_test_instance"])

    source = tamr.datasets.by_resource_id(INPUT_DATASET_ID)
    source_df = dataframe.from_dataset(source)
    first_row = source_df.head(1)
    key_name = source.key_attribute_names[0]
    source.upsert_from_dataframe(first_row, primary_key_name=key_name)

    stale = tamr.datasets.by_resource_id(UNSTREAMABLE_DATASET_ID)
    # No refresh is allowed here, so the read is expected to fail.
    with pytest.raises(RuntimeError):
        dataframe.from_dataset(stale)
def test_create_dataframe_refresh():
    """Read the unstreamable dataset with ``allow_dataset_refresh=True``.

    The first row of the input dataset is upserted back into it first, then
    the downstream dataset is loaded with refresh permitted and its shape
    and one known cell are checked.
    """
    tamr = utils.client.create(**CONFIG["toolbox_test_instance"])

    source = tamr.datasets.by_resource_id(INPUT_DATASET_ID)
    source_df = dataframe.from_dataset(source)
    first_row = source_df.head(1)
    key_name = source.key_attribute_names[0]
    source.upsert_from_dataframe(first_row, primary_key_name=key_name)

    stale = tamr.datasets.by_resource_id(UNSTREAMABLE_DATASET_ID)
    refreshed = dataframe.from_dataset(stale, allow_dataset_refresh=True)
    indexed = refreshed.set_index("tamr_id")

    record_id = "-1366726601913727714"
    assert indexed.shape == (9, 8)
    assert indexed.loc[record_id, "first_name"] == ["Jeff"]
def test_get_tier_confidence_unstreamable():
    """Expect ``RuntimeError`` from ``get_tier_confidence`` without a refresh.

    After the failure is observed, the categorization jobs are re-run to
    bring the project back to its original state, and the leaf-tier
    (``tier=-1``) confidences are checked against known values.
    """
    tamr = utils.client.create(**CONFIG["toolbox_test_instance"])

    source = tamr.datasets.by_resource_id(INPUT_DATASET_ID)
    source_df = dataframe.from_dataset(source)
    first_record = source_df.head(1).to_dict(orient="records")
    key_name = source.key_attribute_names[0]
    source.upsert_records(first_record, primary_key_name=key_name)

    project = tamr.projects.by_resource_id(CATEGORIZATION_PROJECT_ID)
    with pytest.raises(RuntimeError):
        metrics.get_tier_confidence(project)

    # revert to the original state
    all_ops = categorization.jobs.run(project)
    assert all(op.succeeded() for op in all_ops)

    leaf_confidence_dict = metrics.get_tier_confidence(project, tier=-1)
    assert len(leaf_confidence_dict) == 6

    expected_confidence = {
        "Dairy|Cheese": 0.77,
        "Dairy|Milk": 0.92,
        "Meat|Beef": 0.81,
        "Meat|Chicken|bone-in": 0.53,
    }
    for category, confidence in expected_confidence.items():
        assert math.isclose(
            leaf_confidence_dict[category], confidence, rel_tol=0.01
        )

    # These categories are expected to carry no confidence value at all.
    for category in ("Meat|Chicken|boneless", "Vegetables"):
        assert leaf_confidence_dict[category] is None
def test_create_dataframe_nrows():
    """Check that ``nrows=5`` yields a five-row dataframe."""
    tamr = utils.client.create(**CONFIG["toolbox_test_instance"])
    small_dataset = tamr.datasets.by_resource_id(SM_DATASET_ID)

    limited = dataframe.from_dataset(small_dataset, nrows=5)
    indexed = limited.set_index("tamr_id")

    assert indexed.shape == (5, 8)
def test_create_dataframe():
    """Load the small dataset with defaults and check shape and sample cells.

    The asserted cells hold lists, i.e. values are not flattened.
    """
    tamr = utils.client.create(**CONFIG["toolbox_test_instance"])
    small_dataset = tamr.datasets.by_resource_id(SM_DATASET_ID)

    loaded = dataframe.from_dataset(small_dataset)
    indexed = loaded.set_index("tamr_id")
    assert indexed.shape == (18, 8)

    record_id = "-8652805551987624164"
    expected_cells = {
        "all_names": ["Tuck", "Tucker"],
        "first_name": ["Tucker"],
        "ssn": [""],
    }
    for column, expected in expected_cells.items():
        assert indexed.loc[record_id, column] == expected
def test_create_dataframe_columns():
    """Check that ``columns`` restricts and orders the loaded attributes."""
    tamr = utils.client.create(**CONFIG["toolbox_test_instance"])
    small_dataset = tamr.datasets.by_resource_id(SM_DATASET_ID)

    requested = ["tamr_id", "last_name", "first_name"]
    loaded = dataframe.from_dataset(small_dataset, columns=requested)
    indexed = loaded.set_index("tamr_id")

    # "tamr_id" became the index, leaving the two name columns.
    assert indexed.shape == (18, 2)
    assert list(indexed.columns) == ["last_name", "first_name"]
    assert indexed.loc["-8652805551987624164", "first_name"] == ["Tucker"]
def test_create_dataframe_flattened():
    """Load a dataset with ``flatten_delimiter="||"`` and check sample cells.

    The expectations show string lists joined with the delimiter, while the
    integer list in ``name_lengths`` is left as a list.
    """
    tamr = utils.client.create(**CONFIG["toolbox_test_instance"])
    golden_records = tamr.datasets.by_resource_id(GR_DATASET_ID)

    loaded = dataframe.from_dataset(golden_records, flatten_delimiter="||")
    indexed = loaded.set_index("persistentId")
    assert indexed.shape == (8, 9)

    persistent_id = "218c3f66-b240-3b08-b688-2c8d0506f12f"
    expected_cells = {
        "all_first_names": "Rob||Robert",
        "name_lengths": [3, 6],
        "ssn": "123",
        "Cluster Size": 2,
    }
    for column, expected in expected_cells.items():
        assert indexed.loc[persistent_id, column] == expected
def test_get_tier_confidence_refresh():
    """Check tier-1 confidences when a dataset refresh is allowed.

    The first record of the input dataset is upserted back into it, then
    ``get_tier_confidence`` is called with ``allow_dataset_refresh=True``.
    """
    tamr = utils.client.create(**CONFIG["toolbox_test_instance"])

    source = tamr.datasets.by_resource_id(INPUT_DATASET_ID)
    source_df = dataframe.from_dataset(source)
    first_record = source_df.head(1).to_dict(orient="records")
    key_name = source.key_attribute_names[0]
    source.upsert_records(first_record, primary_key_name=key_name)

    project = tamr.projects.by_resource_id(CATEGORIZATION_PROJECT_ID)
    tier1_confidence_dict = metrics.get_tier_confidence(
        project,
        tier=1,
        allow_dataset_refresh=True,
    )
    assert len(tier1_confidence_dict) == 3

    expected_confidence = {"Dairy": 0.81, "Meat": 0.64}
    for category, confidence in expected_confidence.items():
        assert math.isclose(
            tier1_confidence_dict[category], confidence, rel_tol=0.01
        )
    assert tier1_confidence_dict["Vegetables"] is None
def test_create_dataframe_then_flatten():
    """Flatten an already-loaded dataframe, fully and for selected columns.

    With no ``columns`` argument the asserted cells are all delimited
    strings; with ``columns=["first_name", "ssn"]`` the ``all_names`` cell
    is still a list.
    """
    tamr = utils.client.create(**CONFIG["toolbox_test_instance"])
    small_dataset = tamr.datasets.by_resource_id(SM_DATASET_ID)
    loaded = dataframe.from_dataset(small_dataset)
    record_id = "-8652805551987624164"

    # Flatten every column.
    fully_flat = dataframe.flatten(loaded, delimiter="||")
    fully_flat = fully_flat.set_index("tamr_id")
    assert fully_flat.shape == (18, 8)
    expected_full = {
        "all_names": "Tuck||Tucker",
        "first_name": "Tucker",
        "ssn": "",
    }
    for column, expected in expected_full.items():
        assert fully_flat.loc[record_id, column] == expected

    # Flatten only the named columns.
    partly_flat = dataframe.flatten(
        loaded, delimiter="||", columns=["first_name", "ssn"]
    )
    partly_flat = partly_flat.set_index("tamr_id")
    assert partly_flat.shape == (18, 8)
    expected_partial = {
        "all_names": ["Tuck", "Tucker"],
        "first_name": "Tucker",
        "ssn": "",
    }
    for column, expected in expected_partial.items():
        assert partly_flat.loc[record_id, column] == expected
def test_create_dataframe_wrong_columns():
    """Expect ``ValueError`` when ``columns`` includes ``"middle_initial"``.

    Presumably that attribute does not exist on the small dataset
    (TODO confirm against the test instance).
    """
    tamr = utils.client.create(**CONFIG["toolbox_test_instance"])
    small_dataset = tamr.datasets.by_resource_id(SM_DATASET_ID)

    requested = ["tamr_id", "middle_initial"]
    with pytest.raises(ValueError):
        dataframe.from_dataset(small_dataset, columns=requested)
def test_create_dataframe_force_flatten_no_delimiter():
    """Expect ``ValueError`` for ``force_flatten=True`` with no delimiter given."""
    tamr = utils.client.create(**CONFIG["toolbox_test_instance"])
    golden_records = tamr.datasets.by_resource_id(GR_DATASET_ID)

    with pytest.raises(ValueError):
        dataframe.from_dataset(golden_records, force_flatten=True)