def test_get_tier_confidence_unstreamable():
    """Check that ``get_tier_confidence`` raises after an upsert when refresh is not allowed.

    The test upserts the first record of the input dataset, expects
    ``metrics.get_tier_confidence(project)`` to raise ``RuntimeError``, then
    re-runs the categorization jobs and verifies the leaf-tier confidences.

    The revert step runs in a ``finally`` block so that the shared test
    instance is restored even when the expected ``RuntimeError`` is not
    raised; otherwise the other tests that call ``get_tier_confidence``
    without ``allow_dataset_refresh`` would inherit the modified state.
    """
    client = utils.client.create(**CONFIG["toolbox_test_instance"])
    dataset = client.datasets.by_resource_id(INPUT_DATASET_ID)
    df_input = dataframe.from_dataset(dataset)
    # Upsert one existing record back into the input dataset
    # (presumably this is what makes the downstream dataset unstreamable,
    # per the test name -- confirm against get_tier_confidence).
    dataset.upsert_records(
        df_input.head(1).to_dict(orient="records"),
        primary_key_name=dataset.key_attribute_names[0],
    )
    project = client.projects.by_resource_id(CATEGORIZATION_PROJECT_ID)

    try:
        with pytest.raises(RuntimeError):
            metrics.get_tier_confidence(project)
    finally:
        # revert to the original state, whether or not the check above passed
        all_ops = categorization.jobs.run(project)
        for op in all_ops:
            assert op.succeeded()

    leaf_confidence_dict = metrics.get_tier_confidence(project, tier=-1)
    assert len(leaf_confidence_dict) == 6

    expected_confidence = {
        "Dairy|Cheese": 0.77,
        "Dairy|Milk": 0.92,
        "Meat|Beef": 0.81,
        "Meat|Chicken|bone-in": 0.53,
    }
    for category_path, expected in expected_confidence.items():
        assert math.isclose(leaf_confidence_dict[category_path], expected, rel_tol=0.01)

    # These categories are expected to have no confidence value
    assert leaf_confidence_dict["Meat|Chicken|boneless"] is None
    assert leaf_confidence_dict["Vegetables"] is None
def test_get_tier_confidence_tier3():
    """Verify the confidences returned for ``tier=3``.

    Expects exactly two entries: ``Meat|Chicken|bone-in`` close to 0.53 and
    ``Meat|Chicken|boneless`` equal to ``None``.
    """
    tamr = utils.client.create(**CONFIG["toolbox_test_instance"])
    categorization_project = tamr.projects.by_resource_id(CATEGORIZATION_PROJECT_ID)

    confidences = metrics.get_tier_confidence(categorization_project, tier=3)

    assert len(confidences) == 2
    bone_in = confidences["Meat|Chicken|bone-in"]
    assert math.isclose(bone_in, 0.53, rel_tol=0.01)
    boneless = confidences["Meat|Chicken|boneless"]
    assert boneless is None
def test_get_tier_confidence_tier1():
    """Verify the confidences returned for ``tier=1``.

    Expects three top-level categories: ``Dairy`` and ``Meat`` with numeric
    confidences (1% relative tolerance) and ``Vegetables`` with ``None``.
    """
    tamr = utils.client.create(**CONFIG["toolbox_test_instance"])
    categorization_project = tamr.projects.by_resource_id(CATEGORIZATION_PROJECT_ID)

    confidences = metrics.get_tier_confidence(categorization_project, tier=1)

    assert len(confidences) == 3
    for category, expected in (("Dairy", 0.81), ("Meat", 0.64)):
        assert math.isclose(confidences[category], expected, rel_tol=0.01)
    assert confidences["Vegetables"] is None
def test_get_tier_confidence_leaf():
    """Verify the confidences returned for the leaf tier (``tier=-1``).

    Expects six leaf categories: four with numeric confidences (checked with
    a 1% relative tolerance) and two whose confidence is ``None``.
    """
    tamr = utils.client.create(**CONFIG["toolbox_test_instance"])
    categorization_project = tamr.projects.by_resource_id(CATEGORIZATION_PROJECT_ID)

    confidences = metrics.get_tier_confidence(categorization_project, tier=-1)

    assert len(confidences) == 6

    numeric_expectations = (
        ("Dairy|Cheese", 0.77),
        ("Dairy|Milk", 0.92),
        ("Meat|Beef", 0.81),
        ("Meat|Chicken|bone-in", 0.53),
    )
    for category_path, expected in numeric_expectations:
        assert math.isclose(confidences[category_path], expected, rel_tol=0.01)

    for category_path in ("Meat|Chicken|boneless", "Vegetables"):
        assert confidences[category_path] is None
def test_get_tier_confidence_refresh():
    """Verify ``get_tier_confidence`` with ``allow_dataset_refresh=True`` after an upsert.

    The first record of the input dataset is upserted, then the tier-1
    confidences are requested with refresh allowed and compared against the
    expected values.
    """
    tamr = utils.client.create(**CONFIG["toolbox_test_instance"])
    input_dataset = tamr.datasets.by_resource_id(INPUT_DATASET_ID)

    # Upsert the first existing record back into the input dataset
    first_record = dataframe.from_dataset(input_dataset).head(1).to_dict(orient="records")
    input_dataset.upsert_records(
        first_record, primary_key_name=input_dataset.key_attribute_names[0]
    )

    categorization_project = tamr.projects.by_resource_id(CATEGORIZATION_PROJECT_ID)
    confidences = metrics.get_tier_confidence(
        categorization_project, tier=1, allow_dataset_refresh=True
    )

    assert len(confidences) == 3
    for category, expected in (("Dairy", 0.81), ("Meat", 0.64)):
        assert math.isclose(confidences[category], expected, rel_tol=0.01)
    assert confidences["Vegetables"] is None
def test_get_tier_confidence_invalid_tier_less_than_negative_one():
    """A tier of ``-2`` must make ``get_tier_confidence`` raise ``ValueError``."""
    tamr = utils.client.create(**CONFIG["toolbox_test_instance"])
    categorization_project = tamr.projects.by_resource_id(CATEGORIZATION_PROJECT_ID)

    invalid_tier = -2
    with pytest.raises(ValueError):
        metrics.get_tier_confidence(categorization_project, tier=invalid_tier)
def test_get_tier_confidence_invalid_tier_float():
    """A non-integer tier (``1.5``) must make ``get_tier_confidence`` raise ``TypeError``."""
    tamr = utils.client.create(**CONFIG["toolbox_test_instance"])
    categorization_project = tamr.projects.by_resource_id(CATEGORIZATION_PROJECT_ID)

    fractional_tier = 1.5
    with pytest.raises(TypeError):
        metrics.get_tier_confidence(categorization_project, tier=fractional_tier)
def test_get_tier_confidence_invalid_project_type():
    """Passing the project behind ``MASTERING_PROJECT_ID`` must raise ``TypeError``.

    Presumably that ID refers to a mastering (non-categorization) project, as
    the constant and test names suggest.
    """
    tamr = utils.client.create(**CONFIG["toolbox_test_instance"])
    wrong_type_project = tamr.projects.by_resource_id(MASTERING_PROJECT_ID)

    with pytest.raises(TypeError):
        metrics.get_tier_confidence(wrong_type_project)
"""Snippet for retrieving confidence metrics from a Tamr Categorization project""" import tamr_toolbox as tbox from tamr_toolbox.project.categorization.metrics import get_tier_confidence # Read config, make Tamr Client, make logger tamr = tbox.utils.client.create(username="******", password="******", host="localhost") # Get a Tamr categorization project by ID my_project = tamr.projects.by_resource_id("my_project_id") # By default gets the average confidence at leaf nodes without allowing dataset to refresh leaf_node_confidence_dict = get_tier_confidence(my_project) # Can allow the dataset to refresh if it is not streamable # NOTE THIS WILL KICK OFF A <MATERIALIZE VIEWS> JOB leaf_node_confidence_dict2 = get_tier_confidence(my_project, allow_dataset_refresh=True) # Can also set the specific tier, which starts at 1 tier1_confidence_dict = get_tier_confidence(my_project, tier=1)