コード例 #1
0
def test_get_categories_at_tier_1():
    """Check that tier 1 of the test project yields exactly the three expected categories."""
    tamr = utils.client.create(**CONFIG["toolbox_test_instance"])
    categorization_project = tamr.projects.by_resource_id(CATEGORIZATION_PROJECT_ID)
    tier1_categories = metrics._get_categories_at_tier(categorization_project, tier=1)

    # Size first, then membership of each expected top-level category name
    assert len(tier1_categories) == 3
    for expected_name in ("Dairy", "Meat", "Vegetables"):
        assert expected_name in tier1_categories
コード例 #2
0
def test_get_categories_at_tier_2():
    """Check that tier 2 of the test project yields exactly the four expected category paths."""
    tamr = utils.client.create(**CONFIG["toolbox_test_instance"])
    categorization_project = tamr.projects.by_resource_id(CATEGORIZATION_PROJECT_ID)
    tier2_categories = metrics._get_categories_at_tier(categorization_project, tier=2)

    # Tier-2 entries are reported as pipe-delimited paths from the root
    expected_paths = ("Dairy|Cheese", "Dairy|Milk", "Meat|Beef", "Meat|Chicken")

    assert len(tier2_categories) == 4
    for expected_path in expected_paths:
        assert expected_path in tier2_categories
コード例 #3
0
def test_extract_confidence_tier1():
    """Check the per-category confidence values extracted for tier-1 categories."""
    tamr = utils.client.create(**CONFIG["toolbox_test_instance"])
    categorization_project = tamr.projects.by_resource_id(CATEGORIZATION_PROJECT_ID)
    source_dataset = tamr.datasets.by_resource_id(CATEGORIZATION_DATASET_ID)
    tier1_categories = metrics._get_categories_at_tier(categorization_project, tier=1)
    confidence_by_category = metrics._extract_confidence(
        dataset=source_dataset, category_set=tier1_categories
    )

    assert len(confidence_by_category) == 3

    # Categories with a numeric confidence are compared within a 1% relative tolerance
    expected_confidences = (("Dairy", 0.81), ("Meat", 0.64))
    for category_name, expected_value in expected_confidences:
        assert math.isclose(confidence_by_category[category_name], expected_value, rel_tol=0.01)

    # This category is expected to be present in the result with a None confidence
    assert confidence_by_category["Vegetables"] is None
コード例 #4
0
def test_extract_confidence_leaf():
    """Check the per-category confidence values extracted for leaf categories (tier=-1)."""
    tamr = utils.client.create(**CONFIG["toolbox_test_instance"])
    categorization_project = tamr.projects.by_resource_id(CATEGORIZATION_PROJECT_ID)
    source_dataset = tamr.datasets.by_resource_id(CATEGORIZATION_DATASET_ID)
    leaf_categories = metrics._get_categories_at_tier(categorization_project, tier=-1)
    confidence_by_leaf = metrics._extract_confidence(
        dataset=source_dataset, category_set=leaf_categories
    )

    assert len(confidence_by_leaf) == 6

    # Leaves with a numeric confidence are compared within a 1% relative tolerance
    expected_confidences = (
        ("Dairy|Cheese", 0.77),
        ("Dairy|Milk", 0.92),
        ("Meat|Beef", 0.81),
        ("Meat|Chicken|bone-in", 0.53),
    )
    for leaf_path, expected_value in expected_confidences:
        assert math.isclose(confidence_by_leaf[leaf_path], expected_value, rel_tol=0.01)

    # These leaves are expected to be present in the result with a None confidence
    for leaf_path in ("Meat|Chicken|boneless", "Vegetables"):
        assert confidence_by_leaf[leaf_path] is None
コード例 #5
0
def main(
    *,
    instance_connection_info: Dict[str, Any],
    categorization_project_id: str,
    unified_attribute_name: str,
    category_tier: Optional[int] = None,
) -> None:
    """Bootstraps the model for a categorization project by adding the taxonomy as a separate
    source with training labels

    The taxonomy categories are written to a new dataset named
    ``<unified dataset name>_taxonomy_bootstrap_dataset``, which is added to the project as an
    input dataset. Its "Category Name" attribute is mapped onto the given unified attribute, a
    unified-scope transformation is appended to rewrite ``tamr_id`` for records of that dataset,
    one verified label is posted per category, and finally feedback is applied and results
    are updated.

    Args:
        instance_connection_info: Information for connecting to Tamr (host, port, username etc)
        categorization_project_id: The id of the target categorization project
        unified_attribute_name: The unified attribute to map the category names onto
        category_tier: Which tier of the taxonomy to confine labels to. Use -1 for leaf nodes.
            If not passed, all categories at all tiers will be used.

    Returns:
        None

    Raises:
        RuntimeError: the project has no attribute named `unified_attribute_name`, a dataset
            with the bootstrap dataset name already exists, or the project taxonomy could not
            be retrieved
        TypeError: presumably raised by `as_categorization()` when the retrieved project is not
            a categorization project (verify against the Tamr client documentation)
    """

    # Create the tamr client
    tamr_client = tbox.utils.client.create(**instance_connection_info)

    # Retrieve the project
    project = tamr_client.projects.by_resource_id(
        categorization_project_id).as_categorization()
    LOGGER.info(f"Retrieved project with name: {project.name}")

    # Confirm the target unified attribute exists
    try:
        project.attributes.by_name(unified_attribute_name)
    except requests.exceptions.HTTPError as err:
        # Chain the original error so the underlying HTTP failure stays visible in tracebacks
        raise RuntimeError(
            f"Project {project.name} has no attribute {unified_attribute_name}."
        ) from err

    # Look up the unified dataset name once; it is needed for both the bootstrap dataset name
    # and the attribute mapping further down
    unified_dataset_name = project.unified_dataset().name

    # Refuse to overwrite a bootstrap dataset left over from a previous run
    dataset_name = f"{unified_dataset_name}_taxonomy_bootstrap_dataset"
    try:
        project.client.datasets.by_name(dataset_name)
    except KeyError:
        # Dataset with `dataset_name` does not exist in Tamr. Proceed with dataset creation.
        pass
    else:
        dataset_exists_error = (
            f"A dataset with name {dataset_name} already exists. Try again after deleting the "
            "dataset.")
        LOGGER.error(dataset_exists_error)
        raise RuntimeError(dataset_exists_error)

    # Get the project taxonomy (fetched once and reused below)
    try:
        taxonomy = project.taxonomy()
    except requests.exceptions.RequestException as err:
        raise RuntimeError(
            f"Project {project.name} is not associated with any taxonomy yet."
        ) from err
    LOGGER.info(f"Retrieved project taxonomy with name: {taxonomy.name}")

    # Collect the category paths to bootstrap, each as a list of tier names
    if category_tier is None:
        category_list = [category.path for category in taxonomy.categories()]
    else:
        category_set = metrics._get_categories_at_tier(project=project,
                                                       tier=category_tier)
        # Tier-restricted categories come back as "|"-delimited strings; split them into
        # per-tier lists and sort for a deterministic dataset order
        category_list = sorted(category.split("|") for category in category_set)

    # Create a dictionary of full path as a string to the leaf node name (used as label path)
    taxonomy_dict = {
        ", ".join(category): category[-1]
        for category in category_list
    }

    # Create a dataframe with one row per category
    df = pd.DataFrame(list(taxonomy_dict.items()),
                      columns=["Category Path", "Category Name"])

    # Create a dataset in Tamr
    taxonomy_dataset = project.client.datasets.create_from_dataframe(
        df, primary_key_name="Category Path", dataset_name=dataset_name)
    LOGGER.info(
        f"Created a dataset in Tamr with name: {taxonomy_dataset.name}")

    # Add the dataset into the project
    project.add_input_dataset(taxonomy_dataset)
    LOGGER.info(f"Added {taxonomy_dataset.name} to project {project.name}")

    # Map category name attribute to new unified attribute
    attr_mapping_spec = (
        AttributeMappingSpec.new()
        .with_input_dataset_name(dataset_name)
        .with_input_attribute_name("Category Name")
        .with_unified_dataset_name(unified_dataset_name)
        .with_unified_attribute_name(unified_attribute_name)
    )
    project.attribute_mappings().create(attr_mapping_spec.to_dict())
    LOGGER.info(
        f"Created mapping from source attribute 'Category Name' to unified attribute "
        f"{unified_attribute_name}")

    # Create transformation ensuring dataset tamr_id values match categorization path:
    # records from the bootstrap dataset get tamr_id = "<dataset_name>_<primary key>", which is
    # the same format used for `recordId` in the labels posted below
    all_tx = tbox.project.schema_mapping.transformations.get_all(project)
    new_tx = (
        f"SELECT *, CASE WHEN origin_source_name = '{dataset_name}' THEN "
        f"concat(origin_source_name, '_', origin_entity_id) ELSE tamr_id END AS tamr_id;"
    )
    # Append so that it is applied after any other possibly conflicting transformations
    all_tx.unified_scope.append(new_tx)
    tbox.project.schema_mapping.transformations.set_all(project, all_tx)

    LOGGER.info("Updating the unified dataset...")
    tbox.project.categorization.jobs.update_unified_dataset(project)

    # Prepare and post labels
    # NOTE(review): `path` here is the leaf category name (a string), not the full list of tier
    # names from the root — confirm this is the shape the labels endpoint expects.
    labels_to_bootstrap = [{
        "action": "CREATE",
        "recordId": f"{dataset_name}_{key}",
        "record": {
            "verified": {
                "category": {
                    "path": path
                },
                "reason": "Taxonomy bootstrap"
            }
        },
    } for key, path in taxonomy_dict.items()]
    # `.successful()` is presumably the Tamr client's check that raises on an HTTP error status
    project.client.post(
        f"projects/{project.resource_id}/categorizations/labels:updateRecords",
        json=labels_to_bootstrap,
    ).successful()
    LOGGER.info(f"Created and inserted labels into {project.name}")

    # Apply feedback and update results
    tbox.project.categorization.jobs.apply_feedback_and_update_results(project)
    LOGGER.info("Successfully applied and updated the model")
    LOGGER.info(f"Completed bootstrapping taxonomy in project {project.name}")