Example #1
0
def _check_taxonomy_depth(project: Project, *, tier: int) -> None:
    """
    Checks that `tier` does not exceed the maximum depth of the taxonomy
    associated with the given classification project.

    Args:
        project: Tamr project object
        tier: integer specifying the tier from which to extract categories;
              -1 denotes leaf nodes, for which no depth check is needed

    Raises:
        ValueError: if tier is greater than the maximum taxonomy depth
    """

    # depth check is not required for leaf nodes
    if tier == -1:
        return

    classification_project = project.as_categorization()
    taxonomy = classification_project.taxonomy()
    categories = taxonomy.categories()

    # the longest category path determines the taxonomy depth;
    # default=0 covers an empty taxonomy
    max_depth = max((len(category.path) for category in categories), default=0)

    if max_depth < tier:
        invalid_tier_value_error = (
            f"Invalid value for tier {tier}. Maximum depth detected is {max_depth}."
        )
        LOGGER.error(invalid_tier_value_error)
        raise ValueError(invalid_tier_value_error)
Example #2
0
def _get_categories_at_tier(project: Project, *, tier: int) -> set:
    """
    Extracts categories at tier from a taxonomy associated with Project

    Args:
        project: Tamr project object
        tier: integer specifying the tier to extract the categories;
              -1 will return all leaf categories

    Returns:
        set of category paths at tier, joined by '|' if multi-level taxonomy
    """
    classification_project = project.as_categorization()
    taxonomy = classification_project.taxonomy()
    categories = taxonomy.categories()

    if tier > 0:
        # keep only categories whose path length matches the requested tier,
        # flattening multi-level paths with '|'
        return {
            "|".join(category.path)
            for category in categories
            if len(category.path) == tier
        }

    # leaf nodes
    return _create_leaf_node_set(taxonomy)
Example #3
0
def get_tier_confidence(
        project: Project,
        *,
        tier: int = -1,
        allow_dataset_refresh: bool = False) -> data_type.JsonDict:
    """
    Extracts tier-specific average confidence from a Tamr internal dataset
    `<unified dataset name>_classifications_average_confidences` in a dictionary

    Args:
        project: Tamr project object
        tier: integer specifying the tier to extract the average confidence;
              default value will return the average confidence at all leaf categories
        allow_dataset_refresh: if True, allows running a job to refresh dataset to make it
                               streamable

    Returns:
        dictionary - keys are category paths, joined by '|' if multi-level taxonomy and values are
        average confidence of the corresponding keys

    Raises:
        RuntimeError: if `dataset` is not streamable and `allow_dataset_refresh` is False;
        TypeError: if tier is not of type int;
                   or if the project type is not classification
        ValueError: if tier is less than -1 or equal to 0;
                    or if tier is greater than the maximum taxonomy depth
    """
    LOGGER.info(
        f"Retrieving average confidence for taxonomy nodes in project {project.name} "
        f"(id={project.resource_id}).")
    # check project type is categorization
    try:
        project = project.as_categorization()
    except TypeError:
        not_categorization_error = f"Project {project.name} is not a classification project."
        LOGGER.error(not_categorization_error)
        # suppress the original traceback; our message carries the useful context
        raise TypeError(not_categorization_error) from None

    # check necessary dataset can be obtained
    dataset = _get_dataset_with_confidence(project)

    # check tier is valid
    # type() rather than isinstance() so that bool (an int subclass) is rejected too
    if type(tier) is not int:
        wrong_tier_type_error = f"Tier {tier} is not an integer."
        LOGGER.error(wrong_tier_type_error)
        raise TypeError(wrong_tier_type_error)
    if tier < -1 or tier == 0:
        invalid_tier_value_error = (
            f"Invalid value for tier {tier}. Tier cannot be 0 or less than -1."
        )
        LOGGER.error(invalid_tier_value_error)
        raise ValueError(invalid_tier_value_error)

    # check dataset can be streamed
    if not dataset.status().is_streamable:
        if allow_dataset_refresh:
            LOGGER.info(
                f"Refreshing dataset {dataset.name} to make streamable.")
            op = dataset.refresh()
            operation.enforce_success(op)
        else:
            cannot_stream_error = (
                f"Dataset {dataset.name} is not streamable. "
                f"Refresh it first, or run with allow_dataset_refresh=True")
            LOGGER.error(cannot_stream_error)
            raise RuntimeError(cannot_stream_error)

    # check dataset contains necessary attributes
    _check_dataset_with_confidence(dataset)

    # check tier does not exceed maximum taxonomy depth (raises ValueError otherwise)
    _check_taxonomy_depth(project, tier=tier)

    # obtain categories at tier
    selected_category_set = _get_categories_at_tier(project, tier=tier)

    # extract average confidence
    tier_confidence_dict = _extract_confidence(
        dataset=dataset, category_set=selected_category_set)
    return tier_confidence_dict
Example #4
0
def from_taxonomy(
    project: Project,
    export_file_path: Union[Path, str],
    *,
    csv_delimiter: str = ",",
    flatten_delimiter: str = "|",
    quote_character: str = '"',
    quoting: int = csv.QUOTE_MINIMAL,
    overwrite: bool = False,
) -> int:
    """
    Export a Tamr taxonomy to a csv file. Records are streamed to disk and written according to a
    given buffer size.

    Args:
        project: Tamr Project object
        export_file_path: Path to the csv file where the dataset will be saved
        csv_delimiter: Delimiter of the csv file
        flatten_delimiter: Flatten list types to strings by concatenating with this delimiter
        quote_character: Character used to escape value for csv delimiter when it appears in the
            value.
        quoting: The escape strategy to use according to the Python csv writer.
            See https://docs.python.org/2/library/csv.html#csv.QUOTE_MINIMAL
        overwrite: if True and export_file_name already exists, overwrite the file.
            Otherwise throw an error

    Returns:
        The total number of records written

    Raises:
        FileExistsError: if `export_file_path` exists and `overwrite` is set to False
        IOError: if the specified filepath does not exist or cannot be accessed
        RuntimeError: if the classification project is not yet associated with a taxonomy or
                      taxonomy cannot be written to a csv file
        TypeError: if the project type is not classification
        ValueError: if `csv_delimiter` and `flatten_delimiter` are identical values
    """
    LOGGER.info(
        f"Streaming taxonomy to csv file {export_file_path} from project {project.name} "
        f"(project id={project.resource_id}).")

    # check project type is categorization
    try:
        project = project.as_categorization()
    except TypeError:
        not_categorization_error = f"Project {project.name} is not a classification project."
        LOGGER.error(not_categorization_error)
        # suppress the original traceback; our message carries the useful context
        raise TypeError(not_categorization_error) from None

    if os.path.exists(export_file_path):
        if not overwrite:
            message = (
                f"CSV file {export_file_path} already exists. "
                f"(Set 'overwrite' flag to True if you wish to overwrite)")
            LOGGER.error(message)
            raise FileExistsError(message)
        else:
            LOGGER.warning(
                f"CSV file {export_file_path} already exists and will be overwritten"
            )

    if csv_delimiter == flatten_delimiter:
        message = (
            f"The CSV delimiter '{csv_delimiter}' cannot be identical to the list "
            f"flattening delimiter '{flatten_delimiter}'")
        LOGGER.error(message)
        raise ValueError(message)

    try:
        taxonomy = project.as_categorization().taxonomy()
    except requests.exceptions.RequestException as e:
        no_taxonomy_error = f"Project {project.name} is not associated with any taxonomy yet."
        LOGGER.error(no_taxonomy_error)
        raise RuntimeError(no_taxonomy_error) from e

    # obtain category paths, sorted for deterministic output order
    taxonomy_list = sorted(category.path for category in taxonomy.categories())

    # Open CSV file and use newline='' as recommended by
    # https://docs.python.org/3/library/csv.html#csv.writer
    try:
        f = open(export_file_path, "w", newline="", encoding="utf-8")
    except (FileNotFoundError, IOError, PermissionError) as e:
        cannot_open_error = f"File path {export_file_path} could not be opened for writing."
        LOGGER.error(cannot_open_error)
        raise IOError(cannot_open_error) from e

    # `with` guarantees exactly one close on success and failure alike
    # (previously the file was closed twice when csv.Error was raised)
    with f:
        try:
            csv_writer = csv.writer(
                f,
                delimiter=csv_delimiter,
                quotechar=quote_character,
                quoting=quoting,
            )
            csv_writer.writerows(taxonomy_list)
        except csv.Error as e:
            general_error = (
                "Encountered an error while writing taxonomy categories to "
                f"{export_file_path}: {e}")
            LOGGER.error(general_error)
            raise RuntimeError(general_error) from e

    records_written = len(taxonomy_list)
    LOGGER.info(
        f"Wrote {records_written} categories from {project.name} taxonomy (project id"
        f"={project.resource_id}) to {export_file_path}")
    return records_written