Example #1
def _run_custom(project: Project,
                *,
                run_update_unified_dataset: bool = False) -> List[Operation]:
    """Executes specified steps of a schema mapping project.

    Args:
        project: Target schema mapping project
        run_update_unified_dataset: Whether refresh should be called on the unified dataset

    Returns:
        The operations that were run

    Raises:
        TypeError: if the `project` is not a Schema Mapping project
    """
    if ProjectType[project.type] != ProjectType.SCHEMA_MAPPING_RECOMMENDATIONS:
        error_msg = f"Cannot use as a schema mapping project. Project type: {project.type}"
        LOGGER.error(error_msg)
        raise TypeError(error_msg)

    completed_operations = []
    if run_update_unified_dataset:
        LOGGER.info(
            f"Updating the unified dataset for project {project.name} (id={project.resource_id})."
        )
        op = project.unified_dataset().refresh()
        operation.enforce_success(op)
        completed_operations.append(op)

    return completed_operations
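A minimal usage sketch for the helper above (not part of the original source): it assumes a reachable Tamr instance and an existing schema mapping project, and the host, credentials, and project ID below are placeholders.

from tamr_unify_client import Client
from tamr_unify_client.auth import UsernamePasswordAuth

# Connect to Tamr (host and credentials are placeholders)
tamr = Client(UsernamePasswordAuth("user", "password"), host="localhost")

# Look up the schema mapping project by its resource id (ID is illustrative)
project = tamr.projects.by_resource_id("1")

# Refresh the unified dataset and collect the completed operations
ops = _run_custom(project, run_update_unified_dataset=True)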
Example #2
def _run_custom(
    project: Project,
    *,
    run_update_golden_records: bool = False,
    run_publish_golden_records: bool = False,
) -> List[Operation]:
    """Executes specified steps of a golden records project.

    Args:
        project: The target golden records project
        run_update_golden_records: Whether refresh should be called on the draft golden records
            dataset
        run_publish_golden_records: Whether refresh should be called on the published golden
            records dataset

    Returns:
        The operations that were run

    Raises:
        TypeError: if the `project` is not a Golden Record project
    """
    version.enforce_after_or_equal(project.client,
                                   compare_version="2020.004.0")

    if ProjectType[project.type] != ProjectType.GOLDEN_RECORDS:
        error_msg = f"Cannot use as a golden records project. Project type: {project.type}"
        LOGGER.error(error_msg)
        raise TypeError(error_msg)

    completed_operations = []
    if run_update_golden_records:
        LOGGER.info(
            f"Updating the draft golden records for project {project.name} "
            f"(id={project.resource_id}).")
        resp = project.client.post(
            f"/api/versioned/v1/projects/{project.resource_id}/goldenRecords:refresh"
        ).successful()
        op = Operation.from_response(client=project.client, response=resp)
        op = op.wait()
        operation.enforce_success(op)
        completed_operations.append(op)
    if run_publish_golden_records:
        LOGGER.info(
            f"Publishing golden records for project {project.name} (id={project.resource_id})."
        )
        resp = project.client.post(
            f"/api/versioned/v1/projects/{project.resource_id}/publishedGoldenRecords:refresh"
            f"?validate=true&version=CURRENT").successful()
        op = Operation.from_response(client=project.client, response=resp)
        op = op.wait()
        operation.enforce_success(op)
        completed_operations.append(op)
    return completed_operations
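A hedged usage sketch: assuming a connected Client `tamr` as in the sketch after Example #1 and a golden records project, both steps can be requested in a single call. The project ID is illustrative.

project = tamr.projects.by_resource_id("2")  # illustrative golden records project

# Update the draft golden records, then publish them
ops = _run_custom(
    project,
    run_update_golden_records=True,
    run_publish_golden_records=True,
)
assert all(op.succeeded() for op in ops)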
Example #3
def _run_custom(project: Project,
                *,
                run_update_unified_dataset: bool = False,
                process_asynchronously: bool = False) -> List[Operation]:
    """Executes specified steps of a schema mapping project.

    Args:
        project: Target schema mapping project
        run_update_unified_dataset: Whether refresh should be called on the unified dataset
        process_asynchronously: Whether to return without waiting for the job to finish;
            must be set to True for concurrent workflows

    Returns:
        The operations that were run

    Raises:
        TypeError: if the `project` is not a Schema Mapping project
    """
    if ProjectType[project.type] != ProjectType.SCHEMA_MAPPING_RECOMMENDATIONS:
        error_msg = f"Cannot use as a schema mapping project. Project type: {project.type}"
        LOGGER.error(error_msg)
        raise TypeError(error_msg)

    completed_operations = []
    if run_update_unified_dataset:
        LOGGER.info(
            f"Updating the unified dataset for project {project.name} (id={project.resource_id})."
        )
        op = project.unified_dataset().refresh(
            asynchronous=process_asynchronously)

        if not process_asynchronously:
            operation.enforce_success(op)
        completed_operations.append(op)

    return completed_operations
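A usage sketch for the asynchronous variant, assuming a connected Client `tamr` as above: the refresh is submitted without blocking, and the caller waits on the returned operations later, mirroring what the synchronous path does internally. The project ID is illustrative.

project = tamr.projects.by_resource_id("1")  # illustrative schema mapping project

# Submit the unified dataset refresh without waiting for it to finish
ops = _run_custom(
    project,
    run_update_unified_dataset=True,
    process_asynchronously=True,
)

# ... do other work, then wait for the submitted operations and check them
for op in ops:
    operation.enforce_success(op.wait())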
Example #4
def _run_custom(
    project: CategorizationProject,
    *,
    run_update_unified_dataset: bool = False,
    run_apply_feedback: bool = False,
    run_update_results: bool = False,
) -> List[Operation]:
    """Executes specified steps of a categorization project.

    Args:
        project: The target categorization project
        run_update_unified_dataset: Whether refresh should be called on the unified dataset
        run_apply_feedback: Whether train should be called on the categorization model
        run_update_results: Whether predict should be called on the categorization model

    Returns:
        Operations that were run

    Raises:
        TypeError: if `project` is not a categorization project
    """
    if ProjectType[project.type] != ProjectType.CATEGORIZATION:
        error_msg = f"Cannot use as a categorization project. Project type: {project.type}"
        LOGGER.error(error_msg)
        raise TypeError(error_msg)
    else:
        project = project.as_categorization()

    completed_operations = []
    if run_update_unified_dataset:
        LOGGER.info(
            f"Updating the unified dataset for project {project.name} (id={project.resource_id})."
        )
        op = project.unified_dataset().refresh()
        operation.enforce_success(op)
        completed_operations.append(op)
    if run_apply_feedback:
        LOGGER.info(
            f"Applying feedback to the categorization model for project {project.name} "
            f"(id={project.resource_id})."
        )
        op = project.model().train()
        operation.enforce_success(op)
        completed_operations.append(op)
    if run_update_results:
        LOGGER.info(
            f"Updating categorization results for project {project.name} "
            f"(id={project.resource_id})."
        )
        op = project.model().predict()
        operation.enforce_success(op)
        completed_operations.append(op)

    return completed_operations
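A short usage sketch, assuming a connected Client `tamr` and a categorization project; enabling all three flags runs a full end-to-end update. The project ID is illustrative.

project = tamr.projects.by_resource_id("3")  # illustrative categorization project

ops = _run_custom(
    project,
    run_update_unified_dataset=True,
    run_apply_feedback=True,
    run_update_results=True,
)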
Example #5
def get_tier_confidence(
        project: Project,
        *,
        tier: int = -1,
        allow_dataset_refresh: bool = False) -> data_type.JsonDict:
    """
    Extracts tier-specific average confidence from the Tamr internal dataset
    `<unified dataset name>_classifications_average_confidences` and returns it as a dictionary.

    Args:
        project: Tamr project object
        tier: integer specifying the tier for which to extract the average confidence;
              the default value of -1 returns the average confidence at all leaf categories
        allow_dataset_refresh: if True, allows running a job to refresh dataset to make it
                               streamable

    Returns:
        Dictionary whose keys are category paths (joined by '|' for multi-level taxonomies) and
        whose values are the average confidence of the corresponding category

    Raises:
        RuntimeError: if `dataset` is not streamable and `allow_dataset_refresh` is False
        TypeError: if `tier` is not of type int,
                   or if the project type is not classification
        ValueError: if `tier` is less than -1 or equal to 0
    """
    LOGGER.info(
        f"Retrieving average confidence for taxonomy nodes in project {project.name} "
        f"(id={project.resource_id}).")
    # check project type is categorization
    try:
        project = project.as_categorization()
    except TypeError:
        not_categorization_error = f"Project {project.name} is not a classification project."
        LOGGER.error(not_categorization_error)
        raise TypeError(not_categorization_error)

    # check necessary dataset can be obtained
    dataset = _get_dataset_with_confidence(project)

    # check tier is valid
    if type(tier) is not int:
        wrong_tier_type_error = f"Tier {tier} is not an integer."
        LOGGER.error(wrong_tier_type_error)
        raise TypeError(wrong_tier_type_error)
    if tier < -1 or tier == 0:
        invalid_tier_value_error = (
            f"Invalid value for tier {tier}. Tier cannot be 0 or less than -1."
        )
        LOGGER.error(invalid_tier_value_error)
        raise ValueError(invalid_tier_value_error)

    # check dataset can be streamed
    if not dataset.status().is_streamable:
        if allow_dataset_refresh:
            LOGGER.info(
                f"Refreshing dataset {dataset.name} to make streamable.")
            op = dataset.refresh()
            operation.enforce_success(op)
        else:
            cannot_stream_error = (
                f"Dataset {dataset.name} is not streamable. "
                f"Refresh it first, or run with allow_dataset_refresh=True")
            LOGGER.error(cannot_stream_error)
            raise RuntimeError(cannot_stream_error)

    # check dataset contains necessary attributes
    _check_dataset_with_confidence(dataset)

    # check tier does not exceed maximum taxonomy depth
    _check_taxonomy_depth(project, tier=tier)

    # obtain categories at tier
    selected_category_set = _get_categories_at_tier(project, tier=tier)

    # extract average confidence
    tier_confidence_dict = _extract_confidence(
        dataset=dataset, category_set=selected_category_set)
    return tier_confidence_dict
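A usage sketch, assuming a connected Client `tamr` and a classification project whose taxonomy is at least two levels deep; the tier value and project ID are illustrative.

project = tamr.projects.by_resource_id("3")  # illustrative classification project

# Average confidence per category at tier 2, refreshing the internal dataset if needed
tier_2_confidence = get_tier_confidence(project, tier=2, allow_dataset_refresh=True)

# Average confidence at all leaf categories (the default tier of -1)
leaf_confidence = get_tier_confidence(project)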
Example #6
def from_dataset(
    dataset: Dataset,
    *,
    columns: Optional[List[str]] = None,
    flatten_delimiter: Optional[str] = None,
    flatten_columns: Optional[List[str]] = None,
    force_flatten: bool = False,
    nrows: Optional[int] = None,
    allow_dataset_refresh: bool = False,
) -> "pandas.DataFrame":
    """
    Creates a DataFrame from a Tamr Dataset

    Args:
        dataset: Tamr Dataset object
        columns: optional, ordered list of columns to keep
        flatten_delimiter: if set, flatten list types to strings by concatenating with this
            delimiter
        flatten_columns: optional, list of columns to flatten
        force_flatten: if False, arrays with inner types other than string will not be flattened;
            if True, all inner types are forced to strings when flattening values
        nrows: number of rows to read. default None will read all rows
        allow_dataset_refresh: if True, allows running a job to refresh dataset to make streamable

    Returns:
        DataFrame

    Raises:
        ValueError: if `columns` or `flatten_columns` contain columns that are not present in
            `dataset`
    """
    # This function requires pandas, an optional dependency
    import pandas

    LOGGER.info(
        f"Streaming records to DataFrame for dataset {dataset.name} (id={dataset.resource_id})."
    )
    dataset_attrs = [attr for attr in dataset.attributes]
    attr_names = [attr.name for attr in dataset_attrs]
    # check that specified columns exist
    if columns is not None:
        common._check_columns_subset(
            input_list=columns, reference_list=attr_names, raise_error=True
        )
    # checks on columns to flatten
    if flatten_delimiter is not None:
        if flatten_columns is None:
            flatten_columns = list(attr_names)
        else:
            # check that specified columns exist
            common._check_columns_subset(
                input_list=flatten_columns, reference_list=attr_names, raise_error=True
            )
        # check types of flatten_columns
        for attr in dataset_attrs:
            if attr.name not in flatten_columns:
                continue
            attr_type = attr.spec().to_dict()["type"]
            if attr_type["baseType"] == "ARRAY" and attr_type["innerType"]["baseType"] != "STRING":
                if force_flatten:
                    LOGGER.info(
                        f"Will force attribute to string: {attr.name}, with type: {attr_type}"
                    )
                else:
                    LOGGER.warning(
                        f"Will not flatten attribute: {attr.name}, with type: {attr_type}"
                    )
                    flatten_columns.remove(attr.name)

    if not dataset.status().is_streamable:
        if allow_dataset_refresh:
            LOGGER.info(f"Refreshing dataset {dataset.name} to make streamable.")
            op = dataset.refresh()
            operation.enforce_success(op)
        else:
            message = (
                f"Dataset {dataset.name} is not streamable. Refresh it first, or run"
                f" with allow_dataset_refresh=True"
            )
            LOGGER.error(message)
            raise RuntimeError(message)

    # if flattening, set the function to apply to records as _flatten_list
    # otherwise set as _identity
    func = None
    if flatten_delimiter is not None:
        func = partial(common._flatten_list, delimiter=flatten_delimiter, force=force_flatten)
    df = pandas.DataFrame.from_records(
        common._yield_records(
            dataset, func=func, columns=columns, flatten_columns=flatten_columns
        ),
        columns=columns,
        nrows=nrows,
    )
    return df
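A usage sketch, assuming pandas is installed, a connected Client `tamr`, and that the dataset and column names exist on the target instance; all names here are placeholders.

dataset = tamr.datasets.by_name("my_dataset")  # placeholder dataset name

# Read the first 1000 rows of two columns, flattening array values with '|'
df = from_dataset(
    dataset,
    columns=["primary_key", "name"],  # placeholder column names
    flatten_delimiter="|",
    force_flatten=True,
    nrows=1000,
    allow_dataset_refresh=True,
)
print(df.head())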
Example #7
def _run_custom(
    project: MasteringProject,
    *,
    run_update_unified_dataset: bool = False,
    run_estimate_pair_counts: bool = False,
    run_generate_pairs: bool = False,
    run_apply_feedback: bool = False,
    run_update_pair_results: bool = False,
    run_update_high_impact_pairs: bool = False,
    run_update_cluster_results: bool = False,
    run_publish_clusters: bool = False,
) -> List[Operation]:
    """Executes specified steps of a mastering project.

    Args:
        project: The target mastering project
        run_update_unified_dataset: Whether refresh should be called on the unified dataset
        run_estimate_pair_counts: Whether an estimate pairs job should be run
        run_generate_pairs: Whether refresh should be called on the pairs dataset
        run_apply_feedback: Whether train should be called on the pair matching model
        run_update_pair_results: Whether predict should be called on the pair matching model
        run_update_high_impact_pairs: Whether refresh should be called on the high impact pairs
            dataset
        run_update_cluster_results: Whether refresh should be called on the record clusters dataset
        run_publish_clusters: Whether refresh should be called on the published record clusters
            dataset

    Returns:
        The operations that were run

    Raises:
        TypeError: if the `project` is not a Mastering project
    """
    if ProjectType[project.type] != ProjectType.DEDUP:
        error_msg = f"Cannot use as a mastering project. Project type: {project.type}"
        LOGGER.error(error_msg)
        raise TypeError(error_msg)
    else:
        project = project.as_mastering()

    completed_operations = []
    if run_update_unified_dataset:
        LOGGER.info(
            f"Updating the unified dataset for project {project.name} (id={project.resource_id})."
        )
        op = project.unified_dataset().refresh()
        operation.enforce_success(op)
        completed_operations.append(op)
    if run_estimate_pair_counts:
        LOGGER.info(
            f"Estimate pair counts for project {project.name} (id={project.resource_id})."
        )
        op = project.estimate_pairs().refresh()
        operation.enforce_success(op)
        completed_operations.append(op)
    if run_generate_pairs:
        LOGGER.info(
            f"Generating pairs for project {project.name} (id={project.resource_id})."
        )
        op = project.pairs().refresh()
        operation.enforce_success(op)
        completed_operations.append(op)
    if run_apply_feedback:
        LOGGER.info(
            f"Applying feedback to the pairs model for project {project.name} "
            f"(id={project.resource_id}).")
        op = project.pair_matching_model().train()
        operation.enforce_success(op)
        completed_operations.append(op)
    if run_update_pair_results:
        LOGGER.info(
            f"Updating pair prediction results for project {project.name} "
            f"(id={project.resource_id}).")
        op = project.pair_matching_model().predict()
        operation.enforce_success(op)
        completed_operations.append(op)
    if run_update_high_impact_pairs:
        LOGGER.info(
            f"Refreshing high impact pairs for project {project.name} (id={project.resource_id})."
        )
        op = project.high_impact_pairs().refresh()
        operation.enforce_success(op)
        completed_operations.append(op)
    if run_update_cluster_results:
        LOGGER.info(
            f"Updating cluster prediction results for project {project.name} "
            f"(id={project.resource_id}).")
        op = project.record_clusters().refresh()
        operation.enforce_success(op)
        completed_operations.append(op)
    if run_publish_clusters:
        LOGGER.info(
            f"Publishing clusters for project {project.name} (id={project.resource_id})."
        )
        op = project.published_clusters().refresh()
        operation.enforce_success(op)
        completed_operations.append(op)

    return completed_operations
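A usage sketch, assuming a connected Client `tamr` and a mastering project; enabling every flag runs the full pipeline from unified dataset refresh through cluster publication. The project ID and the choice to skip training are illustrative.

project = tamr.projects.by_resource_id("4")  # illustrative mastering project

ops = _run_custom(
    project,
    run_update_unified_dataset=True,
    run_estimate_pair_counts=True,
    run_generate_pairs=True,
    run_apply_feedback=False,  # skip training when no new pair labels exist
    run_update_pair_results=True,
    run_update_high_impact_pairs=True,
    run_update_cluster_results=True,
    run_publish_clusters=True,
)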
Example #8
def from_dataset(
    dataset: Dataset,
    export_file_path: Union[Path, str],
    *,
    csv_delimiter: str = ",",
    columns: Optional[List[str]] = None,
    flatten_delimiter: str = "|",
    quote_character: str = '"',
    quoting: int = csv.QUOTE_MINIMAL,
    na_value: str = "NaN",
    nrows: Optional[int] = None,
    allow_dataset_refresh: bool = False,
    buffer_size: int = 10000,
    overwrite: bool = False,
) -> int:
    """
    Export a Tamr Dataset to a csv file. Records are streamed to disk and written according to a
    given buffer size. As a result, this is more memory-efficient than first reading into a
    pandas.DataFrame and then writing to CSV.

    Args:
        dataset: Tamr Dataset object
        export_file_path: Path to the csv file where the dataset will be saved
        csv_delimiter: Delimiter of the csv file
        columns: Optional, Ordered list of columns to write. If None, write all columns in
            arbitrary order.
        flatten_delimiter: Flatten list types to strings by concatenating with this delimiter
        quote_character: Character used to escape value for csv delimiter when it appears in the
            value.
        quoting: The escape strategy to use according to the Python csv writer.
            See https://docs.python.org/2/library/csv.html#csv.QUOTE_MINIMAL
        na_value: Value to write that represents empty or missing data.
            See https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.read_csv.html
            for the na_values supported by default in pandas.read_csv
        nrows: Optional, Number of rows to write. If None, then write all rows.
        allow_dataset_refresh: If True, allows running a job to refresh dataset to make streamable.
            Otherwise a RuntimeError will be thrown if the dataset is unstreamable.
        buffer_size: Number of records to store in memory before writing to disk
        overwrite: if True and the file at `export_file_path` already exists, overwrite it.
            Otherwise throw an error

    Returns:
        The total number of records written

    Raises:
        FileExistsError: if the csv file to which the dataset is to be streamed exists
            and `overwrite` is False
        RuntimeError: if `dataset` is not streamable and `allow_dataset_refresh` is False
        ValueError: if `columns` or `flatten_columns` contain columns that are not
            present in `dataset`
    """
    LOGGER.info(
        f"Streaming records to csv file {export_file_path} from dataset {dataset.name} "
        f"(id={dataset.resource_id}).")

    if os.path.exists(export_file_path):
        if not overwrite:
            message = (
                f"CSV file {export_file_path} already exists. "
                f"(Set 'overwrite' flag to True if you wish to overwrite)")
            LOGGER.error(message)
            raise FileExistsError(message)
        else:
            LOGGER.warning(
                f"CSV file {export_file_path} already exists and will be overwritten"
            )

    if csv_delimiter == flatten_delimiter:
        message = (
            f"The CSV delimiter '{csv_delimiter}' cannot be identical to the list "
            f"flattening delimiter '{flatten_delimiter}'")
        LOGGER.error(message)
        raise ValueError(message)

    attribute_names = [attr.name for attr in dataset.attributes]

    # check that specified columns exist
    if columns is not None:
        common._check_columns_subset(input_list=columns,
                                     reference_list=attribute_names,
                                     raise_error=True)

    if not dataset.status().is_streamable:
        if allow_dataset_refresh:
            LOGGER.info(
                f"Refreshing dataset {dataset.name} to make streamable.")
            op = dataset.refresh()
            operation.enforce_success(op)
        else:
            message = (
                f"Dataset {dataset.name} is not streamable. Refresh it first, or "
                f"run with allow_dataset_refresh=True")
            LOGGER.error(message)
            raise RuntimeError(message)

    func = partial(common._flatten_list,
                   delimiter=flatten_delimiter,
                   force=True)

    # Open CSV file and use newline='' as recommended by
    # https://docs.python.org/3/library/csv.html#csv.writer
    with open(export_file_path, "w", newline="") as csv_file:
        csv_writer = csv.writer(
            csv_file,
            delimiter=csv_delimiter,
            quotechar=quote_character,
            quoting=quoting,
        )
        buffer = []
        header = None
        # Set record number to -1 in case the dataset streamed has no records
        record_number = -1

        for record_number, record in enumerate(
                common._yield_records(dataset, func=func, columns=columns)):
            # Obtain and write the header information only on the first pass
            if header is None:
                header = record.keys() if columns is None else columns
                csv_writer.writerow(header)

            # Replace empty values with a specific null value
            # This also allows nulls to be treated differently from empty strings
            record = [
                na_value if record[k] is None else record[k] for k in header
            ]
            buffer.append(record)

            at_max_buffer = buffer_size is not None and (len(buffer) >=
                                                         buffer_size)
            at_max_rows = nrows is not None and record_number >= nrows - 1
            if at_max_buffer or at_max_rows:
                csv_writer.writerows(buffer)
                LOGGER.debug(
                    f"Written dataset {dataset.name} up to record {record_number+1}"
                )
                buffer = []
                if at_max_rows:
                    break

        # Write anything remaining
        # This will occur whenever the buffer is non-zero and the number of records
        # is not exactly divisible by the buffer number
        # For example, writing a dataset with 1100 records using a buffer size of 500
        # will write in 3 chunks: 2 x 500 above and the remaining 100 handled here
        if len(buffer) != 0:
            LOGGER.debug(
                f"Written dataset {dataset.name} up to record {record_number + 1}"
            )
            csv_writer.writerows(buffer)

        if record_number == -1:
            # If record number is -1 then no records were streamed, possibly because the dataset
            # has no records. We therefore want to simply save the headers
            if columns is not None:
                csv_writer.writerow(columns)
            else:
                csv_writer.writerow(attribute_names)

    records_written = record_number + 1

    LOGGER.info(
        f"Wrote {records_written} from dataset {dataset.name} (id={dataset.resource_id}) "
        f"to {export_file_path}")

    return records_written
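A usage sketch, assuming a connected Client `tamr` and write access to the target path; the dataset name, export path, and delimiter choices are placeholders.

from pathlib import Path

dataset = tamr.datasets.by_name("my_dataset")  # placeholder dataset name

records_written = from_dataset(
    dataset,
    Path("/tmp/my_dataset.csv"),  # placeholder export path
    csv_delimiter=",",
    flatten_delimiter="|",
    allow_dataset_refresh=True,
    overwrite=True,
)
print(f"Wrote {records_written} records")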