def _run_custom(project: Project, *, run_update_unified_dataset=False) -> List[Operation]:
    """Executes specified steps of a schema mapping project.

    Args:
        project: Target schema mapping project
        run_update_unified_dataset: Whether refresh should be called on the unified dataset

    Returns:
        The operations that were run

    Raises:
        TypeError: if the `project` is not a Schema Mapping project
    """
    if ProjectType[project.type] != ProjectType.SCHEMA_MAPPING_RECOMMENDATIONS:
        error_msg = f"Cannot use as a schema mapping project. Project type: {project.type}"
        LOGGER.error(error_msg)
        raise TypeError(error_msg)

    completed_operations = []
    if run_update_unified_dataset:
        LOGGER.info(
            f"Updating the unified dataset for project {project.name} (id={project.resource_id})."
        )
        op = project.unified_dataset().refresh()
        operation.enforce_success(op)
        completed_operations.append(op)

    return completed_operations
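
# Illustrative usage sketch (not part of the original module): run only the unified
# dataset refresh step. The pre-built `client` (a tamr_unify_client Client) and the
# project resource id "1" are placeholder assumptions.
def _example_update_schema_mapping_unified_dataset(client) -> List[Operation]:
    """Sketch: refresh the unified dataset of a schema mapping project."""
    project = client.projects.by_resource_id("1")  # hypothetical project id
    return _run_custom(project, run_update_unified_dataset=True)
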
def _run_custom(
    project: Project,
    *,
    run_update_golden_records: bool = False,
    run_publish_golden_records: bool = False,
) -> List[Operation]:
    """Executes specified steps of a golden records project.

    Args:
        project: The target golden records project
        run_update_golden_records: Whether refresh should be called on the draft golden records
            dataset
        run_publish_golden_records: Whether refresh should be called on the published golden
            records dataset

    Returns:
        The operations that were run

    Raises:
        TypeError: if the `project` is not a Golden Record project
    """
    version.enforce_after_or_equal(project.client, compare_version="2020.004.0")

    if ProjectType[project.type] != ProjectType.GOLDEN_RECORDS:
        error_msg = f"Cannot use as a golden records project. Project type: {project.type}"
        LOGGER.error(error_msg)
        raise TypeError(error_msg)

    completed_operations = []
    if run_update_golden_records:
        LOGGER.info(
            f"Updating the draft golden records for project {project.name} "
            f"(id={project.resource_id})."
        )
        resp = project.client.post(
            f"/api/versioned/v1/projects/{project.resource_id}/goldenRecords:refresh"
        ).successful()
        op = Operation.from_response(client=project.client, response=resp)
        op = op.wait()
        operation.enforce_success(op)
        completed_operations.append(op)
    if run_publish_golden_records:
        LOGGER.info(
            f"Publishing golden records for project {project.name} (id={project.resource_id})."
        )
        resp = project.client.post(
            f"/api/versioned/v1/projects/{project.resource_id}/publishedGoldenRecords:refresh"
            f"?validate=true&version=CURRENT"
        ).successful()
        op = Operation.from_response(client=project.client, response=resp)
        op = op.wait()
        operation.enforce_success(op)
        completed_operations.append(op)

    return completed_operations
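
# Illustrative usage sketch (not part of the original module): update the draft golden
# records and then publish them in one call. The `client` argument and the project id
# "2" are placeholder assumptions.
def _example_update_and_publish_golden_records(client) -> List[Operation]:
    """Sketch: run both golden records steps back to back."""
    project = client.projects.by_resource_id("2")  # hypothetical project id
    return _run_custom(
        project, run_update_golden_records=True, run_publish_golden_records=True
    )
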
def _run_custom(
    project: Project,
    *,
    run_update_unified_dataset=False,
    process_asynchronously: bool = False,
) -> List[Operation]:
    """Executes specified steps of a schema mapping project.

    Args:
        project: Target schema mapping project
        run_update_unified_dataset: Whether refresh should be called on the unified dataset
        process_asynchronously: If True, do not wait for the job to finish before returning;
            must be set to True for concurrent workflows

    Returns:
        The operations that were run

    Raises:
        TypeError: if the `project` is not a Schema Mapping project
    """
    if ProjectType[project.type] != ProjectType.SCHEMA_MAPPING_RECOMMENDATIONS:
        error_msg = f"Cannot use as a schema mapping project. Project type: {project.type}"
        LOGGER.error(error_msg)
        raise TypeError(error_msg)

    completed_operations = []
    if run_update_unified_dataset:
        LOGGER.info(
            f"Updating the unified dataset for project {project.name} (id={project.resource_id})."
        )
        op = project.unified_dataset().refresh(asynchronous=process_asynchronously)
        if not process_asynchronously:
            operation.enforce_success(op)
        completed_operations.append(op)

    return completed_operations
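
# Illustrative usage sketch (not part of the original module): submit unified dataset
# refreshes for several projects without blocking, then wait on the returned operations
# afterwards. The `client` argument and the project ids "3" and "4" are placeholder
# assumptions.
def _example_run_schema_mapping_concurrently(client) -> List[Operation]:
    """Sketch: kick off refreshes for two projects, then wait for both to succeed."""
    ops = []
    for project_id in ("3", "4"):  # hypothetical project ids
        project = client.projects.by_resource_id(project_id)
        ops.extend(
            _run_custom(
                project, run_update_unified_dataset=True, process_asynchronously=True
            )
        )
    # Wait for every submitted job and check that each one succeeded
    completed = [op.wait() for op in ops]
    for op in completed:
        operation.enforce_success(op)
    return completed
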
def _run_custom(
    project: CategorizationProject,
    *,
    run_update_unified_dataset: bool = False,
    run_apply_feedback: bool = False,
    run_update_results: bool = False,
) -> List[Operation]:
    """Executes specified steps of a categorization project.

    Args:
        project: The target categorization project
        run_update_unified_dataset: Whether refresh should be called on the unified dataset
        run_apply_feedback: Whether train should be called on the categorization model
        run_update_results: Whether predict should be called on the categorization model

    Returns:
        The operations that were run

    Raises:
        TypeError: if `project` is not a categorization project
    """
    if ProjectType[project.type] != ProjectType.CATEGORIZATION:
        error_msg = f"Cannot use as a categorization project. Project type: {project.type}"
        LOGGER.error(error_msg)
        raise TypeError(error_msg)
    else:
        project = project.as_categorization()

    completed_operations = []
    if run_update_unified_dataset:
        LOGGER.info(
            f"Updating the unified dataset for project {project.name} (id={project.resource_id})."
        )
        op = project.unified_dataset().refresh()
        operation.enforce_success(op)
        completed_operations.append(op)
    if run_apply_feedback:
        LOGGER.info(
            f"Applying feedback to the categorization model for project {project.name} "
            f"(id={project.resource_id})."
        )
        op = project.model().train()
        operation.enforce_success(op)
        completed_operations.append(op)
    if run_update_results:
        LOGGER.info(
            f"Updating categorization results for project {project.name} "
            f"(id={project.resource_id})."
        )
        op = project.model().predict()
        operation.enforce_success(op)
        completed_operations.append(op)

    return completed_operations
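
# Illustrative usage sketch (not part of the original module): run a full categorization
# update end to end. The `client` argument and the project id "5" are placeholder
# assumptions.
def _example_run_categorization_end_to_end(client) -> List[Operation]:
    """Sketch: refresh the unified dataset, apply feedback, and update results."""
    project = client.projects.by_resource_id("5")  # hypothetical project id
    return _run_custom(
        project,
        run_update_unified_dataset=True,
        run_apply_feedback=True,
        run_update_results=True,
    )
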
def get_tier_confidence(
    project: Project, *, tier: int = -1, allow_dataset_refresh: bool = False
) -> data_type.JsonDict:
    """Extracts tier-specific average confidence from the Tamr internal dataset
    `<unified dataset name>_classifications_average_confidences` and returns it as a dictionary.

    Args:
        project: Tamr project object
        tier: Integer specifying the tier for which to extract the average confidence;
            the default value of -1 returns the average confidence at all leaf categories
        allow_dataset_refresh: If True, allows running a job to refresh the dataset to make
            it streamable

    Returns:
        Dictionary whose keys are category paths (joined by '|' for multi-level taxonomies)
        and whose values are the average confidence of the corresponding categories

    Raises:
        RuntimeError: if the dataset is not streamable and `allow_dataset_refresh` is False
        TypeError: if `tier` is not of type int, or if the project type is not classification
        ValueError: if `tier` is less than -1 or equal to 0
    """
    LOGGER.info(
        f"Retrieving average confidence for taxonomy nodes in project {project.name} "
        f"(id={project.resource_id})."
    )
    # check that the project type is categorization
    try:
        project = project.as_categorization()
    except TypeError:
        not_categorization_error = f"Project {project.name} is not a classification project."
        LOGGER.error(not_categorization_error)
        raise TypeError(not_categorization_error)

    # check that the necessary dataset can be obtained
    dataset = _get_dataset_with_confidence(project)

    # check that tier is valid
    if type(tier) is not int:
        wrong_tier_type_error = f"Tier {tier} is not an integer."
        LOGGER.error(wrong_tier_type_error)
        raise TypeError(wrong_tier_type_error)
    if tier < -1 or tier == 0:
        invalid_tier_value_error = (
            f"Invalid value for tier {tier}. Tier cannot be 0 or less than -1."
        )
        LOGGER.error(invalid_tier_value_error)
        raise ValueError(invalid_tier_value_error)

    # check that the dataset can be streamed
    if not dataset.status().is_streamable:
        if allow_dataset_refresh:
            LOGGER.info(f"Refreshing dataset {dataset.name} to make streamable.")
            op = dataset.refresh()
            operation.enforce_success(op)
        else:
            cannot_stream_error = (
                f"Dataset {dataset.name} is not streamable. "
                f"Refresh it first, or run with allow_dataset_refresh=True"
            )
            LOGGER.error(cannot_stream_error)
            raise RuntimeError(cannot_stream_error)

    # check that the dataset contains the necessary attributes
    _check_dataset_with_confidence(dataset)

    # check that tier does not exceed the maximum taxonomy depth
    _check_taxonomy_depth(project, tier=tier)

    # obtain the categories at the requested tier
    selected_category_set = _get_categories_at_tier(project, tier=tier)

    # extract the average confidence
    tier_confidence_dict = _extract_confidence(
        dataset=dataset, category_set=selected_category_set
    )

    return tier_confidence_dict
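
# Illustrative usage sketch (not part of the original module): fetch average confidence
# at tier 2, refreshing the internal dataset if needed, and log the low-confidence
# categories. The `client` argument, the project id "6", and the threshold value are
# placeholder assumptions.
def _example_report_low_confidence_categories(client, threshold: float = 0.7) -> None:
    """Sketch: log tier-2 categories whose average confidence falls below a threshold."""
    project = client.projects.by_resource_id("6")  # hypothetical project id
    tier_confidence = get_tier_confidence(project, tier=2, allow_dataset_refresh=True)
    for category_path, confidence in tier_confidence.items():
        if confidence is not None and confidence < threshold:
            LOGGER.info(
                f"Low average confidence {confidence:.2f} for category {category_path}"
            )
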
def from_dataset(
    dataset: Dataset,
    *,
    columns: Optional[List[str]] = None,
    flatten_delimiter: Optional[str] = None,
    flatten_columns: Optional[List[str]] = None,
    force_flatten: bool = False,
    nrows: Optional[int] = None,
    allow_dataset_refresh: bool = False,
) -> "pandas.DataFrame":
    """Creates a DataFrame from a Tamr Dataset.

    Args:
        dataset: Tamr Dataset object
        columns: Optional, ordered list of columns to keep
        flatten_delimiter: If set, flatten list types to strings by concatenating with this
            delimiter
        flatten_columns: Optional, list of columns to flatten
        force_flatten: If False, arrays with inner types other than string will not be flattened.
            If True, all inner types are forced to strings when flattening values.
        nrows: Number of rows to read. The default None reads all rows
        allow_dataset_refresh: If True, allows running a job to refresh the dataset to make it
            streamable

    Returns:
        DataFrame

    Raises:
        RuntimeError: if `dataset` is not streamable and `allow_dataset_refresh` is False
        ValueError: if `columns` or `flatten_columns` contain columns that are not present
            in `dataset`
    """
    # This function requires pandas, an optional dependency
    import pandas

    LOGGER.info(
        f"Streaming records to DataFrame for dataset {dataset.name} (id={dataset.resource_id})."
    )
    dataset_attrs = [attr for attr in dataset.attributes]
    attr_names = [attr.name for attr in dataset_attrs]
    # check that specified columns exist
    if columns is not None:
        common._check_columns_subset(
            input_list=columns, reference_list=attr_names, raise_error=True
        )
    # checks on columns to flatten
    if flatten_delimiter is not None:
        if flatten_columns is None:
            flatten_columns = list(attr_names)
        else:
            # check that specified columns exist
            common._check_columns_subset(
                input_list=flatten_columns, reference_list=attr_names, raise_error=True
            )
        # check types of flatten_columns
        for attr in dataset_attrs:
            if attr.name not in flatten_columns:
                continue
            attr_type = attr.spec().to_dict()["type"]
            if (
                attr_type["baseType"] == "ARRAY"
                and attr_type["innerType"]["baseType"] != "STRING"
            ):
                if force_flatten:
                    LOGGER.info(
                        f"Will force attribute to string: {attr.name}, with type: {attr_type}"
                    )
                else:
                    LOGGER.warning(
                        f"Will not flatten attribute: {attr.name}, with type: {attr_type}"
                    )
                    flatten_columns.remove(attr.name)

    if not dataset.status().is_streamable:
        if allow_dataset_refresh:
            LOGGER.info(f"Refreshing dataset {dataset.name} to make streamable.")
            op = dataset.refresh()
            operation.enforce_success(op)
        else:
            message = (
                f"Dataset {dataset.name} is not streamable. Refresh it first, or run"
                f" with allow_dataset_refresh=True"
            )
            LOGGER.error(message)
            raise RuntimeError(message)

    # if flattening, set the function to apply to records as _flatten_list;
    # otherwise leave it as None
    func = None
    if flatten_delimiter is not None:
        func = partial(common._flatten_list, delimiter=flatten_delimiter, force=force_flatten)
    df = pandas.DataFrame.from_records(
        common._yield_records(
            dataset, func=func, columns=columns, flatten_columns=flatten_columns
        ),
        columns=columns,
        nrows=nrows,
    )
    return df
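
# Illustrative usage sketch (not part of the original module): stream a dataset into a
# DataFrame, flattening array columns with a '|' delimiter. The `client` argument, the
# dataset name "my_dataset", and the column names are placeholder assumptions.
def _example_dataset_to_dataframe(client) -> "pandas.DataFrame":
    """Sketch: load selected columns of a Tamr dataset into pandas."""
    dataset = client.datasets.by_name("my_dataset")  # hypothetical dataset name
    return from_dataset(
        dataset,
        columns=["tamr_id", "name", "address"],  # hypothetical column names
        flatten_delimiter="|",
        force_flatten=True,
        allow_dataset_refresh=True,
    )
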
def _run_custom(
    project: MasteringProject,
    *,
    run_update_unified_dataset: bool = False,
    run_estimate_pair_counts: bool = False,
    run_generate_pairs: bool = False,
    run_apply_feedback: bool = False,
    run_update_pair_results: bool = False,
    run_update_high_impact_pairs: bool = False,
    run_update_cluster_results: bool = False,
    run_publish_clusters: bool = False,
) -> List[Operation]:
    """Executes specified steps of a mastering project.

    Args:
        project: The target mastering project
        run_update_unified_dataset: Whether refresh should be called on the unified dataset
        run_estimate_pair_counts: Whether an estimate pairs job should be run
        run_generate_pairs: Whether refresh should be called on the pairs dataset
        run_apply_feedback: Whether train should be called on the pair matching model
        run_update_pair_results: Whether predict should be called on the pair matching model
        run_update_high_impact_pairs: Whether refresh should be called on the high impact pairs
            dataset
        run_update_cluster_results: Whether refresh should be called on the record clusters
            dataset
        run_publish_clusters: Whether refresh should be called on the published record clusters
            dataset

    Returns:
        The operations that were run

    Raises:
        TypeError: if the `project` is not a Mastering project
    """
    if ProjectType[project.type] != ProjectType.DEDUP:
        error_msg = f"Cannot use as a mastering project. Project type: {project.type}"
        LOGGER.error(error_msg)
        raise TypeError(error_msg)
    else:
        project = project.as_mastering()

    completed_operations = []
    if run_update_unified_dataset:
        LOGGER.info(
            f"Updating the unified dataset for project {project.name} (id={project.resource_id})."
        )
        op = project.unified_dataset().refresh()
        operation.enforce_success(op)
        completed_operations.append(op)
    if run_estimate_pair_counts:
        LOGGER.info(
            f"Estimate pair counts for project {project.name} (id={project.resource_id})."
        )
        op = project.estimate_pairs().refresh()
        operation.enforce_success(op)
        completed_operations.append(op)
    if run_generate_pairs:
        LOGGER.info(
            f"Generating pairs for project {project.name} (id={project.resource_id})."
        )
        op = project.pairs().refresh()
        operation.enforce_success(op)
        completed_operations.append(op)
    if run_apply_feedback:
        LOGGER.info(
            f"Applying feedback to the pairs model for project {project.name} "
            f"(id={project.resource_id})."
        )
        op = project.pair_matching_model().train()
        operation.enforce_success(op)
        completed_operations.append(op)
    if run_update_pair_results:
        LOGGER.info(
            f"Updating pair prediction results for project {project.name} "
            f"(id={project.resource_id})."
        )
        op = project.pair_matching_model().predict()
        operation.enforce_success(op)
        completed_operations.append(op)
    if run_update_high_impact_pairs:
        LOGGER.info(
            f"Refreshing high impact pairs for project {project.name} (id={project.resource_id})."
        )
        op = project.high_impact_pairs().refresh()
        operation.enforce_success(op)
        completed_operations.append(op)
    if run_update_cluster_results:
        LOGGER.info(
            f"Updating cluster prediction results for project {project.name} "
            f"(id={project.resource_id})."
        )
        op = project.record_clusters().refresh()
        operation.enforce_success(op)
        completed_operations.append(op)
    if run_publish_clusters:
        LOGGER.info(
            f"Publishing clusters for project {project.name} (id={project.resource_id})."
        )
        op = project.published_clusters().refresh()
        operation.enforce_success(op)
        completed_operations.append(op)

    return completed_operations
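
# Illustrative usage sketch (not part of the original module): run the mastering pipeline
# from the unified dataset refresh through publishing clusters, skipping the pair count
# estimate. The `client` argument and the project id "7" are placeholder assumptions.
def _example_run_mastering_end_to_end(client) -> List[Operation]:
    """Sketch: refresh, generate pairs, train, predict, cluster, and publish."""
    project = client.projects.by_resource_id("7")  # hypothetical project id
    return _run_custom(
        project,
        run_update_unified_dataset=True,
        run_generate_pairs=True,
        run_apply_feedback=True,
        run_update_pair_results=True,
        run_update_high_impact_pairs=True,
        run_update_cluster_results=True,
        run_publish_clusters=True,
    )
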
def from_dataset(
    dataset: Dataset,
    export_file_path: Union[Path, str],
    *,
    csv_delimiter: str = ",",
    columns: Optional[List[str]] = None,
    flatten_delimiter: str = "|",
    quote_character: str = '"',
    quoting: int = csv.QUOTE_MINIMAL,
    na_value: str = "NaN",
    nrows: Optional[int] = None,
    allow_dataset_refresh: bool = False,
    buffer_size: int = 10000,
    overwrite: bool = False,
) -> int:
    """Export a Tamr Dataset to a csv file. Records are streamed to disk and written according
    to a given buffer size. As a result this is more memory efficient than first reading to a
    pandas.DataFrame and writing to CSV.

    Args:
        dataset: Tamr Dataset object
        export_file_path: Path to the csv file where the dataset will be saved
        csv_delimiter: Delimiter of the csv file
        columns: Optional, ordered list of columns to write. If None, write all columns in
            arbitrary order.
        flatten_delimiter: Flatten list types to strings by concatenating with this delimiter
        quote_character: Character used to escape the csv delimiter when it appears in a value.
        quoting: The escape strategy to use according to the Python csv writer.
            See https://docs.python.org/2/library/csv.html#csv.QUOTE_MINIMAL
        na_value: Value to write that represents empty or missing data.
            See https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.read_csv.html
            for the na_values supported by default in pandas.read_csv
        nrows: Optional, number of rows to write. If None, then write all rows.
        allow_dataset_refresh: If True, allows running a job to refresh the dataset to make it
            streamable. Otherwise a RuntimeError will be thrown if the dataset is unstreamable.
        buffer_size: Number of records to store in memory before writing to disk
        overwrite: If True and `export_file_path` already exists, overwrite the file.
            Otherwise throw an error

    Returns:
        The total number of records written

    Raises:
        FileExistsError: if the csv file to which the dataset is to be streamed exists and
            `overwrite` is False
        RuntimeError: if `dataset` is not streamable and `allow_dataset_refresh` is False
        ValueError: if `columns` contains columns that are not present in `dataset`, or if
            `csv_delimiter` is identical to `flatten_delimiter`
    """
    LOGGER.info(
        f"Streaming records to csv file {export_file_path} from dataset {dataset.name} "
        f"(id={dataset.resource_id})."
    )

    if os.path.exists(export_file_path):
        if not overwrite:
            message = (
                f"CSV file {export_file_path} already exists. "
                f"(Set 'overwrite' flag to True if you wish to overwrite)"
            )
            LOGGER.error(message)
            raise FileExistsError(message)
        else:
            LOGGER.warning(f"CSV file {export_file_path} already exists and will be overwritten")

    if csv_delimiter == flatten_delimiter:
        message = (
            f"The CSV delimiter '{csv_delimiter}' cannot be identical to the list "
            f"flattening delimiter '{flatten_delimiter}'"
        )
        LOGGER.error(message)
        raise ValueError(message)

    attribute_names = [attr.name for attr in dataset.attributes]

    # check that specified columns exist
    if columns is not None:
        common._check_columns_subset(
            input_list=columns, reference_list=attribute_names, raise_error=True
        )

    if not dataset.status().is_streamable:
        if allow_dataset_refresh:
            LOGGER.info(f"Refreshing dataset {dataset.name} to make streamable.")
            op = dataset.refresh()
            operation.enforce_success(op)
        else:
            message = (
                f"Dataset {dataset.name} is not streamable. Refresh it first, or "
                f"run with allow_dataset_refresh=True"
            )
            LOGGER.error(message)
            raise RuntimeError(message)

    func = partial(common._flatten_list, delimiter=flatten_delimiter, force=True)

    # Open CSV file and use newline='' as recommended by
    # https://docs.python.org/3/library/csv.html#csv.writer
    with open(export_file_path, "w", newline="") as csv_file:
        csv_writer = csv.writer(
            csv_file, delimiter=csv_delimiter, quotechar=quote_character, quoting=quoting,
        )

        buffer = []
        header = None
        # Set record number to -1 in case the dataset streamed has no records
        record_number = -1
        for record_number, record in enumerate(
            common._yield_records(dataset, func=func, columns=columns)
        ):
            # Obtain and write the header information only on the first pass
            if header is None:
                header = record.keys() if columns is None else columns
                csv_writer.writerow(header)

            # Replace empty values with a specific null value
            # This also allows nulls to be treated differently from empty strings
            record = [na_value if record[k] is None else record[k] for k in header]
            buffer.append(record)

            at_max_buffer = buffer_size is not None and (len(buffer) >= buffer_size)
            at_max_rows = nrows is not None and record_number >= nrows - 1
            if at_max_buffer or at_max_rows:
                csv_writer.writerows(buffer)
                LOGGER.debug(f"Written dataset {dataset.name} up to record {record_number + 1}")
                buffer = []
                if at_max_rows:
                    break

        # Write anything remaining
        # This will occur whenever the buffer is non-zero and the number of records
        # is not exactly divisible by the buffer number
        # For example, writing a dataset with 1100 records using a buffer size of 500
        # will write in 3 chunks: 2 x 500 above and the remaining 100 handled here
        if len(buffer) != 0:
            LOGGER.debug(f"Written dataset {dataset.name} up to record {record_number + 1}")
            csv_writer.writerows(buffer)

        if record_number == -1:
            # If record number is -1 then no records were streamed, possibly because the dataset
            # has no records. We therefore want to simply save the headers
            if columns is not None:
                csv_writer.writerow(columns)
            else:
                csv_writer.writerow(attribute_names)

    records_written = record_number + 1

    LOGGER.info(
        f"Wrote {records_written} records from dataset {dataset.name} (id={dataset.resource_id}) "
        f"to {export_file_path}"
    )
    return records_written
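
# Illustrative usage sketch (not part of the original module): export a dataset to a
# tab-separated file with a larger write buffer. The `client` argument, the dataset name
# "my_dataset", and the output path are placeholder assumptions.
def _example_export_dataset_to_tsv(client) -> int:
    """Sketch: stream a Tamr dataset to disk as a .tsv file."""
    dataset = client.datasets.by_name("my_dataset")  # hypothetical dataset name
    return from_dataset(
        dataset,
        "/tmp/my_dataset.tsv",  # hypothetical output path
        csv_delimiter="\t",
        na_value="",
        allow_dataset_refresh=True,
        buffer_size=50000,
        overwrite=True,
    )
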