def _get_upstream_projects(project: Project) -> List[Project]:
    """Get projects immediately upstream of a given project

    Args:
        project: the project to check

    Returns:
        A list of projects immediately upstream of the given project
    """
    client = project.client

    # find upstream datasets - if GR project just get input datasets
    if ProjectType[project.type] == ProjectType.GOLDEN_RECORDS:
        upstream_datasets = [x for x in project.input_datasets().stream()]
    # else find the upstream datasets of the UD (not input datasets to capture datasets used in Tx)
    else:
        unified_dataset_id = project.unified_dataset().relative_id
        unified_dataset = client.datasets.by_relative_id(unified_dataset_id)
        upstream_datasets = unified_dataset.upstream_datasets()

    upstream_project_names = []
    # walk through upstream datasets
    for upstream_result in upstream_datasets:
        # get the upstream object as a dataset
        upstream_dataset = client.datasets.by_resource_id(upstream_result.resource_id)
        # see if it is the output of a project and if so add to the list
        upstream_dataset_projects = set(
            x.project_name for x in upstream_dataset.usage().usage.output_from_project_steps
        )
        upstream_project_names.extend([x for x in upstream_dataset_projects])

    return [client.projects.by_name(x) for x in upstream_project_names]
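# A minimal sketch (not part of the library) showing how _get_upstream_projects
# could be used to walk an entire upstream chain. The host, credentials, and
# project name are hypothetical placeholders.
from tamr_unify_client import Client
from tamr_unify_client.auth import UsernamePasswordAuth

client = Client(UsernamePasswordAuth("username", "password"), host="localhost")
project = client.projects.by_name("my_mastering_project")

# breadth-first walk: collect all transitive upstream projects by name
to_visit = _get_upstream_projects(project)
seen = set()
while to_visit:
    upstream = to_visit.pop()
    if upstream.name not in seen:
        seen.add(upstream.name)
        to_visit.extend(_get_upstream_projects(upstream))
print(f"Projects upstream of {project.name}: {sorted(seen)}")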
def test_update_project(self):
    def create_callback(request, snoop):
        snoop["payload"] = request.body
        return 200, {}, json.dumps(self._updated_project_json)

    project_url = "http://localhost:9100/api/versioned/v1/projects/1"
    snoop_dict = {}
    responses.add_callback(
        responses.PUT, project_url, partial(create_callback, snoop=snoop_dict)
    )

    project = Project(self.tamr, self.project_json[0])
    temp_spec = project.spec().with_name(self._updated_project_json["name"])
    new_project = (
        temp_spec.with_description(self._updated_project_json["description"])
        .with_external_id(self._updated_project_json["externalId"])
        .put()
    )
    self.assertEqual(new_project.name, self._updated_project_json["name"])
    self.assertEqual(new_project.description, self._updated_project_json["description"])
    self.assertEqual(new_project.external_id, self._updated_project_json["externalId"])
    self.assertEqual(json.loads(snoop_dict["payload"]), self._updated_project_json)

    self.assertEqual(project.name, self.project_json[0]["name"])
    self.assertEqual(project.description, self.project_json[0]["description"])
    self.assertEqual(project.external_id, self.project_json[0]["externalId"])

    # test that intermediate didn't change
    self.assertEqual(
        temp_spec.to_dict()["description"], self.project_json[0]["description"]
    )
def test_delete_published_clusters_configuration(self):
    path = "projects/1/publishedClustersConfiguration"
    config_url = f"{self._base_url}/{path}"
    responses.add(responses.GET, config_url, json=self._config_json)
    responses.add(responses.DELETE, config_url, status=405)

    p = Project(self.tamr, self._project_config_json).as_mastering()
    config = p.published_clusters_configuration()
    self.assertRaises(HTTPError, config.delete)
def test_published_clusters_configuration(self):
    path = "projects/1/publishedClustersConfiguration"
    config_url = f"{self._base_url}/{path}"
    responses.add(responses.GET, config_url, json=self._config_json)

    p = Project(self.tamr, self._project_config_json).as_mastering()
    config = p.published_clusters_configuration()
    created = PublishedClustersConfiguration.from_json(self.tamr, self._config_json, path)
    self.assertEqual(repr(config), repr(created))
    self.assertEqual(config.versions_time_to_live, self._config_json["versionsTimeToLive"])
def test_delete(self):
    url = "http://localhost:9100/api/versioned/v1/projects/1/taxonomy"
    responses.add(responses.GET, url, json=self._taxonomy_json)
    responses.add(responses.DELETE, url, status=204)
    responses.add(responses.GET, url, status=404)

    project = Project(
        self.tamr, {"type": "CATEGORIZATION"}, "projects/1"
    ).as_categorization()
    taxonomy = project.taxonomy()
    self.assertEqual(taxonomy._data, self._taxonomy_json)

    response = taxonomy.delete()
    self.assertEqual(response.status_code, 204)
    self.assertRaises(HTTPError, project.taxonomy)
def _get_categories_at_tier(project: Project, *, tier: int) -> set:
    """Extracts categories at a given tier from the taxonomy associated with a project

    Args:
        project: Tamr project object
        tier: integer specifying the tier from which to extract the categories;
            -1 will return all leaf categories

    Returns:
        set of category paths at tier, joined by '|' if multi-level taxonomy
    """
    classification_project = project.as_categorization()
    taxonomy = classification_project.taxonomy()
    categories = taxonomy.categories()

    category_set = set()
    if tier > 0:
        for category in categories:
            if len(category.path) == tier:
                category_set.add("|".join(category.path))
    else:
        # leaf nodes
        category_set = _create_leaf_node_set(taxonomy)
    return category_set
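# Tiny self-contained illustration (hypothetical paths, not a live Tamr
# taxonomy) of the tier filtering and '|' joining performed above.
example_paths = [["Food"], ["Food", "Dairy"], ["Food", "Dairy", "Cheese"]]
tier_2 = {"|".join(path) for path in example_paths if len(path) == 2}
assert tier_2 == {"Food|Dairy"}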
def _check_taxonomy_depth(project: Project, *, tier: int) -> None:
    """Checks that the given tier does not exceed the maximum depth of the taxonomy
    associated with a project

    Args:
        project: Tamr project object
        tier: integer specifying the tier from which to extract categories

    Returns:
        None

    Raises:
        ValueError: if tier is greater than maximum taxonomy depth
    """
    # depth check is not required for leaf nodes
    if tier == -1:
        return

    max_depth = 0
    classification_project = project.as_categorization()
    taxonomy = classification_project.taxonomy()
    categories = taxonomy.categories()
    for category in categories:
        if len(category.path) > max_depth:
            max_depth = len(category.path)

    if max_depth < tier:
        invalid_tier_value_error = (
            f"Invalid value for tier {tier}. Maximum depth detected is {max_depth}."
        )
        LOGGER.error(invalid_tier_value_error)
        raise ValueError(invalid_tier_value_error)
def _run_custom(project: Project, *, run_update_unified_dataset=False) -> List[Operation]:
    """Executes specified steps of a schema mapping project.

    Args:
        project: Target schema mapping project
        run_update_unified_dataset: Whether refresh should be called on the unified dataset

    Returns:
        The operations that were run

    Raises:
        TypeError: if the `project` is not a Schema Mapping project
    """
    if ProjectType[project.type] != ProjectType.SCHEMA_MAPPING_RECOMMENDATIONS:
        error_msg = f"Cannot use as a schema mapping project. Project type: {project.type}"
        LOGGER.error(error_msg)
        raise TypeError(error_msg)

    completed_operations = []
    if run_update_unified_dataset:
        LOGGER.info(
            f"Updating the unified dataset for project {project.name} (id={project.resource_id})."
        )
        op = project.unified_dataset().refresh()
        operation.enforce_success(op)
        completed_operations.append(op)
    return completed_operations
def test_refresh_ids(self):
    unified_dataset_url = f"{self._base_url}/projects/1/unifiedDataset"
    datasets_url = f"{self._base_url}/datasets"
    refresh_url = f"{self._base_url}/projects/1/allPublishedClusterIds:refresh"
    responses.add(responses.GET, unified_dataset_url, json=self._unified_dataset_json)
    responses.add(responses.GET, datasets_url, json=self._datasets_json)
    responses.add(responses.POST, refresh_url, json=self._operations_json)

    p = Project(self.tamr, self._project_config_json).as_mastering()
    d = p.published_cluster_ids()
    op = d.refresh(poll_interval_seconds=0)
    self.assertEqual(op.resource_id, self._operations_json["id"])
    self.assertTrue(op.succeeded())
def bootstrap_dataset(
    project: Project, *, source_dataset: Dataset, force_add_dataset_to_project: bool = False
) -> List[AttributeMapping]:
    """Bootstraps a dataset (i.e. maps all source columns to themselves)

    Args:
        source_dataset: the source dataset (a Dataset object not a string)
        project: the project in which to do the mapping
        force_add_dataset_to_project: whether to add the dataset to the project
            if it is not already a part of it

    Returns:
        List of the AttributeMappings generated

    Raises:
        RuntimeError: if `source_dataset` is not part of the given `project`;
            set the 'force_add_dataset_to_project' flag to True to automatically add it
    """
    # check if dataset is in the project - python doesn't handle comparison of Dataset objects
    # well so check on name
    if source_dataset.name not in [x.name for x in project.input_datasets()]:
        if force_add_dataset_to_project:
            LOGGER.info(f"adding dataset {source_dataset.name} to project {project.name}")
            project.add_input_dataset(source_dataset)
        else:
            raise RuntimeError(
                f"dataset {source_dataset.name} not in project {project.name}! "
                "Set 'force_add_dataset_to_project' flag to True to automatically add it"
            )

    # map each attribute to itself
    source_dataset_name = source_dataset.name
    completed_mappings = []
    for attribute in source_dataset.attributes:
        attribute_name = attribute.name
        mapping = map_attribute(
            source_attribute_name=attribute_name,
            source_dataset_name=source_dataset_name,
            unified_attribute_name=attribute_name,
            project=project,
        )
        completed_mappings.append(mapping)
    return completed_mappings
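# A hedged usage sketch: bootstrapping a newly added source dataset into a
# schema mapping project. The host, credentials, and names are placeholders.
from tamr_unify_client import Client
from tamr_unify_client.auth import UsernamePasswordAuth

client = Client(UsernamePasswordAuth("username", "password"), host="localhost")
project = client.projects.by_name("my_schema_mapping_project")
dataset = client.datasets.by_name("my_source_dataset")
mappings = bootstrap_dataset(
    project, source_dataset=dataset, force_add_dataset_to_project=True
)
print(f"Created {len(mappings)} self-mappings for {dataset.name}")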
def test_project_remove_input_dataset(self):
    dataset_id = self.dataset_json[0]["relativeId"]
    responses.add(responses.GET, self.input_datasets_url, json=self.dataset_json)
    responses.add(
        responses.DELETE, f"{self.input_datasets_url}?id={dataset_id}", status=204
    )
    responses.add(responses.GET, self.input_datasets_url, json=[])

    project = Project(self.tamr, self.project_json[0])
    dataset = next(project.input_datasets().stream())
    response = project.remove_input_dataset(dataset)
    self.assertEqual(response.status_code, 204)

    input_datasets = project.input_datasets()
    self.assertEqual(list(input_datasets), [])
def test_run_error():
    client = utils.client.create(**CONFIG["toolbox_test_instance"])
    project = Project.from_json(client, resource_json={"name": "fake", "type": "NOT_REAL"})
    with pytest.raises(KeyError):
        workflow.jobs.run([project])
def create(self, creation_spec):
    """
    Create a Project in Tamr

    :param creation_spec: Project creation specification, formatted as specified in the
        `Public Docs for Creating a Project <https://docs.tamr.com/reference#create-a-project>`_.
    :type creation_spec: dict[str, str]
    :returns: The created Project
    :rtype: :class:`~tamr_unify_client.project.resource.Project`
    """
    data = self.client.post(self.api_path, json=creation_spec).successful().json()
    return Project.from_json(self.client, data)
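# A hedged usage sketch: the creation_spec keys below follow the public docs
# linked in the docstring; all values are hypothetical placeholders.
from tamr_unify_client import Client
from tamr_unify_client.auth import UsernamePasswordAuth

client = Client(UsernamePasswordAuth("username", "password"), host="localhost")
new_project = client.projects.create(
    {
        "name": "my_categorization_project",
        "description": "Example project created via the API",
        "type": "CATEGORIZATION",
        "unifiedDatasetName": "my_categorization_project_unified_dataset",
    }
)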
def unmap_dataset(
    project: Project,
    *,
    source_dataset: Dataset,
    remove_dataset_from_project: bool = False,
    skip_if_missing: bool = False,
) -> None:
    """Wholly unmaps a dataset and optionally removes it from a project.

    Args:
        source_dataset: the source dataset (Dataset object not a string) to unmap
        project: the project in which to unmap the dataset
        remove_dataset_from_project: whether to also remove the dataset from the project
        skip_if_missing: whether to skip if the dataset is not in the project. If set to
            False and the dataset is not in the project, a RuntimeError is raised

    Returns:
        None

    Raises:
        RuntimeError: if `source_dataset` is not in `project` and `skip_if_missing`
            is not set to True
    """
    # check to make sure dataset is in project and log a warning if it is not
    if source_dataset.name not in [x.name for x in project.input_datasets()]:
        if skip_if_missing:
            LOGGER.warning(
                f"Dataset to unmap {source_dataset.name} not in project {project.name}! "
                f"However skip_if_missing flag is set so will do nothing"
            )
            return None
        else:
            error_message = (
                f"Dataset to unmap {source_dataset.name} not in project "
                f"{project.name} and skip_if_missing not set to True so failing!"
            )
            LOGGER.error(error_message)
            raise RuntimeError(error_message)

    # the resource ids of attribute mappings unfortunately change when you delete one
    # so need to just do this until there are no mappings left for the source dataset of interest
    while True:
        mappings = [
            x
            for x in project.attribute_mappings().stream()
            if x.input_dataset_name == source_dataset.name
        ]
        # if no mappings found for this dataset then break
        if not mappings:
            break
        for mapping in mappings:
            # can only delete one, then have to break out of the inner loop
            project.attribute_mappings().delete_by_resource_id(mapping.resource_id)
            break

    # optionally remove dataset from the project
    if remove_dataset_from_project:
        project.remove_input_dataset(source_dataset)
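# A hedged usage sketch with placeholder names: wholly unmap a retired source
# dataset and drop it from the project in one call.
from tamr_unify_client import Client
from tamr_unify_client.auth import UsernamePasswordAuth

client = Client(UsernamePasswordAuth("username", "password"), host="localhost")
project = client.projects.by_name("my_schema_mapping_project")
dataset = client.datasets.by_name("retired_source_dataset")
unmap_dataset(
    project,
    source_dataset=dataset,
    remove_dataset_from_project=True,
    skip_if_missing=True,  # no-op instead of RuntimeError if already unmapped
)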
def test_create_from_spec(self):
    def create_callback(request, snoop):
        snoop["payload"] = json.loads(request.body)
        return 204, {}, json.dumps(self.created_json)

    url = "http://localhost:9100/api/versioned/v1/projects/1/attributeConfigurations"
    snoop_dict = {}
    responses.add_callback(responses.POST, url, partial(create_callback, snoop=snoop_dict))

    configs = Project(self.tamr, self.project_json).attribute_configurations()
    spec = (
        AttributeConfigurationSpec.new()
        .with_attribute_name(self.create_json["attributeName"])
        .with_enabled_for_ml(self.create_json["enabledForMl"])
        .with_similarity_function(self.create_json["similarityFunction"])
    )
    create = configs.create(spec.to_dict())
    self.assertEqual(create.relative_id, self.created_json["relativeId"])
    self.assertEqual(snoop_dict["payload"], self.create_json)
def unmap_attribute(
    project: Project,
    *,
    source_attribute_name: str,
    source_dataset_name: str,
    unified_attribute_name: str,
) -> None:
    """Unmaps a source attribute.

    Args:
        source_attribute_name: the name of the source attribute to unmap
        source_dataset_name: the name of the source dataset containing that source attribute
        unified_attribute_name: the unified attribute from which to unmap
        project: the project in which to unmap the attribute

    Returns:
        None
    """
    LOGGER.info(
        f"Trying to remove mapping of source attribute {source_attribute_name} in dataset "
        f"{source_dataset_name} from unified attribute {unified_attribute_name}"
    )
    # get mapping collection
    mapping_collection = project.attribute_mappings()

    # run through and get the resource id of the mapping to remove
    resource_id_to_remove = None
    for mapping in mapping_collection.stream():
        # consider it a match if all of source attribute, source dataset and unified attribute
        # are equal
        if (
            source_attribute_name == mapping.input_attribute_name
            and source_dataset_name == mapping.input_dataset_name
            and unified_attribute_name == mapping.unified_attribute_name
        ):
            resource_id_to_remove = mapping.resource_id
            break

    # log warning if resource id wasn't found
    if resource_id_to_remove is None:
        LOGGER.warning(
            f"Mapping of {source_attribute_name} in dataset {source_dataset_name} to "
            f"unified attribute {unified_attribute_name} not found!"
        )
    # if found remove it
    else:
        mapping_collection.delete_by_resource_id(resource_id_to_remove)
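# A hedged usage sketch with placeholder names: remove one specific mapping
# while leaving the dataset's other mappings intact.
from tamr_unify_client import Client
from tamr_unify_client.auth import UsernamePasswordAuth

client = Client(UsernamePasswordAuth("username", "password"), host="localhost")
project = client.projects.by_name("my_schema_mapping_project")
unmap_attribute(
    project,
    source_attribute_name="phone_number",
    source_dataset_name="my_source_dataset",
    unified_attribute_name="phone",
)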
def test_get_versions(self):
    def create_callback(request, snoop):
        snoop["payload"] = request.body
        return 200, {}, "\n".join(json.dumps(c) for c in self._versions_json)

    p = Project.from_json(self.tamr, self._project_json).as_mastering()
    post_url = (
        f"http://localhost:9100/api/versioned/v1/{p.api_path}/publishedClusterVersions"
    )
    snoop = {}
    responses.add_callback(responses.POST, post_url, partial(create_callback, snoop=snoop))

    clusters = list(p.published_cluster_versions(self._cluster_ids))
    expected_clusters = [PublishedCluster(c) for c in self._versions_json]
    self.assertEqual(snoop["payload"], "\n".join([json.dumps(i) for i in self._cluster_ids]))
    self.assertEqual(len(clusters), len(expected_clusters))
    for actual, expected in zip(clusters, expected_clusters):
        self.assertEqual(actual.__repr__(), expected.__repr__())
        self.assertEqual(len(actual.versions), len(expected.versions))
def _run_custom(
    project: Project,
    *,
    run_update_unified_dataset=False,
    process_asynchronously: bool = False,
) -> List[Operation]:
    """Executes specified steps of a schema mapping project.

    Args:
        project: Target schema mapping project
        run_update_unified_dataset: Whether refresh should be called on the unified dataset
        process_asynchronously: Whether or not to wait for the job to finish before
            returning - must be set to True for concurrent workflows

    Returns:
        The operations that were run

    Raises:
        TypeError: if the `project` is not a Schema Mapping project
    """
    if ProjectType[project.type] != ProjectType.SCHEMA_MAPPING_RECOMMENDATIONS:
        error_msg = f"Cannot use as a schema mapping project. Project type: {project.type}"
        LOGGER.error(error_msg)
        raise TypeError(error_msg)

    completed_operations = []
    if run_update_unified_dataset:
        LOGGER.info(
            f"Updating the unified dataset for project {project.name} (id={project.resource_id})."
        )
        op = project.unified_dataset().refresh(asynchronous=process_asynchronously)
        if not process_asynchronously:
            operation.enforce_success(op)
        completed_operations.append(op)
    return completed_operations
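# A hedged sketch of the asynchronous path (placeholder project): submit the
# refresh without blocking, then wait on the returned operations using the
# tamr_unify_client Operation.wait() method before enforcing success.
ops = _run_custom(project, run_update_unified_dataset=True, process_asynchronously=True)
# ... kick off work in other projects here while the job runs ...
for op in ops:
    op.wait()  # block until the job completes
    operation.enforce_success(op)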
def get_tier_confidence(
    project: Project, *, tier: int = -1, allow_dataset_refresh: bool = False
) -> data_type.JsonDict:
    """Extracts tier-specific average confidence from a Tamr internal dataset
    `<unified dataset name>_classifications_average_confidences` into a dictionary

    Args:
        project: Tamr project object
        tier: integer specifying the tier from which to extract the average confidence;
            the default value will return the average confidence at all leaf categories
        allow_dataset_refresh: if True, allows running a job to refresh dataset to make
            it streamable

    Returns:
        dictionary - keys are category paths, joined by '|' if multi-level taxonomy,
        and values are average confidence of the corresponding keys

    Raises:
        RuntimeError: if `dataset` is not streamable and `allow_dataset_refresh` is False
        TypeError: if tier is not of type int; or if the project type is not classification
        ValueError: if tier is less than -1 or equal to 0
    """
    LOGGER.info(
        f"Retrieving average confidence for taxonomy nodes in project {project.name} "
        f"(id={project.resource_id})."
    )
    # check project type is categorization
    try:
        project = project.as_categorization()
    except TypeError:
        not_categorization_error = f"Project {project.name} is not a classification project."
        LOGGER.error(not_categorization_error)
        raise TypeError(not_categorization_error)

    # check necessary dataset can be obtained
    dataset = _get_dataset_with_confidence(project)

    # check tier is valid
    if type(tier) is not int:
        wrong_tier_type_error = f"Tier {tier} is not an integer."
        LOGGER.error(wrong_tier_type_error)
        raise TypeError(wrong_tier_type_error)
    if tier < -1 or tier == 0:
        invalid_tier_value_error = (
            f"Invalid value for tier {tier}. Tier cannot be 0 or less than -1."
        )
        LOGGER.error(invalid_tier_value_error)
        raise ValueError(invalid_tier_value_error)

    # check dataset can be streamed
    if not dataset.status().is_streamable:
        if allow_dataset_refresh:
            LOGGER.info(f"Refreshing dataset {dataset.name} to make streamable.")
            op = dataset.refresh()
            operation.enforce_success(op)
        else:
            cannot_stream_error = (
                f"Dataset {dataset.name} is not streamable. "
                f"Refresh it first, or run with allow_dataset_refresh=True"
            )
            LOGGER.error(cannot_stream_error)
            raise RuntimeError(cannot_stream_error)

    # check dataset contains necessary attributes
    _check_dataset_with_confidence(dataset)

    # check tier does not exceed maximum taxonomy depth
    _check_taxonomy_depth(project, tier=tier)

    # obtain categories at tier
    selected_category_set = _get_categories_at_tier(project, tier=tier)

    # extract average confidence
    tier_confidence_dict = _extract_confidence(
        dataset=dataset, category_set=selected_category_set
    )
    return tier_confidence_dict
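# A hedged usage sketch (placeholder project name): leaf-level confidences by
# default, or a specific tier via the keyword argument.
from tamr_unify_client import Client
from tamr_unify_client.auth import UsernamePasswordAuth

client = Client(UsernamePasswordAuth("username", "password"), host="localhost")
project = client.projects.by_name("my_classification_project")
leaf_confidence = get_tier_confidence(project, allow_dataset_refresh=True)
# e.g. {"Food|Dairy|Cheese": 0.87, ...} for a multi-level taxonomy
tier1_confidence = get_tier_confidence(project, tier=1)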
def map_attribute(
    project: Project,
    *,
    source_attribute_name: str,
    source_dataset_name: str,
    unified_attribute_name: str,
) -> AttributeMapping:
    """Maps source_attribute in source_dataset to unified_attribute in unified_dataset.
    If the mapping already exists it will log a warning and return the existing
    AttributeMapping from the project's collection.

    Args:
        source_attribute_name: Source attribute name to map
        source_dataset_name: Source dataset containing the source attribute
        unified_attribute_name: Unified attribute to which to map the source attribute
        project: The project in which to perform the mapping

    Returns:
        AttributeMapping that was created

    Raises:
        ValueError: if any of `source_attribute_name`, `source_dataset_name` or
            `unified_attribute_name` is set to an empty string; or if the dataset
            `source_dataset_name` is not found on Tamr; or if `source_attribute_name`
            is missing from the attributes of `source_dataset_name`
    """
    # simple validation, nothing should be empty
    variables = [source_attribute_name, source_dataset_name, unified_attribute_name]
    empty_variables = [x for x in variables if x == ""]
    if empty_variables:
        empty_variable_string = ", ".join(empty_variables)
        error_message = (
            f"The following variables are set to empty strings and "
            f"need to be filled in: {empty_variable_string} !"
        )
        LOGGER.error(error_message)
        raise ValueError(error_message)

    # also validate that the dataset exists and has this column
    try:
        source_dataset = project.client.datasets.by_name(source_dataset_name)
    except KeyError:
        error_msg = f"Dataset {source_dataset_name} not found!"
        LOGGER.error(error_msg)
        raise ValueError(error_msg)
    if source_attribute_name not in [x.name for x in source_dataset.attributes]:
        error_msg = f"Attribute {source_attribute_name} not found in {source_dataset_name}!"
        LOGGER.error(error_msg)
        raise ValueError(error_msg)

    # generate mapping spec
    mapping_spec = _get_mapping_spec_for_ud(
        source_attr_name=source_attribute_name,
        source_ds_name=source_dataset_name,
        unified_attr_name=unified_attribute_name,
        unified_ds_name=project.unified_dataset().name,
    )

    # add the mapping to the project's collection - this is what does the actual mapping
    try:
        return project.attribute_mappings().create(mapping_spec.to_dict())
    except JSONDecodeError as e:
        # can get a JSONDecodeError if the attribute is already mapped.
        # If it is, log a warning and return the existing mapping;
        # if it is not already mapped, break loudly
        m: AttributeMapping
        for m in project.attribute_mappings().stream():
            if (
                m.input_dataset_name == source_dataset_name
                and m.input_attribute_name == source_attribute_name
                and m.unified_attribute_name == unified_attribute_name
            ):
                # mapping exists, log warning and return existing mapping
                LOGGER.warning(
                    f"mapping of attribute {source_attribute_name} in dataset "
                    f"{source_dataset_name} to unified attribute {unified_attribute_name} "
                    f"already exists! Returning existing mapping spec"
                )
                return m
        # if we haven't returned, re-raise the JSONDecodeError
        raise e
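# A hedged usage sketch with placeholder names: map one source column onto a
# differently named unified attribute; re-running simply returns the existing
# mapping with a warning.
from tamr_unify_client import Client
from tamr_unify_client.auth import UsernamePasswordAuth

client = Client(UsernamePasswordAuth("username", "password"), host="localhost")
project = client.projects.by_name("my_schema_mapping_project")
mapping = map_attribute(
    project,
    source_attribute_name="PHONE_NUM",
    source_dataset_name="my_source_dataset",
    unified_attribute_name="phone",
)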
def test_project_get_input_datasets(self):
    p = Project(self.tamr, self.project_json[0])
    datasets = p.input_datasets()
    self.assertEqual(datasets.api_path, "projects/1/inputDatasets")
def from_taxonomy(
    project: Project,
    export_file_path: Union[Path, str],
    *,
    csv_delimiter: str = ",",
    flatten_delimiter: str = "|",
    quote_character: str = '"',
    quoting: int = csv.QUOTE_MINIMAL,
    overwrite: bool = False,
) -> int:
    """Export a Tamr taxonomy to a csv file.

    Args:
        project: Tamr Project object
        export_file_path: Path to the csv file where the taxonomy will be saved
        csv_delimiter: Delimiter of the csv file
        flatten_delimiter: Flatten list types to strings by concatenating with this delimiter
        quote_character: Character used to escape value for csv delimiter when it appears
            in the value.
        quoting: The escape strategy to use according to the Python csv writer.
            See https://docs.python.org/3/library/csv.html#csv.QUOTE_MINIMAL
        overwrite: if True and export_file_path already exists, overwrite the file.
            Otherwise raise an error

    Returns:
        The total number of records written

    Raises:
        FileExistsError: if `export_file_path` exists and `overwrite` is set to False
        IOError: if the specified filepath does not exist or cannot be accessed
        RuntimeError: if the classification project is not yet associated with a taxonomy
            or the taxonomy cannot be written to a csv file
        TypeError: if the project type is not classification
        ValueError: if `csv_delimiter` and `flatten_delimiter` are identical values
    """
    LOGGER.info(
        f"Streaming taxonomy to csv file {export_file_path} from project {project.name} "
        f"(project id={project.resource_id})."
    )
    try:
        project = project.as_categorization()
    except TypeError:
        not_categorization_error = f"Project {project.name} is not a classification project."
        LOGGER.error(not_categorization_error)
        raise TypeError(not_categorization_error)

    if os.path.exists(export_file_path):
        if not overwrite:
            message = (
                f"CSV file {export_file_path} already exists. "
                f"(Set 'overwrite' flag to True if you wish to overwrite)"
            )
            LOGGER.error(message)
            raise FileExistsError(message)
        else:
            LOGGER.warning(f"CSV file {export_file_path} already exists and will be overwritten")

    if csv_delimiter == flatten_delimiter:
        message = (
            f"The CSV delimiter '{csv_delimiter}' cannot be identical to the list "
            f"flattening delimiter '{flatten_delimiter}'"
        )
        LOGGER.error(message)
        raise ValueError(message)

    try:
        taxonomy = project.as_categorization().taxonomy()
    except requests.exceptions.RequestException:
        no_taxonomy_error = f"Project {project.name} is not associated with any taxonomy yet."
        LOGGER.error(no_taxonomy_error)
        raise RuntimeError(no_taxonomy_error)

    # obtain categories and store in a list
    categories = taxonomy.categories()
    taxonomy_list = []
    for category in categories:
        taxonomy_list.append(category.path)
    # sort the categories
    taxonomy_list.sort()

    # Open CSV file and use newline='' as recommended by
    # https://docs.python.org/3/library/csv.html#csv.writer
    try:
        f = open(export_file_path, "w", newline="", encoding="utf-8")
    except (FileNotFoundError, IOError, PermissionError):
        cannot_open_error = f"File path {export_file_path} could not be opened for writing."
        LOGGER.error(cannot_open_error)
        raise IOError(cannot_open_error)
    else:
        try:
            csv_writer = csv.writer(
                f, delimiter=csv_delimiter, quotechar=quote_character, quoting=quoting
            )
            csv_writer.writerows(taxonomy_list)
        except csv.Error as e:
            general_error = (
                "Encountered an error while writing taxonomy categories to "
                f"{export_file_path}: {e}"
            )
            LOGGER.error(general_error)
            raise RuntimeError(general_error)
        finally:
            f.close()

    records_written = len(taxonomy_list)
    LOGGER.info(
        f"Wrote {records_written} categories from {project.name} taxonomy (project id"
        f"={project.resource_id}) to {export_file_path}"
    )
    return records_written
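# A hedged usage sketch (placeholder path and project name): export the
# taxonomy as a tab-delimited file, overwriting any previous export.
from tamr_unify_client import Client
from tamr_unify_client.auth import UsernamePasswordAuth

client = Client(UsernamePasswordAuth("username", "password"), host="localhost")
project = client.projects.by_name("my_classification_project")
n_written = from_taxonomy(
    project,
    "taxonomy_export.csv",
    csv_delimiter="\t",
    overwrite=True,
)
print(f"Wrote {n_written} categories")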