def _get_upstream_projects(project: Project) -> List[Project]:
    """Get the projects immediately upstream of a given project.

    Args:
        project: The project to check

    Returns:
        A list of the projects upstream of `project`
    """
    client = project.client

    # Find upstream datasets. For a Golden Records project the input datasets
    # are sufficient; otherwise walk upstream from the unified dataset (rather
    # than from the input datasets) so that datasets used in transformations
    # are also captured.
    if ProjectType[project.type] == ProjectType.GOLDEN_RECORDS:
        upstream_datasets = list(project.input_datasets().stream())
    else:
        unified_dataset_id = project.unified_dataset().relative_id
        unified_dataset = client.datasets.by_relative_id(unified_dataset_id)
        upstream_datasets = unified_dataset.upstream_datasets()

    upstream_project_names = []
    # walk through upstream datasets
    for upstream_result in upstream_datasets:
        # resolve the upstream reference to a full dataset object
        upstream_dataset = client.datasets.by_resource_id(upstream_result.resource_id)
        # if the dataset is the output of any project step, record those projects
        upstream_dataset_projects = {
            step.project_name
            for step in upstream_dataset.usage().usage.output_from_project_steps
        }
        upstream_project_names.extend(upstream_dataset_projects)

    return [client.projects.by_name(name) for name in upstream_project_names]
def _run_custom(project: Project, *, run_update_unified_dataset=False) -> List[Operation]:
    """Run the requested steps of a schema mapping project.

    Args:
        project: Target schema mapping project
        run_update_unified_dataset: Whether refresh should be called on the unified dataset

    Returns:
        The operations that were run

    Raises:
        TypeError: if the `project` is not a Schema Mapping project
    """
    # Guard clause: only schema mapping projects are supported here
    if ProjectType[project.type] != ProjectType.SCHEMA_MAPPING_RECOMMENDATIONS:
        message = f"Cannot use as a schema mapping project. Project type: {project.type}"
        LOGGER.error(message)
        raise TypeError(message)

    operations_run: List[Operation] = []
    if run_update_unified_dataset:
        LOGGER.info(
            f"Updating the unified dataset for project {project.name} (id={project.resource_id})."
        )
        refresh_op = project.unified_dataset().refresh()
        # Fail fast if the refresh operation did not succeed
        operation.enforce_success(refresh_op)
        operations_run.append(refresh_op)
    return operations_run
def _run_custom(
    project: Project,
    *,
    run_update_unified_dataset=False,
    process_asynchronously: bool = False,
) -> List[Operation]:
    """Executes specified steps of a schema mapping project.

    Args:
        project: Target schema mapping project
        run_update_unified_dataset: Whether refresh should be called on the unified dataset
        process_asynchronously: Whether or not to wait for the job to finish before
            returning - must be set to True for concurrent workflow

    Returns:
        The operations that were run

    Raises:
        TypeError: if the `project` is not a Schema Mapping project
    """
    if ProjectType[project.type] != ProjectType.SCHEMA_MAPPING_RECOMMENDATIONS:
        error_msg = f"Cannot use as a schema mapping project. Project type: {project.type}"
        LOGGER.error(error_msg)
        raise TypeError(error_msg)

    completed_operations = []
    if run_update_unified_dataset:
        LOGGER.info(
            f"Updating the unified dataset for project {project.name} (id={project.resource_id})."
        )
        op = project.unified_dataset().refresh(asynchronous=process_asynchronously)
        # Only enforce success when running synchronously; an asynchronous
        # operation has not finished yet, so its state cannot be checked here.
        if not process_asynchronously:
            operation.enforce_success(op)
        completed_operations.append(op)
    return completed_operations
def map_attribute(
    project: Project,
    *,
    source_attribute_name: str,
    source_dataset_name: str,
    unified_attribute_name: str,
) -> AttributeMapping:
    """
    Maps source_attribute in source_dataset to unified_attribute in unified_dataset.
    If the mapping already exists it will log a warning and return the existing
    AttributeMapping from the project's collection.

    Args:
        source_attribute_name: Source attribute name to map
        source_dataset_name: Source dataset containing the source attribute
        unified_attribute_name: Unified attribute to which to map the source attribute
        project: The project in which to perform the mapping

    Returns:
        AttributeMapping that was created

    Raises:
        ValueError: if input variables `source_attribute_name` or `source_dataset_name`
            or `unified_attribute_name` are set to empty strings; or if the dataset
            `source_dataset_name` is not found on Tamr; or if `source_attribute_name`
            is missing from the attributes of `source_dataset_name`
    """
    # simple validation, nothing should be empty
    variables = [source_attribute_name, source_dataset_name, unified_attribute_name]
    empty_variables = [x for x in variables if x == ""]
    if empty_variables:
        empty_variable_string = ", ".join(empty_variables)
        error_message = (
            f"The following variables are set to empty strings and "
            f"need to be filled in: {empty_variable_string} !"
        )
        LOGGER.error(error_message)
        raise ValueError(error_message)

    # also validate that the dataset exists and has this column
    try:
        source_dataset = project.client.datasets.by_name(source_dataset_name)
    except KeyError:
        error_msg = f"Dataset {source_dataset_name} not found!"
        LOGGER.error(error_msg)
        raise ValueError(error_msg)

    # Use an explicit membership check instead of `assert`: assertions are
    # stripped when Python runs with optimizations (-O) and must not be used
    # to guard input validation.
    if source_attribute_name not in {attr.name for attr in source_dataset.attributes}:
        error_msg = f"Attribute {source_attribute_name} not found in {source_dataset_name}!"
        LOGGER.error(error_msg)
        raise ValueError(error_msg)

    # generate mapping spec
    mapping_spec = _get_mapping_spec_for_ud(
        source_attr_name=source_attribute_name,
        source_ds_name=source_dataset_name,
        unified_attr_name=unified_attribute_name,
        unified_ds_name=project.unified_dataset().name,
    )

    # add the mapping to the project's collection - this is what does the actual mapping
    try:
        return project.attribute_mappings().create(mapping_spec.to_dict())
    except JSONDecodeError:
        # can get a jsondecode error if the attribute is already mapped.
        # If it is, then log a warning and return the existing mapping;
        # if it is not already mapped, re-raise the original error.
        m: AttributeMapping
        for m in project.attribute_mappings().stream():
            if (
                m.input_dataset_name == source_dataset_name
                and m.input_attribute_name == source_attribute_name
                and m.unified_attribute_name == unified_attribute_name
            ):
                # mapping exists, log warning and return existing mapping
                LOGGER.warning(
                    f"mapping of attribute {source_attribute_name} in dataset "
                    f"{source_dataset_name} to unified attribute {unified_attribute_name} "
                    f"already exists! Returning existing mapping spec"
                )
                return m
        # if we haven't returned, re-raise the JSONDecodeError with its context intact
        raise