Example #1
def _get_upstream_projects(project: Project) -> List[Project]:
    """
    Get the projects immediately upstream of a given project.
    Args:
        project: the project to check
    Returns:
        A list of the Project objects immediately upstream of the given project
    """
    client = project.client
    # find upstream datasets - if GR project just get input datasets
    if ProjectType[project.type] == ProjectType.GOLDEN_RECORDS:
        upstream_datasets = list(project.input_datasets().stream())
    # else find the upstream datasets of the unified dataset (not the input datasets,
    # so that datasets used in transformations are also captured)
    else:
        unified_dataset_id = project.unified_dataset().relative_id
        unified_dataset = client.datasets.by_relative_id(unified_dataset_id)
        upstream_datasets = unified_dataset.upstream_datasets()

    upstream_project_names = []
    # walk through upstream datasets
    for upstream_result in upstream_datasets:
        # get the upstream object as a dataset
        upstream_dataset = client.datasets.by_resource_id(
            upstream_result.resource_id)
        # see if it is the output of a project and if so add to the list
        upstream_dataset_projects = set(
            x.project_name
            for x in upstream_dataset.usage().usage.output_from_project_steps)
        upstream_project_names.extend(upstream_dataset_projects)

    return [client.projects.by_name(x) for x in upstream_project_names]
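A minimal usage sketch for the helper above, assuming a reachable Tamr instance; the host, credentials, and project name are placeholders, not values from the source.

# Hypothetical usage sketch: connect to Tamr and list the projects
# immediately upstream of a given project (placeholder host/credentials).
from tamr_unify_client import Client
from tamr_unify_client.auth import UsernamePasswordAuth

auth = UsernamePasswordAuth("username", "password")
tamr = Client(auth, host="localhost")

project = tamr.projects.by_name("my_project")  # placeholder project name
for upstream_project in _get_upstream_projects(project):
    print(upstream_project.name)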
Example #2
def unmap_dataset(
    project: Project,
    *,
    source_dataset: Dataset,
    remove_dataset_from_project: bool = False,
    skip_if_missing: bool = False,
) -> None:
    """
    Wholly unmaps a dataset and optionally removes it from a project.

    Args:
        source_dataset: the source dataset (Dataset object not a string) to unmap
        project: the project in which to unmap the dataset
        remove_dataset_from_project: boolean to also remove the dataset from the project
        skip_if_missing: boolean to skip if the dataset is not in the project. If set to
            False and the dataset is not in the project, a RuntimeError is raised

    Returns:
        None

    Raises:
        RuntimeError: if `source_dataset` is not in `project` and `skip_if_missing` is not set to True
    """

    # check that the dataset is in the project; warn or raise if it is not
    if source_dataset.name not in [x.name for x in project.input_datasets()]:
        if skip_if_missing:
            LOGGER.warning(
                f"Dataset to unmap {source_dataset.name} not in project {project.name}! "
                f"However skip_if_missing flag is set so will do nothing"
            )
            return None
        else:
            error_message = (
                f"Dataset to unmap {source_dataset.name} not in project "
                f"{project.name} and skip_if_missing not set to True so failing! "
            )
            LOGGER.error(error_message)
            raise RuntimeError(error_message)

    # the resource ids of attribute mappings unfortunately change when you delete one,
    # so re-fetch the mappings and delete one at a time until none remain for this dataset
    while True:
        mappings = [
            x
            for x in project.attribute_mappings().stream()
            if x.input_dataset_name == source_dataset.name
        ]
        # if no mappings remain for this dataset then stop
        if not mappings:
            break
        # delete a single mapping, then re-fetch on the next pass since
        # the remaining resource ids will have changed
        project.attribute_mappings().delete_by_resource_id(mappings[0].resource_id)

    # optionally remove dataset from the project
    if remove_dataset_from_project:
        project.remove_input_dataset(source_dataset)
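A short usage sketch for unmap_dataset, reusing the tamr client from the sketch above; the dataset and project names are placeholder assumptions.

# Hypothetical usage sketch: fully unmap a source dataset and also remove
# it from the project (placeholder names).
dataset = tamr.datasets.by_name("my_source_dataset")
project = tamr.projects.by_name("my_project")

unmap_dataset(
    project,
    source_dataset=dataset,
    remove_dataset_from_project=True,
    skip_if_missing=True,  # log a warning instead of raising if already unmapped
)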
Example #3
    @responses.activate  # activate responses so the mocks below intercept requests
    def test_project_remove_input_dataset(self):
        dataset_id = self.dataset_json[0]["relativeId"]

        responses.add(responses.GET, self.input_datasets_url, json=self.dataset_json)
        responses.add(
            responses.DELETE, f"{self.input_datasets_url}?id={dataset_id}", status=204
        )
        responses.add(responses.GET, self.input_datasets_url, json=[])

        project = Project(self.tamr, self.project_json[0])
        dataset = next(project.input_datasets().stream())

        response = project.remove_input_dataset(dataset)
        self.assertEqual(response.status_code, 204)

        input_datasets = project.input_datasets()
        self.assertEqual(list(input_datasets), [])
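The test registers two GET mocks for the same URL because the responses library serves matching mocks in registration order; a minimal standalone sketch of that pattern, with a placeholder URL.

# Sketch of the responses ordering behavior the test relies on (placeholder URL).
import requests
import responses

@responses.activate
def demo_ordered_mocks():
    responses.add(responses.GET, "http://localhost/api/items", json=[{"id": 1}])
    responses.add(responses.GET, "http://localhost/api/items", json=[])
    # the first call gets the first registered payload, the second call the next
    assert requests.get("http://localhost/api/items").json() == [{"id": 1}]
    assert requests.get("http://localhost/api/items").json() == []

demo_ordered_mocks()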
Example #4
def bootstrap_dataset(
    project: Project, *, source_dataset: Dataset, force_add_dataset_to_project: bool = False
) -> List[AttributeMapping]:
    """
    Bootstraps a dataset (i.e. maps all source columns to themselves)

    Args:
        source_dataset: the source dataset (a Dataset object not a string)
        project: the project in which to do the mapping
        force_add_dataset_to_project: boolean whether to add the dataset to the project
            if it is not already a part of it

    Returns:
        List of the AttributeMappings generated

    Raises:
        RuntimeError: if `source_dataset` is not part of the given `project` and
            'force_add_dataset_to_project' is not set to True
    """

    # check if dataset is in the project - python doesn't handle comparison of Dataset objects
    # well so check on name
    if source_dataset.name not in [x.name for x in project.input_datasets()]:
        if force_add_dataset_to_project:
            LOGGER.info(f"adding dataset {source_dataset.name} to project {project.name}")
            project.add_input_dataset(source_dataset)
        else:
            raise RuntimeError(
                f"dataset {source_dataset.name} not in project {project.name}!"
                + "Set 'force_add_dataset_to_project' flag to True to automatically add it"
            )

    # for each attribute map it
    source_dataset_name = source_dataset.name
    completed_mappings = []
    for attribute in source_dataset.attributes:
        attribute_name = attribute.name
        mapping = map_attribute(
            source_attribute_name=attribute_name,
            source_dataset_name=source_dataset_name,
            unified_attribute_name=attribute_name,
            project=project,
        )
        completed_mappings.append(mapping)

    return completed_mappings
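A hedged usage sketch for bootstrap_dataset, with the tamr client as in the first sketch; all names are placeholder assumptions.

# Hypothetical usage sketch: map every column of a source dataset to a
# same-named unified attribute, adding the dataset to the project if needed.
dataset = tamr.datasets.by_name("my_new_source_dataset")  # placeholder name
project = tamr.projects.by_name("my_project")  # placeholder name

mappings = bootstrap_dataset(
    project, source_dataset=dataset, force_add_dataset_to_project=True
)
print(f"created {len(mappings)} attribute mappings")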
Example #5
    def test_project_get_input_datasets(self):
        p = Project(self.tamr, self.project_json[0])
        datasets = p.input_datasets()
        self.assertEqual(datasets.api_path, "projects/1/inputDatasets")