Code example #1
def _get_upstream_projects(project: Project) -> List[Project]:
    """
    Get the projects immediately upstream of a given project.
    Args:
        project: the project to check
    Returns:
        A list of Project objects immediately upstream of the given project
    """
    client = project.client
    # find upstream datasets - if GR project just get input datasets
    if ProjectType[project.type] == ProjectType.GOLDEN_RECORDS:
        upstream_datasets = list(project.input_datasets().stream())
    # else find the upstream datasets of the UD (not input datasets to capture datasets used in Tx)
    else:
        unified_dataset_id = project.unified_dataset().relative_id
        unified_dataset = client.datasets.by_relative_id(unified_dataset_id)
        upstream_datasets = unified_dataset.upstream_datasets()

    upstream_project_names = []
    # walk through upstream datasets
    for upstream_result in upstream_datasets:
        # get the upstream object as a dataset
        upstream_dataset = client.datasets.by_resource_id(
            upstream_result.resource_id)
        # see if it is the output of a project and if so add to the list
        upstream_dataset_projects = set(
            x.project_name
            for x in upstream_dataset.usage().usage.output_from_project_steps)
        upstream_project_names.extend(upstream_dataset_projects)

    return [client.projects.by_name(x) for x in upstream_project_names]
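For context, a minimal usage sketch of the helper above (the project name is hypothetical; `client` is assumed to be an already-created Tamr client, e.g. built as in code example #12):

# Hypothetical usage: walk the projects feeding "my_mastering_project"
project = client.projects.by_name("my_mastering_project")
for upstream_project in _get_upstream_projects(project):
    print(upstream_project.name, upstream_project.type)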
Code example #2
File: test_project.py Project: minroh/tamr-client
    def test_update_project(self):
        def create_callback(request, snoop):
            snoop["payload"] = request.body
            return 200, {}, json.dumps(self._updated_project_json)

        project_url = "http://localhost:9100/api/versioned/v1/projects/1"
        snoop_dict = {}
        responses.add_callback(responses.PUT, project_url,
                               partial(create_callback, snoop=snoop_dict))
        project = Project(self.tamr, self.project_json[0])

        temp_spec = project.spec().with_name(
            self._updated_project_json["name"])
        new_project = (temp_spec.with_description(
            self._updated_project_json["description"]).with_external_id(
                self._updated_project_json["externalId"]).put())
        self.assertEqual(new_project.name, self._updated_project_json["name"])
        self.assertEqual(new_project.description,
                         self._updated_project_json["description"])
        self.assertEqual(new_project.external_id,
                         self._updated_project_json["externalId"])

        self.assertEqual(json.loads(snoop_dict["payload"]),
                         self._updated_project_json)

        self.assertEqual(project.name, self.project_json[0]["name"])
        self.assertEqual(project.description,
                         self.project_json[0]["description"])
        self.assertEqual(project.external_id,
                         self.project_json[0]["externalId"])

        # test that intermediate didn't change
        self.assertEqual(temp_spec.to_dict()["description"],
                         self.project_json[0]["description"])
Code example #3
    def test_delete_published_clusters_configuration(self):
        path = "projects/1/publishedClustersConfiguration"
        config_url = f"{self._base_url}/{path}"
        responses.add(responses.GET, config_url, json=self._config_json)
        responses.add(responses.DELETE, config_url, status=405)

        p = Project(self.tamr, self._project_config_json).as_mastering()
        config = p.published_clusters_configuration()
        self.assertRaises(HTTPError, config.delete)
Code example #4
    def test_published_clusters_configuration(self):
        path = "projects/1/publishedClustersConfiguration"
        config_url = f"{self._base_url}/{path}"
        responses.add(responses.GET, config_url, json=self._config_json)

        p = Project(self.tamr, self._project_config_json).as_mastering()
        config = p.published_clusters_configuration()
        created = PublishedClustersConfiguration.from_json(
            self.tamr, self._config_json, path)

        self.assertEqual(repr(config), repr(created))
        self.assertEqual(config.versions_time_to_live,
                         self._config_json["versionsTimeToLive"])
Code example #5
File: test_taxonomy.py Project: skalish/tamr-client
    def test_delete(self):
        url = "http://localhost:9100/api/versioned/v1/projects/1/taxonomy"
        responses.add(responses.GET, url, json=self._taxonomy_json)
        responses.add(responses.DELETE, url, status=204)
        responses.add(responses.GET, url, status=404)

        project = Project(self.tamr, {
            "type": "CATEGORIZATION"
        }, "projects/1").as_categorization()
        taxonomy = project.taxonomy()
        self.assertEqual(taxonomy._data, self._taxonomy_json)

        response = taxonomy.delete()
        self.assertEqual(response.status_code, 204)
        self.assertRaises(HTTPError, project.taxonomy)
Code example #6
File: metrics.py Project: Datatamer/tamr-toolbox
def _get_categories_at_tier(project: Project, *, tier: int) -> set:
    """
    Extracts categories at tier from a taxonomy associated with Project

    Args:
        project: Tamr project object
        tier: integer specifying the tier from which to extract categories;
              -1 will return all leaf categories

    Returns:
        set of category paths at tier, joined by '|' if multi-level taxonomy
    """
    classification_project = project.as_categorization()
    taxonomy = classification_project.taxonomy()
    categories = taxonomy.categories()

    category_set = set()
    if tier > 0:
        for category in categories:
            if len(category.path) == tier:
                category_set.add("|".join(category.path))
    else:
        # leaf nodes
        category_set = _create_leaf_node_set(taxonomy)
    return category_set
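A quick, self-contained illustration of the tier filter above (the category paths are hypothetical and plain Python is used, so no Tamr client is required):

# Hypothetical taxonomy paths
paths = [["Food"], ["Food", "Fruit"], ["Food", "Fruit", "Apple"]]
tier = 2
# Keep only paths of length `tier`, joined with '|' as in _get_categories_at_tier
tier_categories = {"|".join(p) for p in paths if len(p) == tier}
print(tier_categories)  # {'Food|Fruit'}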
Code example #7
File: metrics.py Project: Datatamer/tamr-toolbox
def _check_taxonomy_depth(project: Project, *, tier: int) -> None:
    """
    Checks the maximum depth of the taxonomy associated with the project

    Args:
        project: Tamr project object
        tier: integer specifying the tier from which to extract categories

    Returns:
        None

    Raises:
        ValueError: if tier is greater than maximum taxonomy depth
    """

    # depth check is not required for leaf nodes
    if tier == -1:
        return

    max_depth = 0
    classification_project = project.as_categorization()
    taxonomy = classification_project.taxonomy()
    categories = taxonomy.categories()

    for category in categories:
        if len(category.path) > max_depth:
            max_depth = len(category.path)

    if max_depth < tier:
        invalid_tier_value_error = (
            f"Invalid value for tier {tier}. Maximum depth detected is {max_depth}."
        )
        LOGGER.error(invalid_tier_value_error)
        raise ValueError(invalid_tier_value_error)
Code example #8
def _run_custom(project: Project,
                *,
                run_update_unified_dataset=False) -> List[Operation]:
    """Executes specified steps of a schema mapping project.

    Args:
        project: Target schema mapping project
        run_update_unified_dataset: Whether refresh should be called on the unified dataset

    Returns:
        The operations that were run

    Raises:
        TypeError: if the `project` is not a Schema Mapping project
    """
    if ProjectType[project.type] != ProjectType.SCHEMA_MAPPING_RECOMMENDATIONS:
        error_msg = f"Cannot use as a schema mapping project. Project type: {project.type}"
        LOGGER.error(error_msg)
        raise TypeError(error_msg)

    completed_operations = []
    if run_update_unified_dataset:
        LOGGER.info(
            f"Updating the unified dataset for project {project.name} (id={project.resource_id})."
        )
        op = project.unified_dataset().refresh()
        operation.enforce_success(op)
        completed_operations.append(op)

    return completed_operations
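A minimal sketch of calling this helper (project name hypothetical; `client` assumed to be an already-created Tamr client, e.g. as in code example #12):

# Hypothetical usage: refresh the unified dataset of a schema mapping project
project = client.projects.by_name("my_schema_mapping_project")
ops = _run_custom(project, run_update_unified_dataset=True)
for op in ops:
    print(f"Completed operation {op.resource_id}")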
Code example #9
    def test_refresh_ids(self):
        unified_dataset_url = f"{self._base_url}/projects/1/unifiedDataset"
        datasets_url = f"{self._base_url}/datasets"
        refresh_url = f"{self._base_url}/projects/1/allPublishedClusterIds:refresh"

        responses.add(responses.GET,
                      unified_dataset_url,
                      json=self._unified_dataset_json)
        responses.add(responses.GET, datasets_url, json=self._datasets_json)
        responses.add(responses.POST, refresh_url, json=self._operations_json)

        p = Project(self.tamr, self._project_config_json).as_mastering()
        d = p.published_cluster_ids()

        op = d.refresh(poll_interval_seconds=0)
        self.assertEqual(op.resource_id, self._operations_json["id"])
        self.assertTrue(op.succeeded())
Code example #10
def bootstrap_dataset(
    project: Project, *, source_dataset: Dataset, force_add_dataset_to_project: bool = False
) -> List[AttributeMapping]:
    """
    Bootstraps a dataset (i.e. maps all source columns to themselves)

    Args:
        source_dataset: the source dataset (a Dataset object not a string)
        project: the project in which to do the mapping
        force_add_dataset_to_project: boolean whether to add the dataset to the project
            if it is not already a part of it

    Returns:
        List of the AttributeMappings generated

    Raises:
        RuntimeError: if `source_dataset` is not part of the given `project` and
            `force_add_dataset_to_project` is not set to True
    """

    # check if dataset is in the project - python doesn't handle comparison of Dataset objects
    # well so check on name
    if source_dataset.name not in [x.name for x in project.input_datasets()]:
        if force_add_dataset_to_project:
            LOGGER.info(f"adding dataset {source_dataset.name} to project {project.name}")
            project.add_input_dataset(source_dataset)
        else:
            raise RuntimeError(
                f"dataset {source_dataset.name} not in project {project.name}!"
                + "Set 'force_add_dataset_to_project' flag to True to automatically add it"
            )

    # for each attribute map it
    source_dataset_name = source_dataset.name
    completed_mappings = []
    for attribute in source_dataset.attributes:
        attribute_name = attribute.name
        mapping = map_attribute(
            source_attribute_name=attribute_name,
            source_dataset_name=source_dataset_name,
            unified_attribute_name=attribute_name,
            project=project,
        )
        completed_mappings.append(mapping)

    return completed_mappings
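A minimal usage sketch (dataset and project names are hypothetical; `client` assumed to be an already-created Tamr client):

# Hypothetical usage: map every column of a source dataset to itself
project = client.projects.by_name("my_schema_mapping_project")
dataset = client.datasets.by_name("my_source_dataset")
mappings = bootstrap_dataset(
    project, source_dataset=dataset, force_add_dataset_to_project=True
)
print(f"Created {len(mappings)} attribute mappings")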
Code example #11
File: test_project.py Project: skalish/tamr-client
    def test_project_remove_input_dataset(self):
        dataset_id = self.dataset_json[0]["relativeId"]

        responses.add(responses.GET, self.input_datasets_url, json=self.dataset_json)
        responses.add(
            responses.DELETE, f"{self.input_datasets_url}?id={dataset_id}", status=204
        )
        responses.add(responses.GET, self.input_datasets_url, json=[])

        project = Project(self.tamr, self.project_json[0])
        dataset = next(project.input_datasets().stream())

        response = project.remove_input_dataset(dataset)
        self.assertEqual(response.status_code, 204)

        input_datasets = project.input_datasets()
        self.assertEqual(list(input_datasets), [])
Code example #12
File: test_jobs.py Project: skalish/tamr-toolbox
def test_run_error():
    client = utils.client.create(**CONFIG["toolbox_test_instance"])
    project = Project.from_json(client,
                                resource_json={
                                    "name": "fake",
                                    "type": "NOT_REAL"
                                })

    with pytest.raises(KeyError):
        workflow.jobs.run([project])
Code example #13
    def create(self, creation_spec):
        """
        Create a Project in Tamr

        :param creation_spec: Project creation specification should be formatted as specified in the `Public Docs for Creating a Project <https://docs.tamr.com/reference#create-a-project>`_.
        :type creation_spec: dict[str, str]
        :returns: The created Project
        :rtype: :class:`~tamr_unify_client.project.resource.Project`
        """
        data = self.client.post(self.api_path, json=creation_spec).successful().json()
        return Project.from_json(self.client, data)
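A minimal sketch of calling this collection method, assuming `client.projects` is the project collection of an authenticated client; the spec keys below mirror the project JSON used elsewhere on this page, and all values are hypothetical:

creation_spec = {
    "name": "my_categorization_project",   # hypothetical
    "description": "Example project",
    "type": "CATEGORIZATION",
    "externalId": "my_categorization_project",
}
new_project = client.projects.create(creation_spec)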
Code example #14
def unmap_dataset(
    project: Project,
    *,
    source_dataset: Dataset,
    remove_dataset_from_project: bool = False,
    skip_if_missing: bool = False,
) -> None:
    """
    Wholly unmaps a dataset and optionally removes it from a project.

    Args:
        source_dataset: the source dataset (Dataset object not a string) to unmap
        project: the project in which to unmap the dataset
        remove_dataset_from_project: boolean to also remove the dataset from the project
        skip_if_missing: boolean to skip if the dataset is not in the project. If set to False
            and the dataset is not in the project, a RuntimeError is raised

    Returns:
        None

    Raises:
        RuntimeError: if `source_dataset` is not in `project` and `skip_if_missing` not set to True
    """

    # check to make sure dataset is in project and log a warning if it is not
    if source_dataset.name not in [x.name for x in project.input_datasets()]:
        if skip_if_missing:
            LOGGER.warning(
                f"Dataset to unmap {source_dataset.name} not in project {project.name}! "
                f"However skip_if_missing flag is set so will do nothing"
            )
            return None
        else:
            error_message = (
                f"Dataset to unmap {source_dataset.name} not in project "
                f"{project.name} and skip_if_missing not set to True so failing! "
            )
            LOGGER.error(error_message)
            raise RuntimeError(error_message)

    # the resource ids of attribute mappings unfortunately change when you delete one
    # so need to just do this until there are no mappings left for the source dataset of interest
    while True:
        mappings = [
            x
            for x in project.attribute_mappings().stream()
            if x.input_dataset_name == source_dataset.name
        ]
        # if no mappings found for this dataset then stop
        if not mappings:
            break
        # resource ids shift after each delete, so remove one mapping and re-stream the rest
        project.attribute_mappings().delete_by_resource_id(mappings[0].resource_id)

    # optionally remove dataset from the project
    if remove_dataset_from_project:
        project.remove_input_dataset(source_dataset)
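A minimal usage sketch (names hypothetical; `client` assumed to be an already-created Tamr client):

# Hypothetical usage: remove all mappings for a source dataset and drop it from the project
project = client.projects.by_name("my_schema_mapping_project")
dataset = client.datasets.by_name("my_source_dataset")
unmap_dataset(
    project,
    source_dataset=dataset,
    remove_dataset_from_project=True,
    skip_if_missing=True,
)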
Code example #15
    def test_create_from_spec(self):
        def create_callback(request, snoop):
            snoop["payload"] = json.loads(request.body)
            return 204, {}, json.dumps(self.created_json)

        url = (
            "http://localhost:9100/api/versioned/v1/projects/1/attributeConfigurations"
        )
        snoop_dict = {}
        responses.add_callback(responses.POST, url,
                               partial(create_callback, snoop=snoop_dict))

        configs = Project(self.tamr,
                          self.project_json).attribute_configurations()
        spec = (AttributeConfigurationSpec.new().with_attribute_name(
            self.create_json["attributeName"]).with_enabled_for_ml(
                self.create_json["enabledForMl"]).with_similarity_function(
                    self.create_json["similarityFunction"]))
        create = configs.create(spec.to_dict())

        self.assertEqual(create.relative_id, self.created_json["relativeId"])
        self.assertEqual(snoop_dict["payload"], self.create_json)
Code example #16
def unmap_attribute(
    project: Project,
    *,
    source_attribute_name: str,
    source_dataset_name: str,
    unified_attribute_name: str,
) -> None:
    """
    Unmaps a source attribute.

    Args:
        source_attribute_name: the name of the source attribute to unmap
        source_dataset_name: the name of the source dataset containing that source attribute
        unified_attribute_name: the unified attribute from which to unmap
        project: the project in which to unmap the attribute

    Returns:
        None
    """

    LOGGER.info(
        f"Trying to remove mapping of source attribute {source_attribute_name} in dataset "
        f"{source_dataset_name} from unified attribute {unified_attribute_name}"
    )

    # get mapping collection
    mapping_collection = project.attribute_mappings()

    # run through and get the resource id of the mapping to remove
    resource_id_to_remove = None
    for mapping in mapping_collection.stream():
        # consider it match if all of source attribute, source dataset and unified attribute
        # are equal
        if (
            source_attribute_name == mapping.input_attribute_name
            and source_dataset_name == mapping.input_dataset_name
            and unified_attribute_name == mapping.unified_attribute_name
        ):
            resource_id_to_remove = mapping.resource_id
            break

    # log warning if resource id wasn't found
    if resource_id_to_remove is None:
        LOGGER.warning(
            f"Mapping of {source_attribute_name} in dataset {source_dataset_name} to "
            f"unified attribute {unified_attribute_name} not found!"
        )
    # if found remove it
    else:
        mapping_collection.delete_by_resource_id(resource_id_to_remove)
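A minimal usage sketch (attribute and dataset names hypothetical; `client` assumed to be an already-created Tamr client):

# Hypothetical usage: remove a single attribute mapping
project = client.projects.by_name("my_schema_mapping_project")
unmap_attribute(
    project,
    source_attribute_name="first_name",
    source_dataset_name="my_source_dataset",
    unified_attribute_name="name",
)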
Code example #17
    def test_get_versions(self):
        def create_callback(request, snoop):
            snoop["payload"] = request.body
            return 200, {}, "\n".join(
                json.dumps(c) for c in self._versions_json)

        p = Project.from_json(self.tamr, self._project_json).as_mastering()
        post_url = f"http://localhost:9100/api/versioned/v1/{p.api_path}/publishedClusterVersions"
        snoop = {}
        responses.add_callback(responses.POST, post_url,
                               partial(create_callback, snoop=snoop))

        clusters = list(p.published_cluster_versions(self._cluster_ids))
        expected_clusters = [PublishedCluster(c) for c in self._versions_json]

        self.assertEqual(snoop["payload"],
                         "\n".join([json.dumps(i) for i in self._cluster_ids]))
        self.assertEqual(len(clusters), len(expected_clusters))
        for actual, expected in zip(clusters, expected_clusters):
            self.assertEqual(actual.__repr__(), expected.__repr__())
            self.assertEqual(len(actual.versions), len(expected.versions))
Code example #18
def _run_custom(project: Project,
                *,
                run_update_unified_dataset=False,
                process_asynchronously: bool = False) -> List[Operation]:
    """Executes specified steps of a schema mapping project.

    Args:
        project: Target schema mapping project
        run_update_unified_dataset: Whether refresh should be called on the unified dataset
        process_asynchronously: Whether to return without waiting for the refresh job to finish;
            must be set to True for concurrent workflows

    Returns:
        The operations that were run

    Raises:
        TypeError: if the `project` is not a Schema Mapping project
    """
    if ProjectType[project.type] != ProjectType.SCHEMA_MAPPING_RECOMMENDATIONS:
        error_msg = f"Cannot use as a schema mapping project. Project type: {project.type}"
        LOGGER.error(error_msg)
        raise TypeError(error_msg)

    completed_operations = []
    if run_update_unified_dataset:
        LOGGER.info(
            f"Updating the unified dataset for project {project.name} (id={project.resource_id})."
        )
        op = project.unified_dataset().refresh(
            asynchronous=process_asynchronously)

        if not process_asynchronously:
            operation.enforce_success(op)
        completed_operations.append(op)

    return completed_operations
Code example #19
File: metrics.py Project: Datatamer/tamr-toolbox
def get_tier_confidence(
        project: Project,
        *,
        tier: int = -1,
        allow_dataset_refresh: bool = False) -> data_type.JsonDict:
    """
    Extracts tier-specific average confidence from a Tamr internal dataset
    `<unified dataset name>_classifications_average_confidences` in a dictionary

    Args:
        project: Tamr project object
        tier: integer specifying the tier to extract the average confidence;
              default value will return the average confidence at all leaf categories
        allow_dataset_refresh: if True, allows running a job to refresh dataset to make it
                               streamable

    Returns:
        dictionary - keys are category paths, joined by '|' if multi-level taxonomy and values are
        average confidence of the corresponding keys

    Raises:
        RuntimeError: if `dataset` is not streamable and `allow_dataset_refresh` is False;
        TypeError: if tier is not of type int;
                   or if the project type is not classification
        ValueError: if tier is less than -1 or equal to 0
    """
    LOGGER.info(
        f"Retrieving average confidence for taxonomy nodes in project {project.name} "
        f"(id={project.resource_id}).")
    # check project type is categorization
    try:
        project = project.as_categorization()
    except TypeError:
        not_categorization_error = f"Project {project.name} is not a classification project."
        LOGGER.error(not_categorization_error)
        raise TypeError(not_categorization_error)

    # check necessary dataset can be obtained
    dataset = _get_dataset_with_confidence(project)

    # check tier is valid
    if type(tier) is not int:
        wrong_tier_type_error = f"Tier {tier} is not an integer."
        LOGGER.error(wrong_tier_type_error)
        raise TypeError(wrong_tier_type_error)
    if tier < -1 or tier == 0:
        invalid_tier_value_error = (
            f"Invalid value for tier {tier}. Tier cannot be 0 or less than -1."
        )
        LOGGER.error(invalid_tier_value_error)
        raise ValueError(invalid_tier_value_error)

    # check dataset can be streamed
    if not dataset.status().is_streamable:
        if allow_dataset_refresh:
            LOGGER.info(
                f"Refreshing dataset {dataset.name} to make streamable.")
            op = dataset.refresh()
            operation.enforce_success(op)
        else:
            cannot_stream_error = (
                f"Dataset {dataset.name} is not streamable. "
                f"Refresh it first, or run with allow_dataset_refresh=True")
            LOGGER.error(cannot_stream_error)
            raise RuntimeError(cannot_stream_error)

    # check dataset contains necessary attributes
    _check_dataset_with_confidence(dataset)

    # check tier does not exceed maximum taxonomy depth
    _check_taxonomy_depth(project, tier=tier)

    # obtain categories at tier
    selected_category_set = _get_categories_at_tier(project, tier=tier)

    # extract average confidence
    tier_confidence_dict = _extract_confidence(
        dataset=dataset, category_set=selected_category_set)
    return tier_confidence_dict
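A minimal usage sketch (project name hypothetical; `client` assumed to be an already-created Tamr client, e.g. as in code example #12):

# Hypothetical usage: average confidence per leaf category, then per tier-2 category
project = client.projects.by_name("my_classification_project")
leaf_confidence = get_tier_confidence(project, allow_dataset_refresh=True)
tier2_confidence = get_tier_confidence(project, tier=2)
print(len(leaf_confidence), len(tier2_confidence))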
Code example #20
def map_attribute(
    project: Project,
    *,
    source_attribute_name: str,
    source_dataset_name: str,
    unified_attribute_name: str,
) -> AttributeMapping:
    """
    Maps source_attribute in source_dataset to unified_attribute in unified_dataset.
    If the mapping already exists it will log
    a warning and return the existing AttributeMapping from the project's collection.

    Args:
        source_attribute_name: Source attribute name to map
        source_dataset_name: Source dataset containing the source attribute
        unified_attribute_name: Unified attribute to which to map the source attribute
        project: The project in which to perform the mapping

    Returns:
        AttributeMapping that was created

    Raises:
        ValueError: if input variables `source_attribute_name` or `source_dataset_name` or
            `unified_attribute_name` are set to empty strings;
            or if the dataset `source_dataset_name` is not found on Tamr;
            or if `source_attribute_name` is missing from the attributes of `source_dataset_name`
    """
    # simple validation, nothing should be empty
    variables = [source_attribute_name, source_dataset_name, unified_attribute_name]
    empty_variables = [x for x in variables if x == ""]
    if empty_variables:
        empty_variable_string = ", ".join(empty_variables)
        error_message = (
            f"The following variables are set to empty strings and "
            f"need to be filled in: {empty_variable_string} !"
        )
        LOGGER.error(error_message)
        raise ValueError(error_message)

    # also validate that the dataset exists and has this column
    try:
        source_dataset = project.client.datasets.by_name(source_dataset_name)
    except KeyError:
        error_msg = f"Dataset {source_dataset_name} not found!"
        LOGGER.error(error_msg)
        raise ValueError(error_msg)

    # use an explicit membership check (an assert would be stripped under `python -O`)
    if source_attribute_name not in [x.name for x in source_dataset.attributes]:
        error_msg = f"Attribute {source_attribute_name} not found in {source_dataset_name}!"
        LOGGER.error(error_msg)
        raise ValueError(error_msg)

    # generate mapping spec
    mapping_spec = _get_mapping_spec_for_ud(
        source_attr_name=source_attribute_name,
        source_ds_name=source_dataset_name,
        unified_attr_name=unified_attribute_name,
        unified_ds_name=project.unified_dataset().name,
    )

    # add the mapping to the project's collection - this is what does the actual mapping
    try:
        return project.attribute_mappings().create(mapping_spec.to_dict())
    except JSONDecodeError as e:
        # can get a jsondecode error if the attribute is already mapped.
        # If it is, then log a warning and return empty mapping
        # if it is not already mapped break loudly
        m: AttributeMapping
        for m in project.attribute_mappings().stream():
            if (
                m.input_dataset_name == source_dataset_name
                and m.input_attribute_name == source_attribute_name
                and m.unified_attribute_name == unified_attribute_name
            ):
                # mapping exists, log warning and return existing mapping
                LOGGER.warning(
                    f"mapping of attribute {source_attribute_name} in dataset "
                    f"{source_dataset_name} to unified attribute {unified_attribute_name} "
                    f"already exists! Returning existing mapping spec"
                )
                return m

        # if haven't returned then throw the JSONDecodeError
        raise e
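A minimal usage sketch (names hypothetical; `client` assumed to be an already-created Tamr client):

# Hypothetical usage: map one source attribute into a unified attribute
project = client.projects.by_name("my_schema_mapping_project")
mapping = map_attribute(
    project,
    source_attribute_name="first_name",
    source_dataset_name="my_source_dataset",
    unified_attribute_name="name",
)
print(f"Mapped to unified attribute {mapping.unified_attribute_name}")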
Code example #21
File: test_project.py Project: minroh/tamr-client
    def test_project_get_input_datasets(self):
        p = Project(self.tamr, self.project_json[0])
        datasets = p.input_datasets()
        self.assertEqual(datasets.api_path, "projects/1/inputDatasets")
Code example #22
def from_taxonomy(
    project: Project,
    export_file_path: Union[Path, str],
    *,
    csv_delimiter: str = ",",
    flatten_delimiter: str = "|",
    quote_character: str = '"',
    quoting: int = csv.QUOTE_MINIMAL,
    overwrite: bool = False,
) -> int:
    """
    Export a Tamr taxonomy to a csv file. Records are streamed to disk and written according to a
    given buffer size.

    Args:
        project: Tamr Project object
        export_file_path: Path to the csv file where the dataset will be saved
        csv_delimiter: Delimiter of the csv file
        flatten_delimiter: Flatten list types to strings by concatenating with this delimiter
        quote_character: Character used to escape value for csv delimiter when it appears in the
            value.
        quoting: The escape strategy to use according to the Python csv writer.
            See https://docs.python.org/2/library/csv.html#csv.QUOTE_MINIMAL
        overwrite: if True and export_file_name already exists, overwrite the file.
            Otherwise throw an error

    Returns:
        The total number of records written

    Raises:
        FileExistsError: if `export_file_path` exists and `overwrite` is set to False
        IOError: if the specified filepath does not exist or cannot be accessed
        RuntimeError: if the classification project is not yet associated with a taxonomy or
                      taxonomy cannot be written to a csv file
        TypeError: if the project type is not classification
        ValueError: if `csv_delimiter` and `flatten_delimiter` are identical values
    """
    LOGGER.info(
        f"Streaming taxonomy to csv file {export_file_path} from project {project.name} "
        f"(project id={project.resource_id}).")

    try:
        project = project.as_categorization()
    except TypeError:
        not_categorization_error = f"Project {project.name} is not a classification project."
        LOGGER.error(not_categorization_error)
        raise TypeError(not_categorization_error)

    if os.path.exists(export_file_path):
        if not overwrite:
            message = (
                f"CSV file {export_file_path} already exists. "
                f"(Set 'overwrite' flag to True if you wish to overwrite)")
            LOGGER.error(message)
            raise FileExistsError(message)
        else:
            LOGGER.warning(
                f"CSV file {export_file_path} already exists and will be overwritten"
            )

    if csv_delimiter == flatten_delimiter:
        message = (
            f"The CSV delimiter '{csv_delimiter}' cannot be identical to the list "
            f"flattening delimiter '{flatten_delimiter}'")
        LOGGER.error(message)
        raise ValueError(message)

    try:
        taxonomy = project.as_categorization().taxonomy()
    except requests.exceptions.RequestException:
        no_taxonomy_error = f"Project {project.name} is not associated with any taxonomy yet."
        LOGGER.error(no_taxonomy_error)
        raise RuntimeError(no_taxonomy_error)

    # obtain categories and store in a list
    categories = taxonomy.categories()
    taxonomy_list = []
    for category in categories:
        taxonomy_list.append(category.path)

    # sort the categories
    taxonomy_list.sort()

    # Open CSV file and use newline='' as recommended by
    # https://docs.python.org/3/library/csv.html#csv.writer

    try:
        f = open(export_file_path, "w", newline="", encoding="utf-8")
    except (FileNotFoundError, IOError, PermissionError):
        cannot_open_error = f"File path {export_file_path} could not be opened for writing."
        LOGGER.error(cannot_open_error)
        raise IOError(cannot_open_error)
    else:
        try:
            csv_writer = csv.writer(
                f,
                delimiter=csv_delimiter,
                quotechar=quote_character,
                quoting=quoting,
            )
            csv_writer.writerows(taxonomy_list)
        except csv.Error as e:
            general_error = (
                "Encountered an error while writing taxonomy categories to "
                f"{export_file_path}: {e}")
            LOGGER.error(general_error)
            raise RuntimeError(general_error)
        finally:
            f.close()

    records_written = len(taxonomy_list)
    LOGGER.info(
        f"Wrote {records_written} categories from {project.name} taxonomy (project id"
        f"={project.resource_id}) to {export_file_path}")
    return records_written
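A minimal usage sketch (project name and output path hypothetical; `client` assumed to be an already-created Tamr client):

# Hypothetical usage: dump the project's taxonomy to a pipe-flattened CSV
project = client.projects.by_name("my_classification_project")
count = from_taxonomy(
    project,
    "taxonomy_export.csv",
    flatten_delimiter="|",
    overwrite=True,
)
print(f"Exported {count} taxonomy categories")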