Beispiel #1
0
def delete_attributes(
    *,
    dataset: Dataset,
    attributes: Iterable[str] = None,
) -> Dataset:
    """Remove attributes from dataset by attribute name

    Every requested name is validated before anything is deleted, so an unknown or
    primary-key name is rejected before the first deletion is attempted. The names are read
    from ``attributes`` exactly once, which makes one-shot iterables (e.g. generators) work,
    and a name that is listed more than once is only deleted once.

    Args:
        dataset: An existing TUC dataset
        attributes: iterable of attribute names to delete from dataset

    Returns:
        The dataset object that was passed in

    Raises:
        ValueError: If the dataset is not a source dataset
        ValueError: If a passed attribute does not exist in the dataset
        ValueError: If a passed attribute is a primary key and can't be removed
        TypeError: If the attributes argument is not an Iterable
    """
    dataset_name = dataset.name
    if dataset.upstream_datasets():
        raise ValueError(f"{dataset_name} is not a source dataset")

    # Check input type is correct
    if not isinstance(attributes, Iterable):
        raise TypeError("attributes arg must be an Iterable")

    # The names are needed twice (validation, then deletion). Materialise them once so a
    # generator is not silently exhausted by the validation pass, and drop repeated names
    # (keeping first-seen order) so no attribute is deleted twice.
    names_to_delete = list(dict.fromkeys(attributes))

    # Get current dataset attributes
    target_attribute_dict = {attr.name: attr for attr in dataset.attributes}
    primary_keys = dataset.spec().to_dict()["keyAttributeNames"]

    # Check all attributes exist before starting to remove any
    for attribute_name in names_to_delete:
        if attribute_name not in target_attribute_dict:
            raise ValueError(
                f"The attribute '{attribute_name}' does not exist in {dataset_name}"
            )
        if attribute_name in primary_keys:
            # Can not edit a primary key
            raise ValueError(
                f"The attribute '{attribute_name}' is a primary key and can't be removed"
            )

    # Remove attributes from dataset
    for attribute_name in names_to_delete:
        dataset.attributes.delete_by_resource_id(
            target_attribute_dict[attribute_name].resource_id
        )
        LOGGER.info(f"Deleted attribute '{attribute_name}' in {dataset_name}")

    return dataset
Beispiel #2
0
def _request_upstream_datasets(dataset: Dataset) -> Dataset:
    """Fetch the Dataset objects that are upstream of a dataset

    Args:
        dataset: a Tamr Dataset Object

    Returns:
        A list holding one dataset per entry reported by ``dataset.upstream_datasets()``,
        each one looked up by resource id through the dataset's own client
    """
    # upstream_datasets() yields references that only carry a resource id; resolve each
    # reference into a dataset through the client.
    return [
        dataset.client.datasets.by_resource_id(upstream_ref.resource_id)
        for upstream_ref in dataset.upstream_datasets()
    ]
Beispiel #3
0
def edit_attributes(
    *,
    dataset: Dataset,
    attribute_types: Optional[Dict[str, attribute_type.AttributeType]] = None,
    attribute_descriptions: Optional[Dict[str, str]] = None,
    override_existing_types: bool = True,
) -> Dataset:
    """Change the type and/or description of attributes that already exist in a dataset

    Every attribute that appears as a key of attribute_types or attribute_descriptions is
    processed. An attribute without an entry in attribute_types keeps its current type. When
    the requested type equals the current type, only the description is updated (if one was
    passed for that attribute). When the requested type differs, the attribute is deleted and
    re-created with the new type, but only if override_existing_types is True; otherwise the
    mismatch is logged and the attribute is left untouched.

    Args:
        dataset: An existing TUC dataset
        attribute_types: dictionary of new types, attribute name is the key and AttributeType
            is the value
        attribute_descriptions: dictionary of new descriptions, attribute name is the key and
            the attribute description is the value
        override_existing_types: bool flag, when true an attribute whose type changes is
            deleted and re-created with the new type

    Returns:
        The dataset object that was passed in

    Raises:
        requests.HTTPError: If any HTTP error is encountered
        ValueError: If the dataset is not a source dataset
        ValueError: If a passed attribute does not exist in the dataset
        ValueError: If a passed attribute is a primary key and can't be updated
        ValueError: If both attribute_types and attribute_descriptions are None
    """
    dataset_name = dataset.name
    if dataset.upstream_datasets():
        raise ValueError(f"{dataset_name} is not a source dataset")

    # At least one of the two dictionaries has to be supplied
    if attribute_types is None and attribute_descriptions is None:
        raise ValueError(
            """Updates to attributes must be passed in via attribute_types
            or attribute_descriptions arguments"""
        )

    # Names to edit: the union of the keys of both dictionaries
    names_to_edit = set(attribute_types or ()) | set(attribute_descriptions or ())

    # Snapshot of the attributes currently in the dataset, keyed by name
    current_by_name = {attr.name: attr for attr in dataset.attributes}
    primary_keys = dataset.spec().to_dict()["keyAttributeNames"]

    # Validate every name before any attribute is touched
    for attribute_name in names_to_edit:
        if attribute_name not in current_by_name:
            raise ValueError(
                f"An attribute with name '{attribute_name}' does not exist in {dataset_name}"
            )
        if attribute_name in primary_keys:
            # Primary keys can not be edited
            raise ValueError(
                f"The attribute '{attribute_name}' is a primary key and can't be updated"
            )

    for attribute_name in names_to_edit:
        requested_spec = _make_spec_dict(
            attribute_name=attribute_name,
            attribute_types=attribute_types,
            attribute_descriptions=attribute_descriptions,
        )
        current_spec = current_by_name[attribute_name].spec()

        # Without an explicit entry in attribute_types the current type is the requested one
        type_given = attribute_types is not None and attribute_name in attribute_types
        requested_type_json = (
            requested_spec["type"] if type_given else current_spec.to_dict()["type"]
        )
        requested_type = attribute_type.from_json(requested_type_json)
        current_type = attribute_type.from_json(current_spec.to_dict()["type"])

        if requested_type == current_type:
            # Same type: the description is the only thing that can change
            description_given = (
                attribute_descriptions is not None
                and attribute_name in attribute_descriptions
            )
            if description_given:
                current_spec.with_description(
                    attribute_descriptions[attribute_name]
                ).put()
            else:
                LOGGER.info(
                    f"There are no updates to the attribute '{attribute_name}' in {dataset_name}"
                )
            continue

        if not override_existing_types:
            # Type differs but overriding is disabled: report it and move on
            LOGGER.info(
                f"""The attribute '{attribute_name}' in {dataset_name} curently has
                 the type '{str(current_type)}'. Set 'override_existing_types' to
                 True to update the type to '{str(requested_type)}'
                """)
            continue

        # Type differs and may be overridden: start from the current spec, swap in the new
        # type and, when the requested spec carries one, the new description
        replacement_spec = current_spec.to_dict()
        replacement_spec["type"] = requested_spec["type"]
        if "description" in requested_spec:
            replacement_spec["description"] = requested_spec["description"]

        # The type is changed by removing the attribute and adding it back with the new spec
        dataset.attributes.delete_by_resource_id(
            current_by_name[attribute_name].resource_id
        )
        dataset.attributes.create(replacement_spec)
        LOGGER.info(f"Updated attribute '{attribute_name}' in {dataset_name}")

    return dataset
Beispiel #4
0
def create_attributes(
    *,
    dataset: Dataset,
    attributes: Iterable[str],
    attribute_types: Optional[Dict[str, attribute_type.AttributeType]] = None,
    attribute_descriptions: Optional[Dict[str, str]] = None,
) -> Dataset:
    """Create new attributes in a dataset

    The spec of each new attribute is built by _make_spec_dict from the attribute name and the
    attribute_types / attribute_descriptions dictionaries. Non-default attribute types must be
    defined in the attribute_types dictionary. Any attribute descriptions can be specified in
    the attribute_descriptions dictionary.

    Every requested name is checked against the dataset before anything is created. The names
    are read from ``attributes`` exactly once, which makes one-shot iterables (e.g. generators)
    work, and a name that is listed more than once is only created once.

    Args:
        dataset: An existing TUC dataset
        attributes: iterable of attribute names to be added to dataset
        attribute_types: dictionary for non-default types, attribute name is the key and
            AttributeType is the value
        attribute_descriptions: dictionary for attribute descriptions, attribute name is the
            key and the attribute description is the value

    Returns:
        The dataset object that was passed in

    Raises:
        requests.HTTPError: If any HTTP error is encountered
        TypeError: If the attributes argument is not an Iterable
        ValueError: If the dataset is not a source dataset
        ValueError: If an attribute passed in already exists in the dataset

    """
    dataset_name = dataset.name
    if dataset.upstream_datasets():
        raise ValueError(f"{dataset_name} is not a source dataset")

    # Check input type is correct
    if not isinstance(attributes, Iterable):
        raise TypeError("attributes arg must be an Iterable")

    # The names are needed twice (validation, then creation). Materialise them once so a
    # generator is not silently exhausted by the validation pass, and drop repeated names
    # (keeping first-seen order) so no attribute is created twice.
    new_attribute_names = list(dict.fromkeys(attributes))

    # Get current dataset attributes; a set keeps each membership test O(1)
    existing_attributes = {attr.name for attr in dataset.attributes}

    # Check that none of the new attribute names already exist
    for attribute_name in new_attribute_names:
        if attribute_name in existing_attributes:
            # This attribute already exists
            raise ValueError(
                f"An attribute with name '{attribute_name}' already exists in {dataset_name}"
            )

    # Add attributes to dataset
    for attribute_name in new_attribute_names:
        attr_spec_dict = _make_spec_dict(
            attribute_name=attribute_name,
            attribute_types=attribute_types,
            attribute_descriptions=attribute_descriptions,
        )
        dataset.attributes.create(attr_spec_dict)
        LOGGER.info(f"Created attribute '{attribute_name}' in {dataset_name}")

    return dataset
Beispiel #5
0
def update(
    dataset: Dataset,
    *,
    attributes: Optional[Iterable[str]] = None,
    attribute_types: Optional[Dict[str, attribute_type.AttributeType]] = None,
    attribute_descriptions: Optional[Dict[str, str]] = None,
    description: Optional[str] = None,
    tags: Optional[List[str]] = None,
    override_existing_types: bool = False,
) -> Dataset:
    """Flexibly update a source dataset in Tamr

    All the attributes that should exist in the dataset must be defined in the attributes
    argument. This function will add/remove attributes in the dataset until the dataset
    attributes match the set of attributes passed in as an argument; primary key attributes are
    never edited or removed. An existing attribute that has no entry in attribute_types is
    compared against attribute_type.DEFAULT. To set non-default attribute types, they must be
    defined in the attribute_types dictionary. Any attribute descriptions can be specified in
    the attribute_descriptions dictionary; an existing attribute without an entry there keeps
    its current description. By default, the existing attribute types will not change unless
    override_existing_types is set to True. When False, the attribute type updates will only be
    logged.

    The attribute names are read from ``attributes`` exactly once, so one-shot iterables (e.g.
    generators) are supported and repeated names are ignored. When ``attributes`` is None or
    empty, the dataset's attributes are left untouched.

    Args:
        dataset: An existing TUC dataset
        attributes: Complete list of attribute names that should exist in the updated dataset
        attribute_types: dictionary for non-default types, attribute name is the key and
            AttributeType is the value
        attribute_descriptions: dictionary for attribute descriptions, attribute name is the
            key and the attribute description is the value
        description: updated description of dataset, if None (or empty) will not update the
            description
        tags: updated tags for the dataset, if None (or empty) will not update tags
        override_existing_types: boolean flag, when true will alter existing attribute's types

    Returns:
        The dataset object that was passed in

    Raises:
        requests.HTTPError: If any HTTP error is encountered
        ValueError: If the dataset is not a source dataset
        TypeError: If the attributes argument is not an Iterable

    Example:
        >>> import tamr_toolbox as tbox
        >>> from tamr_toolbox.models import attribute_type
        >>> tamr_client = tbox.utils.client.create(**instance_connection_info)
        >>> dataset = tamr_client.datasets.by_name("my_dataset_name")
        >>> tbox.dataset.manage.update(
        ...     dataset=dataset,
        ...     attributes=["unique_id", "name", "address", "total_sales"],
        ...     attribute_types={"total_sales": attribute_type.ARRAY(attribute_type.DOUBLE)},
        ...     override_existing_types=True,
        ... )
    """
    dataset_name = dataset.name
    if dataset.upstream_datasets():
        raise ValueError(f"{dataset_name} is not a source dataset")

    # Check input type is correct
    if attributes is not None and not isinstance(attributes, Iterable):
        raise TypeError("attributes arg must be an Iterable")

    # The requested names are looped over AND used for membership tests below. Read them once
    # so that a generator is not exhausted by the first pass (which would make every existing
    # attribute look "not requested" and get deleted), and drop repeated names so that no
    # attribute is created twice.
    requested_names = [] if attributes is None else list(dict.fromkeys(attributes))

    dataset_spec = dataset.spec()
    primary_keys = dataset_spec.to_dict()["keyAttributeNames"]

    # Update description and tags; only send the spec when something actually changed
    spec_changed = False
    if description:
        dataset_spec = dataset_spec.with_description(description)
        LOGGER.info(f"Updating description for {dataset_name}")
        spec_changed = True
    if tags:
        dataset_spec = dataset_spec.with_tags(tags)
        LOGGER.info(f"Updating tags for {dataset_name}")
        spec_changed = True
    if spec_changed:
        dataset_spec.put()

    if requested_names:
        requested_lookup = set(requested_names)

        # Get current dataset attributes
        existing_attributes = [attr.name for attr in dataset.attributes]
        existing_lookup = set(existing_attributes)

        # Update attributes in dataset
        for attribute_name in requested_names:
            if attribute_name in primary_keys:
                # Primary keys are never edited
                continue

            if attribute_name in existing_lookup:
                # This attribute already exists, update to new type
                type_dict = {
                    attribute_name: (attribute_types or {}).get(
                        attribute_name, attribute_type.DEFAULT
                    )
                }
                # Only forward a description the caller actually supplied. Forwarding
                # {name: None} would make edit_attributes push a None description for
                # every attribute that has no entry in attribute_descriptions.
                if attribute_descriptions and attribute_name in attribute_descriptions:
                    desc_dict = {attribute_name: attribute_descriptions[attribute_name]}
                else:
                    desc_dict = None

                edit_attributes(
                    dataset=dataset,
                    attribute_types=type_dict,
                    attribute_descriptions=desc_dict,
                    override_existing_types=override_existing_types,
                )
            else:
                # This attribute does not already exist, create
                create_attributes(
                    dataset=dataset,
                    attributes=[attribute_name],
                    attribute_types=attribute_types,
                    attribute_descriptions=attribute_descriptions,
                )

        # Remove any attributes from dataset that aren't in the new list of attributes
        obsolete_attributes = [
            attribute_name
            for attribute_name in existing_attributes
            if attribute_name not in requested_lookup and attribute_name not in primary_keys
        ]
        if obsolete_attributes:
            delete_attributes(dataset=dataset, attributes=obsolete_attributes)

    return dataset