Esempio n. 1
0
    def remove_semantic_tags(self, semantic_tags):
        """Strip the requested semantic tags from the given columns and update the
        Woodwork typing information. Removing the `index` or `time_index` tag
        sets the Woodwork index or time index to None for the DataFrame.

        Args:
            semantic_tags (dict[str -> str/list/set]): Maps each column name in the
                DataFrame to the tag(s) that should be removed from that column.
        """
        _check_semantic_tags(self.columns.keys(), semantic_tags)
        for column_name, requested in semantic_tags.items():
            type_standard_tags = self.logical_types[column_name].standard_tags
            removal_set = _convert_input_to_set(requested)
            # Snapshot the tags so we can tell afterwards whether "index" was dropped.
            tags_before = self.semantic_tags[column_name].copy()

            column = self.columns[column_name]
            column._remove_semantic_tags(removal_set, column_name)

            # Only columns that use standard tags and just lost their index tag
            # need any further work.
            if not self.use_standard_tags[column_name]:
                continue
            if "index" not in tags_before:
                continue
            if "index" in column.semantic_tags:
                continue

            # Put back every standard tag the caller did not explicitly remove.
            kept_standard_tags = type_standard_tags.difference(removal_set)
            column.semantic_tags = self.semantic_tags[column_name].union(
                kept_standard_tags)
Esempio n. 2
0
    def reset_semantic_tags(self, columns=None, retain_index_tags=False):
        """Restore the default semantic tags on the given columns.

        The default is either an empty set or the standard tags of the column's
        logical type, depending on the use_standard_tags setting. Columns may be
        named with a single string, a list of strings or a set of strings; when
        no columns are given, every column is reset.

        Args:
            columns (str/list/set, optional): The columns whose semantic tags should be reset.
            retain_index_tags (bool, optional): If True, any index or time_index
                semantic tag on a column is put back after the reset. If False,
                all semantic tags are cleared. Defaults to False.

        Raises:
            ColumnNotPresentError: If any requested column is not present.
        """
        columns = _convert_input_to_set(columns, "columns")
        missing = sorted(columns.difference(self.columns.keys()))
        if missing:
            raise ColumnNotPresentError(missing)

        # An empty selection means "all columns".
        targets = columns if columns else self.columns.keys()

        for col_name in targets:
            tags_before = self.semantic_tags[col_name]
            self.columns[col_name]._reset_semantic_tags()

            if not retain_index_tags:
                continue
            if "index" in tags_before:
                self._set_index_tags(col_name)
            if "time_index" in tags_before:
                self._set_time_index_tags(col_name)
Esempio n. 3
0
    def remove_semantic_tags(self, semantic_tags):
        """Return a new column that lacks the specified semantic tags.

        Args:
            semantic_tags (str/list/set): Semantic tag(s) to remove from the column.

        Returns:
            woodwork.DataColumn: DataColumn with specified tags removed.

        Raises:
            LookupError: If any requested tag is not currently on the column.
        """
        removal_set = _convert_input_to_set(semantic_tags)

        missing = sorted(removal_set.difference(self._semantic_tags))
        if missing:
            joined = ', '.join(missing)
            raise LookupError(
                f"Semantic tag(s) '{joined}' not present on column '{self.name}'"
            )

        # Warn when tags that come from the logical type are being taken away.
        removed_standard = sorted(
            removal_set.intersection(self._logical_type.standard_tags))
        if removed_standard and self.use_standard_tags:
            message = StandardTagsRemovalWarning().get_warning_message(
                removed_standard, self.name)
            warnings.warn(message, StandardTagsRemovalWarning)

        remaining = self._semantic_tags.difference(removal_set)
        # The new column is built with standard tags disabled.
        return DataColumn(
            series=self._series,
            logical_type=self.logical_type,
            semantic_tags=remaining,
            use_standard_tags=False,
        )
Esempio n. 4
0
    def set_semantic_tags(self, semantic_tags, retain_index_tags=True):
        """Build a new DataColumn whose semantic tags are replaced by the given ones.

        Args:
            semantic_tags (str/list/set): New semantic tag(s) to set for column
            retain_index_tags (bool, optional): If True, an 'index' or 'time_index'
                tag already on this column is carried over to the new column.
                If False, all tags are replaced. Defaults to True.

        Returns:
            woodwork.DataColumn: DataColumn with specified semantic tags.
        """
        requested = _convert_input_to_set(semantic_tags)
        _validate_tags(requested)

        # Record the index state of the current column before building the new one.
        had_index = 'index' in self._semantic_tags
        had_time_index = 'time_index' in self._semantic_tags

        replacement = DataColumn(
            series=self._series,
            logical_type=self.logical_type,
            semantic_tags=requested,
            use_standard_tags=self.use_standard_tags,
        )
        if replacement.use_standard_tags:
            standard = replacement._logical_type.standard_tags
            replacement._semantic_tags = replacement._semantic_tags.union(standard)

        if retain_index_tags:
            if had_index:
                replacement._set_as_index()
            if had_time_index:
                replacement._set_as_time_index()
        return replacement
Esempio n. 5
0
    def _get_column_tags(self, semantic_tags, validate):
        """Normalize the given tags to a set, adding the logical type's standard
        tags when use_standard_tags is enabled.

        Args:
            semantic_tags (str/list/set): Semantic tag(s) supplied for the column.
            validate (bool): Forwarded to the tag conversion helper.

        Returns:
            set: The resulting semantic tags.

        Raises:
            ValueError: If standard tags are requested but logical_type is None.
        """
        tags = _convert_input_to_set(
            semantic_tags, error_language="semantic_tags", validate=validate
        )

        if not self.use_standard_tags:
            return tags

        if self.logical_type is None:
            raise ValueError("Cannot use standard tags when logical_type is None")
        return tags.union(self.logical_type.standard_tags)
Esempio n. 6
0
    def _set_semantic_tags(self, semantic_tags):
        """Overwrite the current semantic tags with the given ones. When
        use_standard_tags is True, the logical type's standard tags are
        included as well.

        Args:
            semantic_tags (str/list/set): New semantic tag(s) to set
        """
        tags = _convert_input_to_set(semantic_tags)
        self.semantic_tags = (
            tags.union(self.logical_type.standard_tags)
            if self.use_standard_tags
            else tags
        )
Esempio n. 7
0
    def add_semantic_tags(self, semantic_tags):
        """Attach additional semantic tags to columns and update the Woodwork
        typing information. Tags already on a column are kept.

        Args:
            semantic_tags (dict[str -> str/list/set]): Maps each column name in the
                DataFrame to the tag(s) that should be added to that column.
        """
        _check_semantic_tags(self.columns.keys(), semantic_tags)
        for column_name, raw_tags in semantic_tags.items():
            new_tags = _convert_input_to_set(raw_tags)
            # Index tags are rejected here by the validation helper.
            _validate_not_setting_index_tags(new_tags, column_name)
            column = self.columns[column_name]
            column._add_semantic_tags(new_tags, column_name)
Esempio n. 8
0
def test_validation_methods_called(mock_validate_input_type, mock_validate_strings):
    """Check that ``_convert_input_to_set`` only triggers its validators when
    ``validate=True``.

    Both arguments are mock objects supplied from outside this function
    (presumably by ``patch`` decorators on the validation helpers -- confirm
    against the decorators above this definition).
    """
    # Nothing has been called before the first conversion.
    assert not mock_validate_input_type.called
    assert not mock_validate_strings.called

    # Single string, validation off: the input-type validator stays untouched.
    _convert_input_to_set("test_tag", validate=False)
    assert not mock_validate_input_type.called

    # Single string, validation on: the input-type validator is invoked.
    _convert_input_to_set("test_tag", validate=True)
    assert mock_validate_input_type.called

    # List input, validation off: the string validator is still uncalled, which
    # also means the validated single-string call above did not reach it.
    _convert_input_to_set(["test_tag", "tag2"], validate=False)
    assert not mock_validate_strings.called

    # List input, validation on: the string validator is invoked.
    _convert_input_to_set(["test_tag", "tag2"], validate=True)
    assert mock_validate_strings.called
Esempio n. 9
0
def _set_semantic_tags(semantic_tags, standard_tags, use_standard_tags):
    """Compute the tag set that replaces a column's current semantic tags. When
    use_standard_tags is True, the standard tags are included as well.

    Args:
        semantic_tags (str/list/set): New semantic tag(s) to set
        standard_tags (set): Set of standard tags for the column logical type
        use_standard_tags (bool): If True, include the standard tags in the result

    Returns:
        set: The new semantic tags.
    """
    tags = _convert_input_to_set(semantic_tags)
    return tags.union(standard_tags) if use_standard_tags else tags
Esempio n. 10
0
def test_convert_input_to_set():
    """Exercise ``_convert_input_to_set`` with invalid and valid inputs."""
    # Each entry: (positional arguments, expected TypeError message).
    failing_calls = [
        ((int,), "semantic_tags must be a string, set or list"),
        (({'index': {}, 'time_index': {}}, 'test_text'),
         "test_text must be a string, set or list"),
        ((['index', 1], 'include parameter'),
         "include parameter must contain only strings"),
    ]
    for call_args, error_message in failing_calls:
        with pytest.raises(TypeError, match=error_message):
            _convert_input_to_set(*call_args)

    # A single string becomes a one-element set.
    assert _convert_input_to_set('index', 'include parameter') == {'index'}

    # Lists and sets both come back as the equivalent set.
    expected_tags = {'index', 'numeric', 'category'}
    assert _convert_input_to_set(['index', 'numeric', 'category']) == expected_tags
    assert _convert_input_to_set({'index', 'numeric', 'category'}, 'include parameter') == expected_tags
Esempio n. 11
0
    def _add_semantic_tags(self, new_tags, name):
        """Merge the given semantic tags into the current set of tags, warning
        about any tag that is already present.

        Args:
            new_tags (str/list/set): The new tags to add
            name (str): Name of the column to use in warning
        """
        incoming = _convert_input_to_set(new_tags)

        already_present = sorted(self.semantic_tags.intersection(incoming))
        if already_present:
            message = DuplicateTagsWarning().get_warning_message(
                already_present, name)
            warnings.warn(message, DuplicateTagsWarning)

        self.semantic_tags = self.semantic_tags.union(incoming)
Esempio n. 12
0
    def __init__(self,
                 series,
                 logical_type=None,
                 semantic_tags=None,
                 use_standard_tags=True,
                 name=None,
                 description=None,
                 metadata=None):
        """Create a DataColumn.

        Args:
            series (pd.Series or dd.Series or numpy.ndarray or pd.api.extensions.ExtensionArray): Series containing the data associated with the column.
            logical_type (LogicalType, optional): The logical type that should be assigned
                to the column. If no value is provided, the LogicalType for the series will
                be inferred.
            semantic_tags (str or list or set, optional): Semantic tags to assign to the column.
                Defaults to an empty set if not specified. There are two options for
                specifying the semantic tags:
                (str) If only one semantic tag is being set, a single string can be passed.
                (list or set) If multiple tags are being set, a list or set of strings can be passed.
            use_standard_tags (bool, optional): If True, will add standard semantic tags to columns based
                on the inferred or specified logical type for the column. Defaults to True.
            name (str, optional): Name of DataColumn. Will overwrite Series name, if it exists.
            description (str, optional): Optional text describing the contents of the column
            metadata (dict[str -> json serializable], optional): Metadata associated with the column.

        Raises:
            TypeError: If description is provided and is not a string, or if
                metadata is provided and is not a dictionary.
        """
        self._assigned_name = name
        self._set_series(series)
        self.use_standard_tags = use_standard_tags
        self._logical_type = self._parse_logical_type(logical_type)
        semantic_tags = _convert_input_to_set(semantic_tags)
        _validate_tags(semantic_tags)
        if use_standard_tags:
            semantic_tags = semantic_tags.union(
                self.logical_type.standard_tags)
        self._semantic_tags = semantic_tags
        self._update_dtype()

        # Compare against None rather than relying on truthiness: falsy values of
        # the wrong type (0, False, [], ...) must be rejected too, not stored.
        if description is not None and not isinstance(description, str):
            raise TypeError("Column description must be a string")
        self.description = description

        if metadata is not None and not isinstance(metadata, dict):
            raise TypeError("Column metadata must be a dictionary")
        # None (and an empty dict) both result in a fresh empty dictionary.
        self.metadata = metadata or {}
Esempio n. 13
0
def test_convert_input_to_set():
    """Exercise ``_convert_input_to_set`` with invalid and valid inputs."""

    def expect_type_error(error_message, *call_args):
        # The conversion must reject the arguments with the given message.
        with pytest.raises(TypeError, match=error_message):
            _convert_input_to_set(*call_args)

    expect_type_error("semantic_tags must be a string, set or list", int)
    expect_type_error(
        "test_text must be a string, set or list",
        {"index": {}, "time_index": {}},
        "test_text",
    )
    expect_type_error(
        "include parameter must contain only strings",
        ["index", 1],
        "include parameter",
    )

    # A single string becomes a one-element set.
    from_single = _convert_input_to_set("index", "include parameter")
    assert from_single == {"index"}

    # Lists and sets both come back as the equivalent set.
    all_tags = {"index", "numeric", "category"}
    from_list = _convert_input_to_set(["index", "numeric", "category"])
    assert from_list == all_tags

    from_set = _convert_input_to_set(
        {"index", "numeric", "category"}, "include parameter"
    )
    assert from_set == all_tags
Esempio n. 14
0
    def add_semantic_tags(self, semantic_tags):
        """Return a new DataColumn carrying the current tags plus the given ones.

        Args:
            semantic_tags (str/list/set): New semantic tag(s) to add to the column

        Returns:
            woodwork.DataColumn: DataColumn with specified semantic tags added.
        """
        incoming = _convert_input_to_set(semantic_tags)
        _validate_tags(incoming)

        # Adding a tag the column already has only triggers a warning.
        already_present = sorted(self._semantic_tags.intersection(incoming))
        if already_present:
            message = DuplicateTagsWarning().get_warning_message(
                already_present, self.name)
            warnings.warn(message, DuplicateTagsWarning)

        return DataColumn(
            series=self._series,
            logical_type=self.logical_type,
            semantic_tags=self._semantic_tags.union(incoming),
            use_standard_tags=self.use_standard_tags,
        )
Esempio n. 15
0
    def _create_columns(
        self,
        column_names,
        logical_types,
        semantic_tags,
        use_standard_tags,
        column_descriptions,
        column_origins,
        column_metadata,
        validate,
    ):
        """Build a dictionary keyed by column name whose values are the new
        column schemas holding each column's typing information."""
        # Missing per-column mappings behave like empty dictionaries.
        tags_by_column = semantic_tags or {}
        descriptions_by_column = column_descriptions or {}
        metadata_by_column = column_metadata or {}

        # A string origin is shared by every column; otherwise look it up per column.
        origin_is_shared = isinstance(column_origins, str)
        origins_by_column = None if origin_is_shared else (column_origins or {})

        schemas = {}
        for name in column_names:
            column_tags = _convert_input_to_set(
                tags_by_column.get(name),
                error_language=f"semantic_tags for {name}",
                validate=validate,
            )
            if validate:
                _validate_not_setting_index_tags(column_tags, name)

            column_description = descriptions_by_column.get(name)
            if origin_is_shared:
                column_origin = column_origins
            else:
                column_origin = origins_by_column.get(name)
            column_meta = metadata_by_column.get(name)

            schemas[name] = ColumnSchema(
                logical_type=logical_types.get(name),
                semantic_tags=column_tags,
                use_standard_tags=use_standard_tags.get(name),
                description=column_description,
                origin=column_origin,
                metadata=column_meta,
                validate=validate,
            )
        return schemas
Esempio n. 16
0
    def _remove_semantic_tags(self, tags_to_remove, name):
        """Drop the specified semantic tags from the current set of tags.

        Args:
            tags_to_remove (str/list/set): The tags to remove
            name (str): Name of the column to use in warning

        Raises:
            LookupError: If any tag to remove is not currently present.
        """
        removal_set = _convert_input_to_set(tags_to_remove)

        missing = sorted(removal_set.difference(self.semantic_tags))
        if missing:
            joined = ', '.join(missing)
            raise LookupError(
                f"Semantic tag(s) '{joined}' not present on column '{name}'"
            )

        # Warn only when standard tags are in use and at least one is being removed.
        if self.use_standard_tags and removal_set.intersection(
                self.logical_type.standard_tags):
            message = StandardTagsChangedWarning().get_warning_message(
                not self.use_standard_tags, name)
            warnings.warn(message, StandardTagsChangedWarning)

        self.semantic_tags = self.semantic_tags.difference(removal_set)
Esempio n. 17
0
def _remove_semantic_tags(tags_to_remove, current_tags, name, standard_tags,
                          use_standard_tags):
    """Return the current tags with the specified semantic tags dropped.

    Args:
        tags_to_remove (str/list/set): The tags to remove
        current_tags (set): Current set of semantic tags
        name (str): Name of the column to use in warning
        standard_tags (set): Set of standard tags for the column logical type
        use_standard_tags (bool): If True, warn if user attempts to remove a standard tag

    Returns:
        set: The tags that remain after removal.

    Raises:
        LookupError: If any tag to remove is not in current_tags.
    """
    removal_set = _convert_input_to_set(tags_to_remove)

    missing = sorted(removal_set.difference(current_tags))
    if missing:
        joined = ', '.join(missing)
        raise LookupError(
            f"Semantic tag(s) '{joined}' not present on column '{name}'"
        )

    removed_standard = sorted(removal_set.intersection(standard_tags))
    if removed_standard and use_standard_tags:
        message = StandardTagsChangedWarning().get_warning_message(
            not use_standard_tags, name)
        warnings.warn(message, StandardTagsChangedWarning)

    return current_tags.difference(removal_set)