コード例 #1
0
    def __getitem__(self, key):
        """Look up a single column, or a list of columns, by name.

        Args:
            key: A column name, or a list of column names.

        Returns:
            For a list ``key``, whatever ``_get_subset_df_with_schema`` returns
            when called with ``use_dataframe_order=False``. Otherwise the
            matching series taken from the underlying DataFrame, initialized
            with a copy of that column's schema.

        Raises:
            ColumnNotPresentError: If a requested name is not found in the
                underlying DataFrame.
        """
        if self._schema is None:
            # NOTE(review): presumably raises because Woodwork has not been
            # initialized on this DataFrame -- confirm against the helper.
            _raise_init_error()

        if isinstance(key, list):
            known = set(self._dataframe.columns)
            missing = [name for name in set(key) if name not in known]
            if missing:
                raise ColumnNotPresentError(sorted(missing))
            return self._get_subset_df_with_schema(key,
                                                   use_dataframe_order=False)

        if key not in self._dataframe:
            raise ColumnNotPresentError(key)

        selected = self._dataframe[key]

        # Work on a copy so the table-level column schema is left untouched.
        col_schema = copy.deepcopy(self._schema.columns[key])
        # The index-related tags are removed from the copy handed to the series.
        col_schema.semantic_tags -= {'index', 'time_index'}
        if col_schema.use_standard_tags:
            col_schema.semantic_tags |= col_schema.logical_type.standard_tags

        selected.ww.init(schema=col_schema, validate=False)
        return selected
コード例 #2
0
ファイル: table_schema.py プロジェクト: alteryx/woodwork
    def rename(self, columns):
        """Renames columns in a TableSchema

        The schema itself is not modified; a deep copy is renamed and returned.

        Args:
            columns (dict[str -> str]): A dictionary mapping current column names to new column names.

        Returns:
            woodwork.TableSchema: TableSchema with the specified columns renamed.

        Raises:
            TypeError: If ``columns`` is not a dictionary.
            ColumnNotPresentError: If a column to rename is not in the schema.
            ValueError: If a new name is already used by a column that is not
                itself being renamed, or if two columns map to the same new name.
        """
        if not isinstance(columns, dict):
            raise TypeError("columns must be a dictionary")

        for old_name, new_name in columns.items():
            if old_name not in self.columns:
                raise ColumnNotPresentError(
                    f"Column to rename must be present. {old_name} cannot be found."
                )
            # Taking over an existing name is only allowed when the column that
            # currently holds it is renamed in the same call (e.g. a swap).
            if new_name in self.columns and new_name not in columns:
                raise ValueError(
                    f"The column {new_name} is already present. Please choose another name to rename {old_name} to or also rename {new_name}."
                )

        if len(columns) != len(set(columns.values())):
            raise ValueError(
                "New columns names must be unique from one another.")

        new_schema = copy.deepcopy(self)

        # Remove every old name before inserting any new one, so that swapping
        # two names does not overwrite a column that has yet to be moved.
        cols_to_update = {
            new_name: new_schema.columns.pop(old_name)
            for old_name, new_name in columns.items()
        }
        new_schema.columns.update(cols_to_update)
        return new_schema
コード例 #3
0
ファイル: table_schema.py プロジェクト: alteryx/woodwork
    def reset_semantic_tags(self, columns=None, retain_index_tags=False):
        """Reset the semantic tags for the specified columns to the default values.

        Each selected column is reset through its own ``_reset_semantic_tags``
        method. Column names can be provided as a single string, a list of
        strings or a set of strings. If columns is not specified, tags will be
        reset for all columns.

        Args:
            columns (str/list/set, optional): The columns for which the semantic tags should be reset.
            retain_index_tags (bool, optional): If True, will re-apply any index or time_index
                semantic tags that were set on the column. If False, they are not restored.
                Defaults to False.

        Raises:
            ColumnNotPresentError: If any of the specified columns is not in the schema.
        """
        columns = _convert_input_to_set(columns, "columns")

        unknown = sorted(columns.difference(self.columns.keys()))
        if unknown:
            raise ColumnNotPresentError(unknown)

        # An empty selection means "every column in the schema".
        targets = columns if columns else self.columns.keys()

        for col_name in targets:
            tags_before = self.semantic_tags[col_name]
            self.columns[col_name]._reset_semantic_tags()

            if not retain_index_tags:
                continue
            if "index" in tags_before:
                self._set_index_tags(col_name)
            if "time_index" in tags_before:
                self._set_time_index_tags(col_name)
コード例 #4
0
    def drop(self, columns):
        """Drop specified columns from a DataFrame.

        Args:
            columns (str or list[str] or set[str]): Column name or names to drop.
                Must be present in the DataFrame. Anything that is not a list
                or set is treated as a single column name.

        Returns:
            DataFrame: DataFrame with the specified columns removed, maintaining Woodwork typing information.

        Raises:
            ColumnNotPresentError: If any requested column is not in the DataFrame.

        Note:
            This method is used for removing columns only. To remove rows with ``drop``, go through the
            DataFrame directly and then reinitialize Woodwork with ``DataFrame.ww.init``
            instead of calling ``DataFrame.ww.drop``.
        """
        if self._schema is None:
            _raise_init_error()

        to_drop = columns if isinstance(columns, (list, set)) else [columns]

        existing = self._dataframe.columns
        missing = [name for name in to_drop if name not in existing]
        if missing:
            raise ColumnNotPresentError(missing)

        # Keep the remaining columns in the DataFrame's own order.
        remaining = [
            name for name in self._dataframe.columns if name not in to_drop
        ]
        return self._get_subset_df_with_schema(remaining)
コード例 #5
0
ファイル: table_schema.py プロジェクト: alteryx/woodwork
def _check_column_metadata(column_names, column_metadata):
    """Validate that ``column_metadata`` is a dict keyed only by known columns.

    Args:
        column_names: Iterable of the column names to validate against.
        column_metadata: The object to validate.

    Raises:
        TypeError: If ``column_metadata`` is not a dictionary.
        ColumnNotPresentError: If any key is not one of ``column_names``.
    """
    if not isinstance(column_metadata, dict):
        raise TypeError("Column metadata must be a dictionary.")

    unknown = sorted(set(column_metadata.keys()) - set(column_names))
    if unknown:
        raise ColumnNotPresentError(
            "column_metadata contains columns that are not present in "
            f"TableSchema: {unknown}")
コード例 #6
0
ファイル: table_schema.py プロジェクト: alteryx/woodwork
def _check_column_descriptions(column_names, column_descriptions):
    """Validate that ``column_descriptions`` is a dict keyed only by known columns.

    Args:
        column_names: Iterable of the column names to validate against.
        column_descriptions: The object to validate.

    Raises:
        TypeError: If ``column_descriptions`` is not a dictionary.
        ColumnNotPresentError: If any key is not one of ``column_names``.
    """
    if not isinstance(column_descriptions, dict):
        raise TypeError("column_descriptions must be a dictionary")

    unknown = sorted(set(column_descriptions.keys()) - set(column_names))
    if unknown:
        raise ColumnNotPresentError(
            "column_descriptions contains columns that are not present in "
            f"TableSchema: {unknown}")
コード例 #7
0
def _check_logical_types(dataframe_columns, logical_types):
    """Validate that ``logical_types`` is a dict keyed only by DataFrame columns.

    Args:
        dataframe_columns: Iterable of the column names in the DataFrame.
        logical_types: The object to validate.

    Raises:
        TypeError: If ``logical_types`` is not a dictionary.
        ColumnNotPresentError: If any key is not one of ``dataframe_columns``.
    """
    if not isinstance(logical_types, dict):
        raise TypeError('logical_types must be a dictionary')

    unknown = sorted(set(logical_types.keys()) - set(dataframe_columns))
    if unknown:
        raise ColumnNotPresentError(
            'logical_types contains columns that are not present in '
            f'dataframe: {unknown}')
コード例 #8
0
ファイル: table_schema.py プロジェクト: alteryx/woodwork
def _check_time_index(column_names, time_index, logical_type):
    """Validate a time index column name and its logical type.

    Args:
        column_names: Collection of the column names to validate against.
        time_index: Name of the column proposed as the time index.
        logical_type: Logical type of that column; resolved to a class through
            ``_get_ltype_class``.

    Raises:
        ColumnNotPresentError: If ``time_index`` is not in ``column_names``.
        TypeError: If the logical type class is not ``Datetime`` and does not
            carry the ``"numeric"`` standard tag.
    """
    if time_index not in column_names:
        raise ColumnNotPresentError(
            f"Specified time index column `{time_index}` not found in TableSchema"
        )

    ltype_class = _get_ltype_class(logical_type)
    is_datetime = ltype_class == ww.logical_types.Datetime
    if is_datetime or "numeric" in ltype_class.standard_tags:
        return

    raise TypeError(
        "Time index column must be a Datetime or numeric column.")
コード例 #9
0
ファイル: table_schema.py プロジェクト: alteryx/woodwork
def _check_semantic_tags(column_names, semantic_tags):
    """Validate the structure of a ``semantic_tags`` mapping.

    Args:
        column_names: Iterable of the column names to validate against.
        semantic_tags: The object to validate; expected to map column names to
            a string, list or set of tags.

    Raises:
        TypeError: If ``semantic_tags`` is not a dictionary, or if any value
            is not a string, list or set.
        ColumnNotPresentError: If any key is not one of ``column_names``.
    """
    if not isinstance(semantic_tags, dict):
        raise TypeError("semantic_tags must be a dictionary")

    unknown = sorted(set(semantic_tags.keys()) - set(column_names))
    if unknown:
        raise ColumnNotPresentError(
            "semantic_tags contains columns that are not present in "
            f"TableSchema: {unknown}")

    allowed_kinds = (str, list, set)
    for col_name, col_tags in semantic_tags.items():
        if isinstance(col_tags, allowed_kinds):
            continue
        raise TypeError(
            f"semantic_tags for {col_name} must be a string, set or list")
コード例 #10
0
def _check_time_index(dataframe,
                      time_index,
                      datetime_format=None,
                      logical_type=None):
    """Validate that a DataFrame column can be used as the time index.

    Args:
        dataframe: DataFrame whose ``columns`` are searched for ``time_index``.
        time_index: Name of the column proposed as the time index.
        datetime_format: Forwarded to ``col_is_datetime``. Defaults to None.
        logical_type: Forwarded to ``_is_numeric_series``. Defaults to None.

    Raises:
        ColumnNotPresentError: If ``time_index`` is not a column of ``dataframe``.
        TypeError: If neither ``_is_numeric_series`` nor ``col_is_datetime``
            accepts the column.
    """
    if time_index not in dataframe.columns:
        raise ColumnNotPresentError(
            f'Specified time index column `{time_index}` not found in dataframe'
        )

    # The numeric check runs first; the datetime check only runs if it fails.
    if _is_numeric_series(dataframe[time_index], logical_type):
        return
    if col_is_datetime(dataframe[time_index],
                       datetime_format=datetime_format):
        return

    raise TypeError(
        'Time index column must contain datetime or numeric values')
コード例 #11
0
ファイル: table_schema.py プロジェクト: alteryx/woodwork
def _check_logical_types(column_names, logical_types, require_all_cols=True):
    """Validate a ``logical_types`` mapping against a set of column names.

    Args:
        column_names: Iterable of the column names to validate against.
        logical_types: The object to validate; expected to map column names to
            logical types.
        require_all_cols (bool, optional): If True, every name in
            ``column_names`` must appear in ``logical_types``. Defaults to True.

    Raises:
        TypeError: If ``logical_types`` is not a dictionary, or if any value
            does not resolve (via ``_get_ltype_class``) to a type found in
            ``ww.type_system.registered_types``.
        ColumnNotPresentError: If ``logical_types`` has keys that are not in
            ``column_names``, or if ``require_all_cols`` is set and some
            column has no entry.
    """
    if not isinstance(logical_types, dict):
        raise TypeError("logical_types must be a dictionary")

    typed_cols = set(logical_types.keys())
    schema_cols = set(column_names)

    extra_cols = sorted(typed_cols - schema_cols)
    if extra_cols:
        raise ColumnNotPresentError(
            "logical_types contains columns that are not present in "
            f"TableSchema: {extra_cols}")

    untyped_cols = schema_cols - typed_cols
    if untyped_cols and require_all_cols:
        raise ColumnNotPresentError(
            "logical_types is missing columns that are present in "
            f"TableSchema: {sorted(untyped_cols)}")

    for logical_type in logical_types.values():
        ltype_class = _get_ltype_class(logical_type)
        if ltype_class not in ww.type_system.registered_types:
            raise TypeError("Logical Types must be of the LogicalType class "
                            "and registered in Woodwork's type system. "
                            f"{logical_type} does not meet that criteria.")
コード例 #12
0
ファイル: table_schema.py プロジェクト: alteryx/woodwork
def _check_use_standard_tags(column_names, use_standard_tags):
    """Validate the ``use_standard_tags`` setting.

    Args:
        column_names: Iterable of the column names to validate against.
        use_standard_tags: Either a single boolean, or a dictionary mapping
            column names to booleans.

    Raises:
        TypeError: If ``use_standard_tags`` is neither a dictionary nor a
            boolean, or if any dictionary value is not a boolean.
        ColumnNotPresentError: If a dictionary key is not one of ``column_names``.
    """
    if isinstance(use_standard_tags, bool):
        # A single boolean needs no per-column validation.
        return
    if not isinstance(use_standard_tags, dict):
        raise TypeError("use_standard_tags must be a dictionary or a boolean")

    unknown = sorted(set(use_standard_tags.keys()) - set(column_names))
    if unknown:
        raise ColumnNotPresentError(
            "use_standard_tags contains columns that are not present in "
            f"TableSchema: {unknown}")

    for col_name, flag in use_standard_tags.items():
        if isinstance(flag, bool):
            continue
        raise TypeError(
            f"use_standard_tags for column {col_name} must be a boolean"
        )
コード例 #13
0
def _check_index(dataframe, index, make_index=False):
    """Validate the ``index`` / ``make_index`` combination for a DataFrame.

    Args:
        dataframe: DataFrame whose ``columns`` are checked.
        index: Name of the index column, or None.
        make_index (bool, optional): Whether a new index column is to be
            created under the name ``index``. Defaults to False.

    Raises:
        ColumnNotPresentError: If ``make_index`` is falsy and ``index`` is not
            a column of ``dataframe``.
        IndexError: If the existing index column of a pandas DataFrame is not
            unique, or if ``make_index`` is set and ``index`` is either None
            or the name of an existing column.
    """
    if make_index:
        if index is None:
            # A new index column was requested without a name for it.
            raise IndexError(
                'When setting make_index to True, '
                'the name for the new index must be specified in the index parameter'
            )
        if index in dataframe.columns:
            # The requested new column would clash with an existing one.
            raise IndexError(
                'When setting make_index to True, '
                'the name specified for index cannot match an existing column name'
            )
        return

    if index not in dataframe.columns:
        # An existing column was requested as index but it is not there.
        raise ColumnNotPresentError(
            f'Specified index column `{index}` not found in dataframe. '
            'To create a new index column, set make_index to True.')

    # Uniqueness is only checked for pandas DataFrames; the original comment
    # notes that Dask does not support is_unique.
    if (index is not None and isinstance(dataframe, pd.DataFrame)
            and not dataframe[index].is_unique):
        raise IndexError('Index column must be unique')
コード例 #14
0
    def pop(self, column_name):
        """Return a Series with Woodwork typing information and remove it from the DataFrame.

        The column is removed from the underlying DataFrame and its entry is
        deleted from the stored schema.

        Args:
            column_name (str): Name of the column to pop.

        Returns:
            Series: Popped series with Woodwork initialized

        Raises:
            ColumnNotPresentError: If ``column_name`` is not a column of the DataFrame.
        """
        if self._schema is None:
            _raise_init_error()
        if column_name not in self._dataframe.columns:
            raise ColumnNotPresentError(column_name)

        popped = self._dataframe.pop(column_name)

        # Attach the column's typing information to the series before that
        # entry is removed from the table schema below.
        popped.ww.init(schema=self.schema.columns[column_name], validate=False)

        # The table schema must no longer describe the removed column.
        del self._schema.columns[column_name]

        return popped
コード例 #15
0
ファイル: table_schema.py プロジェクト: alteryx/woodwork
def _check_index(column_names, index):
    """Validate that ``index`` names one of the given columns.

    Args:
        column_names: Collection of the column names to validate against.
        index: Name of the column proposed as the index.

    Raises:
        ColumnNotPresentError: If ``index`` is not in ``column_names``.
    """
    if index in column_names:
        return

    # The requested index is not among the known column names.
    raise ColumnNotPresentError(
        f"Specified index column `{index}` not found in TableSchema.")