def __getitem__(self, key):
    """Select columns from the DataFrame, keeping Woodwork typing.

    A list key returns a new DataFrame restricted to those columns with a
    matching schema; a single column name returns that column as a Series
    initialized with the column's typing information, minus any index or
    time_index semantic tags (which only make sense on a table).
    """
    if self._schema is None:
        _raise_init_error()

    if isinstance(key, list):
        missing = list(set(key).difference(set(self._dataframe.columns)))
        if missing:
            raise ColumnNotPresentError(sorted(missing))
        return self._get_subset_df_with_schema(key, use_dataframe_order=False)

    if key not in self._dataframe:
        raise ColumnNotPresentError(key)

    selected = self._dataframe[key]
    # Deep-copy the column schema so mutating tags doesn't affect the table
    column = copy.deepcopy(self._schema.columns[key])
    # A standalone series cannot act as the table's index or time index
    column.semantic_tags -= {'index', 'time_index'}
    if column.use_standard_tags:
        column.semantic_tags |= column.logical_type.standard_tags
    selected.ww.init(schema=column, validate=False)
    return selected
def rename(self, columns):
    """Renames columns in a TableSchema

    Args:
        columns (dict[str -> str]): A dictionary mapping current column
            names to new column names.

    Returns:
        woodwork.TableSchema: TableSchema with the specified columns renamed.
    """
    if not isinstance(columns, dict):
        raise TypeError("columns must be a dictionary")

    for old_name, new_name in columns.items():
        if old_name not in self.columns:
            raise ColumnNotPresentError(
                f"Column to rename must be present. {old_name} cannot be found."
            )
        # A clash with an existing column is only allowed when that column
        # is itself being renamed in the same call (i.e. a swap).
        if new_name in self.columns and new_name not in columns.keys():
            raise ValueError(
                f"The column {new_name} is already present. Please choose another name to rename {old_name} to or also rename {old_name}."
            )

    if len(columns) != len(set(columns.values())):
        raise ValueError("New columns names must be unique from one another.")

    new_schema = copy.deepcopy(self)
    # Pop all renamed columns first, then reinsert under the new names,
    # so swaps (a->b, b->a) resolve correctly.
    renamed = {new: new_schema.columns.pop(old) for old, new in columns.items()}
    new_schema.columns.update(renamed)
    return new_schema
def reset_semantic_tags(self, columns=None, retain_index_tags=False):
    """Reset the semantic tags for the specified columns to the default values.

    The default values will be either an empty set or a set of the standard
    tags based on the column logical type, controlled by the
    use_standard_tags property on the table. Column names can be provided as
    a single string, a list of strings or a set of strings. If columns is
    not specified, tags will be reset for all columns.

    Args:
        columns (str/list/set, optional): The columns for which the
            semantic tags should be reset.
        retain_index_tags (bool, optional): If True, will retain any index
            or time_index semantic tags set on the column. If False, will
            clear all semantic tags. Defaults to False.
    """
    columns = _convert_input_to_set(columns, "columns")
    unknown = sorted(list(columns.difference(set(self.columns.keys()))))
    if unknown:
        raise ColumnNotPresentError(unknown)

    # No explicit selection means every column gets reset
    if not columns:
        columns = self.columns.keys()

    for name in columns:
        previous_tags = self.semantic_tags[name]
        self.columns[name]._reset_semantic_tags()
        if retain_index_tags:
            # Re-apply index/time_index tags that the reset cleared
            if "index" in previous_tags:
                self._set_index_tags(name)
            if "time_index" in previous_tags:
                self._set_time_index_tags(name)
def drop(self, columns):
    """Drop specified columns from a DataFrame.

    Args:
        columns (str or list[str]): Column name or names to drop. Must be
            present in the DataFrame.

    Returns:
        DataFrame: DataFrame with the specified columns removed, maintaining
            Woodwork typing information.

    Note:
        This method is used for removing columns only. To remove rows with
        ``drop``, go through the DataFrame directly and then reinitialize
        Woodwork with ``DataFrame.ww.init`` instead of calling
        ``DataFrame.ww.drop``.
    """
    if self._schema is None:
        _raise_init_error()

    # Accept a single column name as well as a list/set of names
    if not isinstance(columns, (list, set)):
        columns = [columns]

    missing = [name for name in columns if name not in self._dataframe.columns]
    if missing:
        raise ColumnNotPresentError(missing)

    keep = [name for name in self._dataframe.columns if name not in columns]
    return self._get_subset_df_with_schema(keep)
def _check_column_metadata(column_names, column_metadata): if not isinstance(column_metadata, dict): raise TypeError("Column metadata must be a dictionary.") cols_not_found = set(column_metadata.keys()).difference(set(column_names)) if cols_not_found: raise ColumnNotPresentError( "column_metadata contains columns that are not present in " f"TableSchema: {sorted(list(cols_not_found))}")
def _check_column_descriptions(column_names, column_descriptions): if not isinstance(column_descriptions, dict): raise TypeError("column_descriptions must be a dictionary") cols_not_found = set(column_descriptions.keys()).difference( set(column_names)) if cols_not_found: raise ColumnNotPresentError( "column_descriptions contains columns that are not present in " f"TableSchema: {sorted(list(cols_not_found))}")
def _check_logical_types(dataframe_columns, logical_types): if not isinstance(logical_types, dict): raise TypeError('logical_types must be a dictionary') cols_not_found = set(logical_types.keys()).difference( set(dataframe_columns)) if cols_not_found: raise ColumnNotPresentError( 'logical_types contains columns that are not present in ' f'dataframe: {sorted(list(cols_not_found))}')
def _check_time_index(column_names, time_index, logical_type):
    """Validate a time index column name and its logical type.

    Raises:
        ColumnNotPresentError: If time_index is not a known column.
        TypeError: If the logical type is neither Datetime nor numeric.
    """
    if time_index not in column_names:
        raise ColumnNotPresentError(
            f"Specified time index column `{time_index}` not found in TableSchema"
        )
    ltype_class = _get_ltype_class(logical_type)
    # Short-circuit: standard_tags is only consulted for non-Datetime types
    if ltype_class != ww.logical_types.Datetime and "numeric" not in ltype_class.standard_tags:
        raise TypeError("Time index column must be a Datetime or numeric column.")
def _check_semantic_tags(column_names, semantic_tags): if not isinstance(semantic_tags, dict): raise TypeError("semantic_tags must be a dictionary") cols_not_found = set(semantic_tags.keys()).difference(set(column_names)) if cols_not_found: raise ColumnNotPresentError( "semantic_tags contains columns that are not present in " f"TableSchema: {sorted(list(cols_not_found))}") for col_name, col_tags in semantic_tags.items(): if not isinstance(col_tags, (str, list, set)): raise TypeError( f"semantic_tags for {col_name} must be a string, set or list")
def _check_time_index(dataframe, time_index, datetime_format=None, logical_type=None):
    """Validate a time index column on a dataframe.

    Raises:
        ColumnNotPresentError: If time_index is not a dataframe column.
        TypeError: If the column holds neither numeric nor datetime values.
    """
    if time_index not in dataframe.columns:
        raise ColumnNotPresentError(
            f'Specified time index column `{time_index}` not found in dataframe'
        )
    series = dataframe[time_index]
    # Numeric check first, then datetime — matches short-circuit order
    if _is_numeric_series(series, logical_type):
        return
    if col_is_datetime(series, datetime_format=datetime_format):
        return
    raise TypeError('Time index column must contain datetime or numeric values')
def _check_logical_types(column_names, logical_types, require_all_cols=True):
    """Validate logical_types against the schema's column names.

    Raises:
        TypeError: If logical_types is not a dict, or a value is not a
            registered LogicalType.
        ColumnNotPresentError: If keys name unknown columns, or (when
            require_all_cols) some schema columns are missing.
    """
    if not isinstance(logical_types, dict):
        raise TypeError("logical_types must be a dictionary")

    ltype_cols = set(logical_types.keys())
    schema_cols = set(column_names)

    extra = ltype_cols - schema_cols
    if extra:
        raise ColumnNotPresentError(
            "logical_types contains columns that are not present in "
            f"TableSchema: {sorted(list(extra))}")

    missing = schema_cols - ltype_cols
    if require_all_cols and missing:
        raise ColumnNotPresentError(
            "logical_types is missing columns that are present in "
            f"TableSchema: {sorted(list(missing))}")

    for name, ltype in logical_types.items():
        if _get_ltype_class(ltype) not in ww.type_system.registered_types:
            raise TypeError("Logical Types must be of the LogicalType class "
                            "and registered in Woodwork's type system. "
                            f"{ltype} does not meet that criteria.")
def _check_use_standard_tags(column_names, use_standard_tags): if not isinstance(use_standard_tags, (dict, bool)): raise TypeError("use_standard_tags must be a dictionary or a boolean") if isinstance(use_standard_tags, dict): cols_not_found = set(use_standard_tags.keys()).difference( set(column_names)) if cols_not_found: raise ColumnNotPresentError( "use_standard_tags contains columns that are not present in " f"TableSchema: {sorted(list(cols_not_found))}") for col_name, use_standard_tags_for_col in use_standard_tags.items(): if not isinstance(use_standard_tags_for_col, bool): raise TypeError( f"use_standard_tags for column {col_name} must be a boolean" )
def _check_index(dataframe, index, make_index=False): if not make_index and index not in dataframe.columns: # User specifies an index that is not in the dataframe, without setting make_index to True raise ColumnNotPresentError( f'Specified index column `{index}` not found in dataframe. ' 'To create a new index column, set make_index to True.') if index is not None and not make_index and isinstance( dataframe, pd.DataFrame) and not dataframe[index].is_unique: # User specifies an index that is in the dataframe but not unique # Does not check for Dask as Dask does not support is_unique raise IndexError('Index column must be unique') if make_index and index is not None and index in dataframe.columns: # User sets make_index to True, but supplies an index name that matches a column already present raise IndexError( 'When setting make_index to True, ' 'the name specified for index cannot match an existing column name' ) if make_index and index is None: # User sets make_index to True, but does not supply a name for the index raise IndexError( 'When setting make_index to True, ' 'the name for the new index must be specified in the index parameter' )
def pop(self, column_name):
    """Return a Series with Woodwork typing information and remove it from the DataFrame.

    Args:
        column (str): Name of the column to pop.

    Returns:
        Series: Popped series with Woodwork initialized
    """
    if self._schema is None:
        _raise_init_error()
    if column_name not in self._dataframe.columns:
        raise ColumnNotPresentError(column_name)

    popped = self._dataframe.pop(column_name)
    # Transfer the column's typing information onto the standalone series
    popped.ww.init(schema=self.schema.columns[column_name], validate=False)
    # The schema must no longer describe the removed column
    del self._schema.columns[column_name]
    return popped
def _check_index(column_names, index): if index not in column_names: # User specifies an index that is not in the list of column names raise ColumnNotPresentError( f"Specified index column `{index}` not found in TableSchema.")