def test_get_column_logical_type(sample_series):
    """Check that ``_get_column_logical_type`` returns logical type instances.

    With no logical type requested the result is expected to be a
    ``Categorical`` instance; when ``Datetime`` is requested explicitly the
    result is expected to be a ``Datetime`` instance.
    """
    inferred = _get_column_logical_type(sample_series, None, "col_name")
    assert isinstance(inferred, Categorical)

    requested = _get_column_logical_type(sample_series, Datetime, "col_name")
    assert isinstance(requested, Datetime)
def init(self, logical_type=None, semantic_tags=None, use_standard_tags=True,
         description=None, metadata=None):
    """Set up Woodwork typing information on the wrapped Series.

    Args:
        logical_type (LogicalType or str, optional): Logical type to assign to
            the series. When omitted, the logical type is inferred. An error is
            raised if the provided or inferred logical type has a dtype that is
            not compatible with the series dtype.
        semantic_tags (str or list or set, optional): Semantic tags to attach to
            the series. Defaults to an empty set when not given. Pass a single
            string for one tag, or a list/set of strings for several tags.
        use_standard_tags (bool, optional): If True, standard semantic tags are
            added based on the inferred or specified logical type.
            Defaults to True.
        description (str, optional): Free text describing the series contents.
        metadata (dict[str -> json serializable], optional): Metadata
            associated with the series.
    """
    series = self._series
    resolved_type = _get_column_logical_type(series, logical_type, series.name)
    self._validate_logical_type(resolved_type)

    schema_kwargs = {
        'logical_type': resolved_type,
        'semantic_tags': semantic_tags,
        'use_standard_tags': use_standard_tags,
        'description': description,
        'metadata': metadata,
    }
    self._schema = ColumnSchema(**schema_kwargs)
def init_series(
    series,
    logical_type=None,
    semantic_tags=None,
    use_standard_tags=True,
    description=None,
    origin=None,
    metadata=None,
):
    """Return a new Series with Woodwork typing information initialized.

    Accepts a series, a numpy.ndarray or a pd.api.extensions.ExtensionArray.
    The dtype of the returned series is converted to match the dtype
    associated with the LogicalType.

    Args:
        series (pd.Series, dd.Series, ps.Series, numpy.ndarray or
            pd.api.extensions.ExtensionArray): The original data from which to
            create the Woodwork initialized series.
        logical_type (LogicalType or str, optional): Logical type to assign to
            the series. When omitted, the logical type is inferred.
        semantic_tags (str or list or set, optional): Semantic tags to attach to
            the series. Defaults to an empty set when not given. Pass a single
            string for one tag, or a list/set of strings for several tags.
        use_standard_tags (bool, optional): If True, standard semantic tags are
            added based on the inferred or specified logical type.
            Defaults to True.
        description (str, optional): Free text describing the series contents.
        origin (str, optional): Free text giving the origin of the column
            (i.e. "base" or "engineered").
        metadata (dict[str -> json serializable], optional): Metadata
            associated with the series.

    Returns:
        Series: A series with Woodwork typing information initialized.

    Raises:
        ValueError: If ``series`` is a numpy.ndarray that is not 1 dimensional.
        TypeError: If ``series`` is neither a series nor a 1 dimensional array.
    """
    if not _is_series(series):
        array_types = (np.ndarray, pd.api.extensions.ExtensionArray)
        if not isinstance(series, array_types):
            raise TypeError(
                f"Input must be of series type. The current input is of type {type(series)}"
            )
        if series.ndim != 1:
            # Only numpy arrays get the dimensionality-specific message; any
            # other array-like falls through to the generic type error.
            if isinstance(series, np.ndarray):
                raise ValueError(
                    f"np.ndarray input must be 1 dimensional. Current np.ndarray is {series.ndim} dimensional"
                )
            raise TypeError(
                f"Input must be of series type. The current input is of type {type(series)}"
            )
        series = pd.Series(series)

    resolved_type = _get_column_logical_type(series, logical_type, series.name)
    initialized = resolved_type.transform(series)
    initialized.ww.init(
        logical_type=resolved_type,
        semantic_tags=semantic_tags,
        use_standard_tags=use_standard_tags,
        description=description,
        origin=origin,
        metadata=metadata,
    )
    return initialized
def init_series(series, logical_type=None, semantic_tags=None, use_standard_tags=True,
                description=None, metadata=None):
    """Return a new Series with Woodwork typing information initialized.

    The dtype of the returned series is converted to match the dtype
    associated with the LogicalType.

    Args:
        series (pd.Series, dd.Series, or ks.Series): The original series from
            which to create the Woodwork initialized series.
        logical_type (LogicalType or str, optional): Logical type to assign to
            the series. When omitted, the logical type is inferred.
        semantic_tags (str or list or set, optional): Semantic tags to attach to
            the series. Defaults to an empty set when not given. Pass a single
            string for one tag, or a list/set of strings for several tags.
        use_standard_tags (bool, optional): If True, standard semantic tags are
            added based on the inferred or specified logical type.
            Defaults to True.
        description (str, optional): Free text describing the series contents.
        metadata (dict[str -> json serializable], optional): Metadata
            associated with the series.

    Returns:
        Series: A series with Woodwork typing information initialized.
    """
    resolved_type = _get_column_logical_type(series, logical_type, series.name)
    converted = _update_column_dtype(series, resolved_type)

    init_kwargs = {
        'logical_type': resolved_type,
        'semantic_tags': semantic_tags,
        'use_standard_tags': use_standard_tags,
        'description': description,
        'metadata': metadata,
    }
    converted.ww.init(**init_kwargs)
    return converted
def init(self, index=None, time_index=None, logical_types=None, make_index=False,
         already_sorted=False, schema=None, validate=True, use_standard_tags=True,
         **kwargs):
    """Set up Woodwork typing information on the wrapped DataFrame.

    Args:
        index (str, optional): Name of the index column.
        time_index (str, optional): Name of the time index column.
        logical_types (dict[str -> LogicalType]): Mapping from column names in
            the DataFrame to the LogicalType for the column.
        make_index (bool, optional): If True, a new unique, numeric index
            column named ``index`` is created and added to the DataFrame; in
            that case ``index`` cannot match an existing column name. If False,
            ``index`` must match a column present in the DataFrame.
            Defaults to False.
        already_sorted (bool, optional): Whether the DataFrame is already
            sorted on the time index. If False, the DataFrame is sorted first
            on the time_index and then on the index (pandas DataFrame only).
            Defaults to False.
        schema (Woodwork.TableSchema, optional): Typing information to use
            instead of performing inference. Any other arguments provided are
            ignored (a warning lists them). Changes made to the schema object
            after initialization propagate to the DataFrame, so the same schema
            object should not be shared between DataFrames.
        validate (bool, optional): Whether parameter and data validation
            should occur. Defaults to True. Warning: set to False only when
            parameters and data are known to be valid; errors that result from
            skipping validation with invalid inputs may not be easily
            understood.
        use_standard_tags (bool, dict[str -> bool], optional): Whether
            standard semantic tags are added to columns based on their logical
            type. A single boolean applies to every column; a dictionary sets
            the value per column, with unspecified columns using the default.
            Defaults to True.
        **kwargs: Forwarded to ``TableSchema`` when no schema is supplied.
            Documented keys:

            name (str, optional): Name used to identify the DataFrame.
            semantic_tags (dict, optional): Mapping from column names to
                semantic tags. Values may be a single string, or a list or set
                of strings. Columns not included get an empty set.
            table_metadata (dict[str -> json serializable], optional): Extra
                metadata for Woodwork.
            column_metadata (dict[str -> dict[str -> json serializable]],
                optional): Mapping from column names to that column's metadata.
            column_descriptions (dict[str -> str], optional): Mapping from
                column names to column descriptions.
    """
    if validate:
        _validate_accessor_params(self._dataframe, index, make_index, time_index,
                                  logical_types, schema, use_standard_tags)

    if schema is None:
        self.make_index = make_index
        if make_index:
            _make_index(self._dataframe, index)

        # Infer (or parse) a logical type per column and convert the
        # underlying data wherever the dtype has to change.
        dataframe = self._dataframe
        parsed_logical_types = {}
        for name in dataframe.columns:
            column = dataframe[name]
            requested = logical_types.get(name) if logical_types else None
            resolved = _get_column_logical_type(column, requested, name)
            parsed_logical_types[name] = resolved

            converted = _update_column_dtype(column, resolved)
            if converted is not column:
                dataframe[name] = converted

        column_names = list(dataframe.columns)

        # TableSchema has a different default for use_standard_tags, so expand
        # the value into an explicit per-column mapping here.
        if isinstance(use_standard_tags, bool):
            use_standard_tags = dict.fromkeys(column_names, use_standard_tags)
        else:
            use_standard_tags = {
                **dict.fromkeys(column_names, True),
                **use_standard_tags,
            }

        self._schema = TableSchema(
            column_names=column_names,
            logical_types=parsed_logical_types,
            index=index,
            time_index=time_index,
            validate=validate,
            use_standard_tags=use_standard_tags,
            **kwargs,
        )
    else:
        self._schema = schema

        # Ordered so the warning lists parameters in a stable sequence.
        flagged = (
            ('index', index is not None),
            ('make_index', make_index),
            ('time_index', time_index is not None),
            ('logical_types', logical_types is not None),
            ('already_sorted', already_sorted),
            ('use_standard_tags',
             not use_standard_tags or isinstance(use_standard_tags, dict)),
        )
        extra_params = [param for param, supplied in flagged if supplied]
        extra_params.extend(kwargs)
        if extra_params:
            warnings.warn(
                "A schema was provided and the following parameters were ignored: "
                + ", ".join(extra_params),
                ParametersIgnoredWarning,
            )

        # make_index still has to be stored on the accessor when a schema is
        # supplied, but any value passed in by the caller is ignored.
        self.make_index = False

    self._set_underlying_index()
    if self._schema.time_index is not None:
        self._sort_columns(already_sorted)
def test_get_column_logical_type(sample_series):
    """Check the logical type returned for inferred and explicit requests.

    Each case pairs the logical type passed in with the value the result is
    expected to compare equal to.
    """
    cases = (
        (None, Categorical),
        (Datetime, Datetime),
    )
    for requested, expected in cases:
        assert _get_column_logical_type(sample_series, requested, 'col_name') == expected
def init(self, logical_type=None, semantic_tags=None, use_standard_tags=True,
         description=None, metadata=None, schema=None, validate=True):
    """Set up Woodwork typing information on the wrapped Series.

    Args:
        logical_type (LogicalType or str, optional): Logical type to assign to
            the series. When omitted, the logical type is inferred. An error is
            raised if the provided or inferred logical type has a dtype that is
            not compatible with the series dtype.
        semantic_tags (str or list or set, optional): Semantic tags to attach to
            the series. Defaults to an empty set when not given. Pass a single
            string for one tag, or a list/set of strings for several tags.
        use_standard_tags (bool, optional): If True, standard semantic tags are
            added based on the inferred or specified logical type.
            Defaults to True.
        description (str, optional): Free text describing the series contents.
        metadata (dict[str -> json serializable], optional): Metadata
            associated with the series.
        schema (Woodwork.ColumnSchema, optional): Typing information to use
            instead of performing inference. Any other arguments provided are
            ignored (a warning lists them). Changes made to the schema object
            after initialization propagate to the Series, so the same schema
            object should not be shared between Series.
        validate (bool, optional): Whether parameter and data validation
            should occur. Defaults to True. Warning: set to False only when
            parameters and data are known to be valid; errors that result from
            skipping validation with invalid inputs may not be easily
            understood.
    """
    if schema is None:
        resolved_type = _get_column_logical_type(self._series, logical_type,
                                                 self._series.name)
        if validate:
            self._validate_logical_type(resolved_type)
        self._schema = ColumnSchema(
            logical_type=resolved_type,
            semantic_tags=semantic_tags,
            use_standard_tags=use_standard_tags,
            description=description,
            metadata=metadata,
            validate=validate,
        )
        return

    if validate:
        _validate_schema(schema, self._series)

    # Ordered so the warning lists parameters in a stable sequence.
    flagged = (
        ('logical_type', logical_type is not None),
        ('semantic_tags', semantic_tags is not None),
        ('description', description is not None),
        ('metadata', metadata is not None),
        ('use_standard_tags', not use_standard_tags),
    )
    extra_params = [param for param, supplied in flagged if supplied]
    if extra_params:
        warnings.warn(
            "A schema was provided and the following parameters were ignored: "
            + ", ".join(extra_params),
            ParametersIgnoredWarning,
        )
    self._schema = schema
def init(
    self,
    logical_type=None,
    semantic_tags=None,
    use_standard_tags=True,
    description=None,
    origin=None,
    metadata=None,
    schema=None,
    validate=True,
):
    """Set up Woodwork typing information on the wrapped Series.

    Args:
        logical_type (LogicalType or str, optional): Logical type to assign to
            the series. When omitted, the logical type is inferred. An error is
            raised if the provided or inferred logical type has a dtype that is
            not compatible with the series dtype.
        semantic_tags (str or list or set, optional): Semantic tags to attach to
            the series. Defaults to an empty set when not given. Pass a single
            string for one tag, or a list/set of strings for several tags.
        use_standard_tags (bool, optional): If True, standard semantic tags are
            added based on the inferred or specified logical type.
            Defaults to True.
        description (str, optional): Free text describing the series contents.
        origin (str, optional): Free text giving the origin of the column
            (i.e. "base" or "engineered").
        metadata (dict[str -> json serializable], optional): Metadata
            associated with the series.
        schema (Woodwork.ColumnSchema, optional): Typing information to use
            instead of performing inference. Any other arguments provided are
            ignored (a warning lists them). Changes made to the schema object
            after initialization propagate to the Series, so the same schema
            object should not be shared between Series.
        validate (bool, optional): Whether parameter and data validation
            should occur. Defaults to True. Warning: set to False only when
            parameters and data are known to be valid; errors that result from
            skipping validation with invalid inputs may not be easily
            understood.

    Raises:
        TypeValidationError: If validation is enabled, no schema is given, and
            the series dtype does not match the dtype of the logical type.
    """
    if schema is None:
        logical_type = _get_column_logical_type(self._series, logical_type,
                                                self._series.name)
        if validate:
            if isinstance(logical_type, (Ordinal, LatLong)):
                # These logical types run their own validation on the data.
                logical_type.validate(self._series)
            else:
                valid_dtype = logical_type._get_valid_dtype(type(self._series))
                dtypes_match = _check_data_type_equality(
                    valid_dtype, str(self._series.dtype))
                if not dtypes_match:
                    raise TypeValidationError(
                        f"Cannot initialize Woodwork. Series dtype '{self._series.dtype}' is "
                        f"incompatible with {logical_type} dtype. Try converting series "
                        f"dtype to '{valid_dtype}' before initializing or use the "
                        "woodwork.init_series function to initialize.")
        self._schema = ColumnSchema(
            logical_type=logical_type,
            semantic_tags=semantic_tags,
            use_standard_tags=use_standard_tags,
            description=description,
            origin=origin,
            metadata=metadata,
            validate=validate,
        )
        return

    if validate:
        _validate_schema(schema, self._series)

    # Ordered so the warning lists parameters in a stable sequence.
    flagged = (
        ("logical_type", logical_type is not None),
        ("semantic_tags", semantic_tags is not None),
        ("description", description is not None),
        ("origin", origin is not None),
        ("metadata", metadata is not None),
        ("use_standard_tags", not use_standard_tags),
    )
    extra_params = [param for param, supplied in flagged if supplied]
    if extra_params:
        warnings.warn(
            "A schema was provided and the following parameters were ignored: "
            + ", ".join(extra_params),
            ParametersIgnoredWarning,
        )
    self._schema = schema