def test_get_valid_dtype(sample_series):
    """Check the dtype strings ``_get_valid_dtype`` reports for Categorical and Boolean."""
    series_type = type(sample_series)

    # When ``ks`` is available and the fixture is a ``ks.Series``, Categorical
    # is expected to map to 'string'; every other series type maps to 'category'.
    expected_categorical = (
        'string' if ks and isinstance(sample_series, ks.Series) else 'category'
    )
    assert _get_valid_dtype(series_type, Categorical) == expected_categorical

    # Boolean maps to 'bool' regardless of the series type.
    assert _get_valid_dtype(series_type, Boolean) == 'bool'
def wrapper(*args, **kwargs):
    """Invoke the wrapped Series attribute and try to re-attach typing information.

    ``self``, ``series_attr`` and ``attr`` are taken from the enclosing scope
    (not visible in this block). If the call returns a series whose dtype
    matches the dtype expected for the current logical type, Woodwork is
    initialized on it with copies of the current schema's values; otherwise a
    ``TypingInfoMismatchWarning`` is emitted.

    Returns:
        Whatever the wrapped Series attribute returned.
    """
    # Make Series call and intercept the result
    outcome = series_attr(*args, **kwargs)

    # Anything that is not a series is handed back untouched
    if not _is_series(outcome):
        return outcome

    schema = self._schema
    expected_dtype = _get_valid_dtype(type(outcome), schema.logical_type)

    if str(outcome.dtype) != expected_dtype:
        # The operation changed the dtype, so the schema no longer applies
        mismatch_detail = ('dtype mismatch between original dtype, '
                           f'{expected_dtype}, and returned dtype, {outcome.dtype}')
        message = TypingInfoMismatchWarning().get_warning_message(
            attr, mismatch_detail, 'Series')
        warnings.warn(message, TypingInfoMismatchWarning)
        return outcome

    # Deep copies keep the new series from sharing mutable tags/metadata
    outcome.ww.init(
        logical_type=schema.logical_type,
        semantic_tags=copy.deepcopy(schema.semantic_tags),
        description=schema.description,
        metadata=copy.deepcopy(schema.metadata),
        use_standard_tags=schema.use_standard_tags,
    )

    # The result of the Series operation is returned whether or not Woodwork was initialized
    return outcome
def _get_invalid_schema_message(dataframe, schema):
    """Describe the first inconsistency between a DataFrame and its typing information.

    Checks are performed in this order, stopping at the first failure:
    columns present in the DataFrame but not in the schema, columns present
    in the schema but not in the DataFrame, per-column dtype agreement, and
    finally (only for ``pd.DataFrame`` inputs with a schema index) that the
    index matches the index column and that the column is unique.

    Args:
        dataframe: DataFrame whose columns, dtypes and index are checked.
        schema: Typing information exposing ``columns``, ``logical_types``
            and ``index``.

    Returns:
        str or None: A message for the first problem found, or None when
        every check passes.
    """
    frame_columns = set(dataframe.columns)
    typed_columns = set(schema.columns.keys())

    untyped = frame_columns - typed_columns
    if untyped:
        return ('The following columns in the DataFrame were missing from the typing information: '
                f'{untyped}')

    absent = typed_columns - frame_columns
    if absent:
        return ('The following columns in the typing information were missing from the DataFrame: '
                f'{absent}')

    for name in dataframe.columns:
        column = dataframe[name]
        actual_dtype = column.dtype
        expected_dtype = _get_valid_dtype(type(column), schema.logical_types[name])
        if str(actual_dtype) != expected_dtype:
            return (f'dtype mismatch for column {name} between DataFrame dtype, '
                    f'{actual_dtype}, and {schema.logical_types[name]} dtype, {expected_dtype}')

    # Index validation not performed for Dask/Koalas
    if schema.index is None or not isinstance(dataframe, pd.DataFrame):
        return None

    if not all(dataframe.index == dataframe[schema.index]):
        return 'Index mismatch between DataFrame and typing information'
    if not dataframe[schema.index].is_unique:
        return 'Index column is not unique'
    return None
def physical_types(self):
    """A dictionary containing physical types for each column"""
    if self._schema is None:
        _raise_init_error()

    # Map every DataFrame column to the dtype expected for its logical type
    dtypes_by_column = {}
    for col_name in self._dataframe.columns:
        column = self._dataframe[col_name]
        logical_type = self._schema.logical_types[col_name]
        dtypes_by_column[col_name] = _get_valid_dtype(type(column), logical_type)
    return dtypes_by_column
def test_init_series_valid_conversion_specified_ltype(sample_series):
    """init_series converts the dtype to match an explicitly requested logical type."""
    # Start from a dtype that differs from the one the logical types require
    starting_dtype = 'str' if ks and isinstance(sample_series, ks.Series) else 'object'
    sample_series = sample_series.astype(starting_dtype)

    # (logical_type argument, expected logical type, expected semantic tags)
    cases = (
        ('categorical', Categorical, {'category'}),
        ('natural_language', NaturalLanguage, set()),
    )
    for ltype_name, expected_ltype, expected_tags in cases:
        series = init_series(sample_series, logical_type=ltype_name)
        # A converted series must be a new object, not the input
        assert series is not sample_series
        correct_dtype = _get_valid_dtype(type(sample_series), expected_ltype)
        assert series.dtype == correct_dtype
        assert series.ww.logical_type == expected_ltype
        assert series.ww.semantic_tags == expected_tags
def _validate_schema(schema, series):
    """Verify that ``schema`` is a ColumnSchema whose logical type fits ``series``.

    Args:
        schema: Object expected to be a ``ColumnSchema``.
        series: Series whose dtype is compared with the dtype required by
            ``schema.logical_type``.

    Raises:
        TypeError: If ``schema`` is not a ``ColumnSchema`` instance.
        ValueError: If the series dtype differs from the dtype returned by
            ``_get_valid_dtype`` for the schema's logical type.
    """
    if not isinstance(schema, ColumnSchema):
        raise TypeError('Provided schema must be a Woodwork.ColumnSchema object.')

    expected_dtype = _get_valid_dtype(type(series), schema.logical_type)
    if str(series.dtype) == expected_dtype:
        return

    raise ValueError(
        f"dtype mismatch between Series dtype {series.dtype}, "
        f"and {schema.logical_type} dtype, {expected_dtype}"
    )
def test_init_series_valid_conversion_inferred_ltype(sample_series):
    """init_series converts the dtype when the logical type is inferred."""
    # Start from a dtype that differs from the one Categorical requires
    starting_dtype = 'str' if ks and isinstance(sample_series, ks.Series) else 'object'
    sample_series = sample_series.astype(starting_dtype)

    converted = init_series(sample_series)

    # A converted series must be a new object, not the input
    assert converted is not sample_series
    assert converted.dtype == _get_valid_dtype(type(sample_series), Categorical)
    assert converted.ww.logical_type == Categorical
    assert converted.ww.semantic_tags == {'category'}
def _validate_logical_type(self, logical_type):
    """Validates that a logical type is consistent with the series dtype.

    Performs additional type specific validation, as required: Ordinal
    instances validate the series data themselves, and LatLong requires the
    series to pass ``_is_valid_latlong_series``.

    Raises:
        ValueError: If the series dtype is not the dtype required by
            ``logical_type``, or if a LatLong series is not properly formatted.
    """
    series = self._series
    valid_dtype = _get_valid_dtype(type(series), logical_type)

    if valid_dtype != str(series.dtype):
        raise ValueError(
            f"Cannot initialize Woodwork. Series dtype '{series.dtype}' is "
            f"incompatible with {logical_type} dtype. Try converting series "
            f"dtype to '{valid_dtype}' before initializing or use the "
            "woodwork.init_series function to initialize."
        )

    # Ordinal is matched as an instance; LatLong by equality
    if isinstance(logical_type, Ordinal):
        logical_type._validate_data(series)
        return

    if logical_type == LatLong and not _is_valid_latlong_series(series):
        raise ValueError(
            "Cannot initialize Woodwork. Series does not contain properly formatted "
            "LatLong data. Try reformatting before initializing or use the "
            "woodwork.init_series function to initialize."
        )
def wrapper(*args, **kwargs):
    """Invoke the wrapped Series attribute and try to re-attach the existing schema.

    ``self``, ``series_attr`` and ``attr`` are taken from the enclosing scope
    (not visible in this block). If the call returns a series whose dtype
    matches the dtype expected for the current logical type, Woodwork is
    initialized on it from ``self.schema`` with validation skipped; otherwise
    a ``TypingInfoMismatchWarning`` is emitted.

    Returns:
        Whatever the wrapped Series attribute returned.
    """
    # Make Series call and intercept the result
    outcome = series_attr(*args, **kwargs)

    # Anything that is not a series is handed back untouched
    if not _is_series(outcome):
        return outcome

    expected_dtype = _get_valid_dtype(type(outcome), self._schema.logical_type)

    if str(outcome.dtype) != expected_dtype:
        # The operation changed the dtype, so the schema no longer applies
        mismatch_detail = ('dtype mismatch between original dtype, '
                           f'{expected_dtype}, and returned dtype, {outcome.dtype}')
        message = TypingInfoMismatchWarning().get_warning_message(
            attr, mismatch_detail, 'Series')
        warnings.warn(message, TypingInfoMismatchWarning)
        return outcome

    # The dtype was just checked above, so validation is skipped here
    outcome.ww.init(schema=self.schema, validate=False)

    # The result of the Series operation is returned whether or not Woodwork was initialized
    return outcome
def test_init_series_all_parameters(sample_series):
    """init_series applies every optional typing parameter that is passed in."""
    # Start from a dtype that differs from the one Categorical requires
    starting_dtype = 'str' if ks and isinstance(sample_series, ks.Series) else 'object'
    sample_series = sample_series.astype(starting_dtype)

    metadata = {'meta_key': 'meta_value'}
    description = 'custom description'
    init_kwargs = {
        'logical_type': 'categorical',
        'semantic_tags': ['custom_tag'],
        'metadata': metadata,
        'description': description,
        'use_standard_tags': False,
    }
    series = init_series(sample_series, **init_kwargs)

    # A converted series must be a new object, not the input
    assert series is not sample_series
    assert series.dtype == _get_valid_dtype(type(sample_series), Categorical)
    assert series.ww.logical_type == Categorical
    # With standard tags disabled only the custom tag is expected
    assert series.ww.semantic_tags == {'custom_tag'}
    assert series.ww.metadata == metadata
    assert series.ww.description == description