def __init__(self,
             dataset: np.ndarray,
             ground_truth: Optional[np.ndarray] = None,
             categorical_indices: Optional[np.ndarray] = None,
             int_to_float: bool = True) -> None:
    """
    Constructs an ``Augmentation`` abstract class.

    The input is validated with ``_validate_input``, the columns of the
    ``dataset`` are divided into numerical and categorical ones, and the
    dtype to be used for sampled data is derived from the dataset's dtype.

    Parameters
    ----------
    dataset : numpy.ndarray
        A data array (classic or structured) to be augmented.
    ground_truth : numpy.ndarray, optional (default=None)
        Stored unchanged in the ``ground_truth`` attribute.
    categorical_indices : numpy.ndarray, optional (default=None)
        Column indices to be treated as categorical. If ``None``, only the
        columns reported as categorical by ``fuat.indices_by_type`` are
        used. Otherwise, the selection is extended with every column that
        ``fuat.indices_by_type`` reports as categorical.
    int_to_float : boolean, optional (default=True)
        If ``True``, numerical dtypes are generalised against
        ``numpy.float64``; otherwise against ``numpy.int64``.

    Warns
    -----
    UserWarning
        Some of the detected string-based columns are missing from the
        user-provided ``categorical_indices``.

    Raises
    ------
    AssertionError
        ``_validate_input`` returned a falsy value.

    Attributes
    ----------
    dataset, ground_truth
        The corresponding inputs, stored as given.
    data_points_number : integer
        ``dataset.shape[0]``.
    is_structured : boolean
        Result of ``fuav.is_structured_array`` for the ``dataset``.
    categorical_indices, numerical_indices : list
        Sorted lists of categorical and numerical column indices.
    features_number : integer
        The number of all the detected column indices.
    sample_dtype
        A list of ``(name, dtype)`` pairs for structured arrays and a
        single dtype otherwise.
    """
    # pylint: disable=too-many-locals
    input_is_valid = _validate_input(
        dataset,
        ground_truth=ground_truth,
        categorical_indices=categorical_indices,
        int_to_float=int_to_float)
    assert input_is_valid, 'Invalid input.'

    self.dataset = dataset
    self.data_points_number = dataset.shape[0]
    self.is_structured = fuav.is_structured_array(dataset)
    self.ground_truth = ground_truth

    # Split the column indices according to the types found in the data.
    detected = fuat.indices_by_type(dataset)
    detected_numerical = set(detected[0])
    detected_categorical = set(detected[1])
    every_index = detected_numerical | detected_categorical

    if categorical_indices is None:
        chosen_categorical = detected_categorical
        chosen_numerical = detected_numerical
    else:
        # String-based columns can never be numerical, so any of them that
        # the user left out is added to the categorical selection.
        if detected_categorical.difference(categorical_indices):
            warnings.warn(
                'Some of the string-based columns in the input dataset '
                'were not selected as categorical features via the '
                'categorical_indices parameter. String-based columns '
                'cannot be treated as numerical features, therefore '
                'they will be also treated as categorical features '
                '(in addition to the ones selected with the '
                'categorical_indices parameter).',
                UserWarning)
        chosen_categorical = detected_categorical.union(categorical_indices)
        chosen_numerical = every_index - chosen_categorical

    self.categorical_indices = sorted(chosen_categorical)
    self.numerical_indices = sorted(chosen_numerical)
    self.features_number = len(every_index)

    # Work out the dtype of the sampled array.
    numerical_dtype = np.dtype(np.float64 if int_to_float else np.int64)
    source_dtype = self.dataset.dtype
    if self.is_structured:
        sample_dtype = []
        for column_name in source_dtype.names:
            if column_name in self.numerical_indices:
                # Numerical columns get their dtype generalised.
                generalised = fuat.generalise_dtype(
                    source_dtype[column_name], numerical_dtype)
                sample_dtype.append((column_name, generalised))
            elif column_name in self.categorical_indices:
                # Categorical columns keep their original dtype.
                sample_dtype.append(
                    (column_name, source_dtype[column_name]))
            else:
                assert False, 'Unknown column name.'  # pragma: nocover
    elif fuav.is_numerical_array(self.dataset):
        sample_dtype = fuat.generalise_dtype(source_dtype, numerical_dtype)
    else:
        sample_dtype = source_dtype
    self.sample_dtype = sample_dtype
def _interpolate_array(
        dataset: np.ndarray,
        feature_index: Union[int, str],  # yapf: disable
        treat_as_categorical: bool,
        steps_number: Union[int, None]) -> Tuple[np.ndarray, np.ndarray]:
    """
    Stacks copies of a dataset with one feature fixed to interpolated values.

    For a numerical feature the interpolated values are ``steps_number``
    evenly spaced numbers between the minimum and the maximum of the selected
    column. For a categorical feature they are all the unique values found in
    that column (in sorted order). The dataset is then repeated along a new
    second axis -- once per interpolated value -- and in every copy the
    selected feature is overwritten with the corresponding interpolated value
    (the same value for all the rows of that copy).

    Parameters
    ----------
    dataset : numpy.ndarray
        A dataset based on which interpolation will be done.
    feature_index : Union[integer, string]
        An index of the feature column in the input dataset for which the
        interpolation will be computed.
    treat_as_categorical : boolean
        Whether to treat the selected feature as categorical or numerical.
    steps_number : Union[integer, None]
        The number of evenly spaced samples between the minimum and the
        maximum value of the selected feature. It is only used for numerical
        features; for categorical features it is ignored (regardless of
        whether it is a number or ``None``) and replaced with the number of
        unique values.

    Returns
    -------
    interpolated_data : numpy.ndarray
        The input ``dataset`` repeated along a new axis 1 with the selected
        feature replaced by the interpolated values. For a classic 2-D array
        of shape (n_samples, n_features) the result has shape
        (n_samples, steps_number, n_features); for a 1-D structured array it
        has shape (n_samples, steps_number). If the dtype of the interpolated
        values differs from the dtype of the selected column (or of the whole
        classic array), the data are first cast to the dtype returned by
        ``fuat.generalise_dtype``.
    interpolated_values : numpy.ndarray
        A 1-dimensional array of shape (steps_number, ) holding the
        interpolated values.
    """
    assert isinstance(dataset, np.ndarray), 'Dataset -> numpy array.'
    assert isinstance(feature_index, (int, str)), 'Feature index -> str/ int.'
    assert isinstance(treat_as_categorical, bool), 'As categorical -> bool.'
    assert (steps_number is None
            or isinstance(steps_number, int)), 'Steps number -> None/ int.'

    is_structured = fuav.is_structured_array(dataset)
    column = dataset[feature_index] if is_structured \
        else dataset[:, feature_index]

    if treat_as_categorical:
        # numpy.unique returns the unique values already sorted.
        interpolated_values = np.unique(column)
        # The steps number is dictated by the number of unique values.
        steps_number = interpolated_values.shape[0]
    else:
        assert isinstance(steps_number, int), 'Steps number must be an int.'
        lower, upper = column.min(), column.max()
        interpolated_values = np.linspace(lower, upper, steps_number)

    # Make sure that the dataset can hold the interpolated values.
    values_dtype = interpolated_values.dtype
    if is_structured:
        if dataset.dtype[feature_index] != values_dtype:
            # Only the selected field changes its type.
            widened_fields = [
                (name,
                 fuat.generalise_dtype(values_dtype, dataset.dtype[name])
                 if name == feature_index else dataset.dtype[name])
                for name in dataset.dtype.names
            ]
            dataset = dataset.astype(widened_fields)
    elif dataset.dtype != values_dtype:
        dataset = dataset.astype(
            fuat.generalise_dtype(values_dtype, dataset.dtype))

    interpolated_data = np.repeat(dataset[:, np.newaxis], steps_number, axis=1)
    assert len(interpolated_values) == steps_number, 'Required for broadcast.'

    if is_structured:
        # The field view has shape (n_samples, steps_number), so the vector
        # of values is broadcast over all the rows at once.
        interpolated_data[feature_index] = interpolated_values
    else:
        # Broadcast the vector of values over all the rows.
        interpolated_data[:, :, feature_index] = interpolated_values

    return interpolated_data, interpolated_values
def test_generalise_dtype():
    """
    Tests :func:`fatf.utils.array.tools.generalise_dtype`.

    First checks the error raised for a non-base (datetime) dtype passed as
    either argument, then checks the generalised dtype for a range of
    number/string dtype pairs.
    """
    error_msg = 'The {} dtype is not one of the base types (strings/numbers).'

    # (position named in the error message, first type, second type)
    failing_cases = [
        ('first', np.datetime64, np.datetime64),
        ('second', np.float64, np.datetime64),
    ]
    for position, first_type, second_type in failing_cases:
        with pytest.raises(ValueError) as exin:
            fuat.generalise_dtype(np.dtype(first_type), np.dtype(second_type))
        assert str(exin.value) == error_msg.format(position)

    d_int = np.dtype(int)
    d_int32 = np.dtype(np.int32)
    d_int64 = np.dtype(np.int64)
    d_float = np.dtype(float)
    d_float16 = np.dtype(np.float16)
    d_float32 = np.dtype(np.float32)
    d_float64 = np.dtype(np.float64)
    d_str = np.dtype(str)
    d_str4 = np.dtype('U4')
    d_str11 = np.dtype('U11')
    d_str16 = np.dtype('U16')
    d_str21 = np.dtype('U21')
    d_str32 = np.dtype('U32')

    # The result is expected to be the very same dtype object.
    identity_cases = [
        (d_int64, d_int, d_int32),
        (d_int64, d_int, d_int64),
        (d_int64, d_int32, d_int64),
        (d_int64, d_int, d_int),
        (d_float64, d_float, d_float),
        (d_float64, d_float64, d_float),
        (d_float64, d_int, d_float32),
        (d_float64, d_int32, d_float32),
        (d_float32, d_float32, d_float16),
        (d_str4, d_str, d_str4),
        (d_str21, d_str21, d_str4),
    ]
    for expected, first, second in identity_cases:
        assert expected is fuat.generalise_dtype(first, second)

    # The result is only expected to compare equal to the dtype.
    equality_cases = [
        (d_str16, d_str11, d_str16),
        (d_str11, d_int32, d_str4),
        (d_str21, d_int64, d_str4),
        (d_str32, d_float32, d_str4),
        (d_str32, d_float64, d_str16),
    ]
    for expected, first, second in equality_cases:
        assert expected == fuat.generalise_dtype(first, second)