def test_encode_util(values, expected):
    """Exercise ``_encode`` in each of its calling modes.

    Parameters
    ----------
    values : array-like
        Input passed to ``_encode``; presumably supplied through pytest
        parametrization, which is not visible here.
    expected : array-like
        The unique values ``_encode`` is expected to report for ``values``.
    """
    # Integer codes every ``encode=True`` call must produce for ``values``.
    expected_codes = np.array([1, 0, 2, 0, 2])

    # Uniques only.
    found = _encode(values)
    assert_array_equal(found, expected)

    # Uniques plus codes, with the uniques discovered by the call itself.
    found, codes = _encode(values, encode=True)
    assert_array_equal(found, expected)
    assert_array_equal(codes, expected_codes)

    # Codes computed against uniques that are passed in explicitly.
    _, codes = _encode(values, found, encode=True)
    assert_array_equal(codes, expected_codes)
def test_encode_util(values, expected):
    """Exercise ``_encode`` in each of its calling modes.

    Parameters
    ----------
    values : array-like
        Input passed to ``_encode``; presumably supplied through pytest
        parametrization, which is not visible here.
    expected : array-like
        The unique values ``_encode`` is expected to report for ``values``.
    """
    # Integer codes every ``encode=True`` call must produce for ``values``.
    expected_codes = np.array([1, 0, 2, 0, 2])

    # Uniques only.
    found = _encode(values)
    assert_array_equal(found, expected)

    # Uniques plus codes, with the uniques discovered by the call itself.
    found, codes = _encode(values, encode=True)
    assert_array_equal(found, expected)
    assert_array_equal(codes, expected_codes)

    # Codes computed against uniques that are passed in explicitly.
    _, codes = _encode(values, found, encode=True)
    assert_array_equal(codes, expected_codes)
def transform(self, y):
    """Transform labels to normalized encoding.

    If ``self.fill_unseen_labels`` is ``True``, labels that are not in
    ``self.classes_`` are encoded with ``self.fill_encoded_label_value``,
    or with ``len(self.classes_)`` when that attribute is ``None``.
    Otherwise encoding is delegated to ``_encode`` against
    ``self.classes_``.

    Parameters
    ----------
    y : array-like of shape [n_samples]
        Label values.

    Returns
    -------
    y_encoded : array-like of shape [n_samples]
        Encoded label values. An empty input yields an empty array.
    """
    check_is_fitted(self, "classes_")
    y = column_or_1d(y, warn=True)

    # transform of empty array is empty array
    if _num_samples(y) == 0:
        return np.array([])

    if not self.fill_unseen_labels:
        _, y_encoded = _encode(y, uniques=self.classes_, encode=True)
        return y_encoded

    _, mask = _encode_check_unknown(y, self.classes_, return_mask=True)
    # NOTE(review): searchsorted needs ``self.classes_`` to be sorted;
    # confirm that fitting guarantees it. Positions computed for unseen
    # labels are meaningless and are overwritten just below.
    y_encoded = np.searchsorted(self.classes_, y)

    # Compare against None explicitly: a plain ``or`` would discard a
    # deliberately configured fill value of 0.
    fill_value = self.fill_encoded_label_value
    if fill_value is None:
        fill_value = len(self.classes_)
    y_encoded[~mask] = fill_value
    return y_encoded
def _transform(self, X, handle_unknown='error'):
    """Encode each column of ``X`` against the fitted categories.

    Parameters
    ----------
    X : array-like
        Data to encode; validated by ``self._check_X`` and then treated as
        a 2-D array whose column ``i`` is matched with
        ``self.categories_[i]``.
    handle_unknown : str, default='error'
        With ``'error'`` a ``ValueError`` is raised as soon as a column
        contains a value missing from its categories. Any other value
        makes unknown entries get flagged in the returned mask instead.

    Returns
    -------
    X_int : ndarray of int, same shape as ``X``
        Codes produced by ``_encode`` for every column.
    X_mask : ndarray of bool, same shape as ``X``
        ``False`` where the original entry was an unknown category.

    Raises
    ------
    ValueError
        If unknown categories are found and ``handle_unknown == 'error'``.
    """
    X = self._check_X(X)

    _, n_features = X.shape
    # Builtin ``int``/``bool`` replace the ``np.int``/``np.bool`` aliases,
    # which no longer exist in current NumPy releases.
    X_int = np.zeros_like(X, dtype=int)
    X_mask = np.ones_like(X, dtype=bool)

    for i in range(n_features):
        Xi = X[:, i]
        diff, valid_mask = _encode_check_unknown(
            Xi, self.categories_[i], return_mask=True)

        if not np.all(valid_mask):
            if handle_unknown == 'error':
                msg = ("Found unknown categories {0} in column {1}"
                       " during transform".format(diff, i))
                raise ValueError(msg)
            # Set the problematic rows to an acceptable value and
            # continue. The rows are marked in `X_mask` and will be
            # removed later.
            X_mask[:, i] = valid_mask
            # Work on a copy so the caller's data is left untouched.
            Xi = Xi.copy()
            Xi[~valid_mask] = self.categories_[i][0]

        _, encoded = _encode(Xi, self.categories_[i], encode=True)
        X_int[:, i] = encoded

    return X_int, X_mask
def _fit(self, X, handle_unknown='error'):
    """Determine the categories of every column and store them.

    Parameters
    ----------
    X : array-like
        Training data; validated by ``self._check_X`` and then treated as
        a 2-D array with one column per feature.
    handle_unknown : str, default='error'
        Only consulted when categories were given explicitly: with
        ``'error'`` a ``ValueError`` is raised if a column holds a value
        that is absent from its supplied categories.

    Raises
    ------
    ValueError
        If explicit numerical categories are unsorted, if the number of
        category lists differs from the number of columns, or if unknown
        values are found while ``handle_unknown == 'error'``.

    Notes
    -----
    The result is stored in ``self.categories_`` as a list with one entry
    per column.
    """
    X = self._check_X(X)
    _, n_features = X.shape

    # Validate user-supplied categories before touching any column.
    if self._categories != 'auto':
        if X.dtype != object:
            for given in self._categories:
                already_sorted = np.all(np.sort(given) == np.array(given))
                if not already_sorted:
                    raise ValueError("Unsorted categories are not "
                                     "supported for numerical categories")
        if len(self._categories) != n_features:
            raise ValueError("Shape mismatch: if n_values is an array,"
                             " it has to be of shape (n_features,).")

    self.categories_ = []

    for col in range(n_features):
        column = X[:, col]

        if self._categories == 'auto':
            # Categories are whatever unique values the column contains.
            self.categories_.append(_encode(column))
            continue

        col_cats = np.array(self._categories[col], dtype=X.dtype)
        if handle_unknown == 'error':
            unknown = _encode_check_unknown(column, col_cats)
            if unknown:
                msg = ("Found unknown categories {0} in column {1}"
                       " during fit".format(unknown, col))
                raise ValueError(msg)
        self.categories_.append(col_cats)
def fit_transform(self, y):
    """Fit label encoder and return encoded labels.

    ``self.fill_unseen_labels`` is not consulted by this method.

    Parameters
    ----------
    y : array-like of shape [n_samples]
        Label values.

    Returns
    -------
    y_encoded : array-like of shape [n_samples]
        Encoded label values.
    """
    y = column_or_1d(y, warn=True)

    preset_labels = self._check_labels_and_sort()
    # A truthy result is used as the class list; otherwise the classes
    # are derived from ``y`` itself.
    if preset_labels:
        fitted = _encode(y, uniques=preset_labels, encode=True)
    else:
        fitted = _encode(y, encode=True)

    self.classes_, y_encoded = fitted
    return y_encoded
def fit(self, y):
    """Fit label encoder.

    Parameters
    ----------
    y : array-like of shape (n_samples,)
        Label values.

    Returns
    -------
    self : RobustLabelEncoder.
    """
    y = column_or_1d(y, warn=True)

    preset_labels = self._check_labels_and_sort()
    # Fall back to the unique values found in ``y`` when the helper
    # returns nothing truthy.
    self.classes_ = preset_labels if preset_labels else _encode(y)
    return self
def test_encode_check_unknown():
    """Test the ``check_unknown`` parameter of ``_encode``."""
    unseen_error = 'y contains previously unseen labels'

    # Numeric input: the value 4 is absent from the known uniques.
    known = np.array([1, 2, 3])
    data = np.array([1, 2, 3, 4])

    # With the check enabled the unseen value must raise.
    with pytest.raises(ValueError, match=unseen_error):
        _encode(data, known, encode=True, check_unknown=True)

    # With the check disabled the call must complete without raising.
    _encode(data, known, encode=True, check_unknown=False)

    # Object dtype: the parameter is ignored, so the unseen value 'd'
    # raises even though the check is disabled.
    known = np.array(['a', 'b', 'c'], dtype=object)
    data = np.array(['a', 'b', 'c', 'd'], dtype=object)
    with pytest.raises(ValueError, match=unseen_error):
        _encode(data, known, encode=True, check_unknown=False)
def test_encode_check_unknown():
    """Test the ``check_unknown`` parameter of ``_encode``."""
    unseen_error = 'y contains previously unseen labels'

    # Numeric input: the value 4 is absent from the known uniques.
    known = np.array([1, 2, 3])
    data = np.array([1, 2, 3, 4])

    # With the check enabled the unseen value must raise.
    with pytest.raises(ValueError, match=unseen_error):
        _encode(data, known, encode=True, check_unknown=True)

    # With the check disabled the call must complete without raising.
    _encode(data, known, encode=True, check_unknown=False)

    # Object dtype: the parameter is ignored, so the unseen value 'd'
    # raises even though the check is disabled.
    known = np.array(['a', 'b', 'c'], dtype=object)
    data = np.array(['a', 'b', 'c', 'd'], dtype=object)
    with pytest.raises(ValueError, match=unseen_error):
        _encode(data, known, encode=True, check_unknown=False)
# ## Using LSTMs

# In[891]:

max_features = 20000
# cut texts after this number of words
# (among top max_features most common words)
maxlen = 100
batch_size = 32

print('Loading data...')

# Single label vocabulary used for both splits, so the train and test
# encodings cannot drift apart.
# NOTE(review): the labels are not in sorted order; confirm that `_encode`
# accepts unsorted `uniques` for the dtype of y_train / y_test.
LABEL_CLASSES = np.array([
    'half-true',
    'mostly-true',
    'false',
    'true',
    'barely-true',
    'pants-fire',
])

# Encoding Training Labels
y_train = column_or_1d(y_train, warn=True)
classes_, encoded_values = _encode(
    y_train, uniques=LABEL_CLASSES, encode=True)
y_train = encoded_values

# Encoding Testing Labels
y_test = column_or_1d(y_test, warn=True)
classes_, encoded_values = _encode(
    y_test, uniques=LABEL_CLASSES, encode=True)
y_test = encoded_values