def fit_before_convert(self, dataset):
    """Fit the label encoder on raw (not yet encoded) target data.

    Args:
        dataset: The target data. A `tf.data.Dataset` is taken to be
            encoded already and is ignored. A `pd.DataFrame` or `pd.Series`
            is replaced by its underlying values (a Series is reshaped to a
            single column). Anything else must support `flatten()`, `len()`
            and `shape` -- presumably an `np.ndarray`; confirm with callers.

    Raises:
        ValueError: If the data contains fewer than 2 distinct labels.
    """
    # If in tf.data.Dataset, must be encoded already.
    if isinstance(dataset, tf.data.Dataset):
        return

    # Convert the data to np.ndarray.
    if isinstance(dataset, pd.DataFrame):
        dataset = dataset.values
    elif isinstance(dataset, pd.Series):
        dataset = dataset.values.reshape(-1, 1)

    # Flatten once; the result is reused for the "already encoded" check
    # and for collecting the distinct labels.
    flat = dataset.flatten()

    # If encoded: more than one value per sample means there is nothing to
    # fit, only (optionally) a shape to validate.
    # TODO: support raw string labels for multi-label.
    if len(flat) != len(dataset):
        if self.num_classes:
            self._check_data_shape(dataset.shape[1:])
        return

    # Fit encoder.
    labels = set(flat)
    if len(labels) < 2:
        # Report the number of classes actually found in the data;
        # self.num_classes may be unset or hold a user-supplied value.
        raise ValueError(
            'Expect the target data for {name} to have '
            'at least 2 classes, but got {num_classes}.'.format(
                name=self.name, num_classes=len(labels)))

    # Exactly two labels in a single-label task get a LabelEncoder;
    # every other case gets a OneHotEncoder.
    if len(labels) == 2 and not self.multi_label:
        self.label_encoder = encoders.LabelEncoder()
    else:
        self.label_encoder = encoders.OneHotEncoder()
    self.label_encoder.fit(dataset)
def fit_before_convert(self, dataset):
    """Infer `num_classes` from the targets and fit a label encoder if needed.

    Args:
        dataset: The target data: a `tf.data.Dataset`, `pd.DataFrame`,
            `pd.Series`, or an object supporting `flatten()`, `len()` and
            `shape` (presumably an `np.ndarray` -- confirm with callers).

    Raises:
        ValueError: If the resulting number of classes is below 2.
    """
    if isinstance(dataset, tf.data.Dataset):
        # If in tf.data.Dataset, must be encoded already; only the class
        # count is filled in when it has not been set.
        if not self.num_classes:
            width = utils.dataset_shape(dataset)[0]
            # Single column with 0s and 1s counts as two classes.
            self.num_classes = 2 if width == 1 else width
        return

    if isinstance(dataset, pd.DataFrame):
        dataset = dataset.values
    elif isinstance(dataset, pd.Series):
        dataset = dataset.values.reshape(-1, 1)

    flat = dataset.flatten()

    # Not label: several values per sample, so the second dimension is
    # taken as the number of classes and no encoder is fitted.
    if len(flat) != len(dataset):
        self.num_classes = dataset.shape[1]
        return

    distinct_labels = set(flat)
    if self.num_classes is None:
        self.num_classes = len(distinct_labels)

    class_count = self.num_classes
    if class_count < 2:
        raise ValueError(
            'Expect the target data for {name} to have '
            'at least 2 classes, but got {num_classes}.'.format(
                name=self.name, num_classes=self.num_classes))

    if class_count == 2:
        self.label_encoder = encoders.LabelEncoder()
    elif class_count > 2:
        self.label_encoder = encoders.OneHotEncoder()
    self.label_encoder.fit(dataset)
def test_wrong_num_classes_error():
    """Fitting a 3-class OneHotEncoder on two distinct labels raises ValueError."""
    targets = np.array(["a", "b", "a"])
    one_hot_encoder = encoders.OneHotEncoder(num_classes=3)

    with pytest.raises(ValueError) as raised:
        one_hot_encoder.fit(targets)

    message = str(raised.value)
    assert "Expect 3 classes in the training targets" in message
def test_one_hot_encoder_deserialize_transforms_to_np():
    """A serialize/deserialize round trip keeps the encoder able to one-hot encode."""
    fitted = encoders.OneHotEncoder()
    fitted.fit(np.array(["a", "b", "a"]))

    config = encoders.serialize(fitted)
    restored = encoders.deserialize(config)
    encoded = restored.encode(np.array(["a"]))

    # Either column may be assigned to "a", so both one-hot rows are accepted.
    accepted_rows = ([[1, 0]], [[0, 1]])
    assert any(np.array_equal(encoded, row) for row in accepted_rows)
def test_one_hot_encoder_decode_to_same_string():
    """Decoding an encoded label yields the original string."""
    one_hot_encoder = encoders.OneHotEncoder()
    one_hot_encoder.fit(np.array(["a", "b", "a"]))

    encoded = one_hot_encoder.encode(np.array(["a"]))
    decoded = one_hot_encoder.decode(encoded)

    assert decoded[0] == "a"