Beispiel #1
0
    def fit_before_convert(self, dataset):
        """Fit the label encoder on raw (not yet encoded) target data.

        No encoder is fitted when `dataset` is a `tf.data.Dataset` or when it
        holds more than one value per row; both cases are treated as already
        encoded targets.

        Args:
            dataset: The target data. A `tf.data.Dataset`, `pd.DataFrame`,
                `pd.Series`, or an array-like exposing `flatten()` and
                `shape` (presumably an np.ndarray).

        Raises:
            ValueError: If fewer than 2 distinct labels are found.
        """
        # If in tf.data.Dataset, must be encoded already.
        if isinstance(dataset, tf.data.Dataset):
            return

        # Convert the data to np.ndarray.
        if isinstance(dataset, pd.DataFrame):
            dataset = dataset.values
        if isinstance(dataset, pd.Series):
            dataset = dataset.values.reshape(-1, 1)

        flat_labels = dataset.flatten()

        # More values than rows means several columns per sample, which is
        # treated as already encoded.
        # TODO: support raw string labels for multi-label.
        if len(flat_labels) != len(dataset):
            if self.num_classes:
                # Only checked when num_classes was set; the helper is given
                # the per-sample shape (everything after the first axis).
                self._check_data_shape(dataset.shape[1:])
            return

        # Fit encoder.
        labels = set(flat_labels)
        if len(labels) < 2:
            # Report the number of classes actually found in the data;
            # self.num_classes may be unset or differ from the data.
            raise ValueError(
                'Expect the target data for {name} to have '
                'at least 2 classes, but got {num_classes}.'.format(
                    name=self.name, num_classes=len(labels)))
        # Exactly two labels in a non-multi-label task use the label encoder;
        # every other case uses one-hot encoding.
        if len(labels) == 2 and not self.multi_label:
            self.label_encoder = encoders.LabelEncoder()
        else:
            self.label_encoder = encoders.OneHotEncoder()
        self.label_encoder.fit(dataset)
Beispiel #2
0
 def fit_before_convert(self, dataset):
     """Infer `num_classes` and fit a label encoder for raw label targets.

     A `tf.data.Dataset` is treated as already encoded: only `num_classes`
     is filled in (when not already set) and no encoder is fitted. The same
     holds for array data with more than one value per row.

     Raises:
         ValueError: If the resulting `num_classes` is smaller than 2.
     """
     if isinstance(dataset, tf.data.Dataset):
         if not self.num_classes:
             width = utils.dataset_shape(dataset)[0]
             # A single column holds 0s and 1s, i.e. two classes.
             self.num_classes = 2 if width == 1 else width
         return

     # Unwrap pandas containers into plain arrays.
     if isinstance(dataset, pd.DataFrame):
         dataset = dataset.values
     if isinstance(dataset, pd.Series):
         dataset = dataset.values.reshape(-1, 1)

     flat_values = dataset.flatten()
     if len(flat_values) != len(dataset):
         # Several values per row: not raw labels, so the second dimension
         # is taken as the number of classes.
         self.num_classes = dataset.shape[1]
         return

     distinct_labels = set(flat_values)
     if self.num_classes is None:
         self.num_classes = len(distinct_labels)

     if self.num_classes < 2:
         raise ValueError(
             'Expect the target data for {name} to have '
             'at least 2 classes, but got {num_classes}.'.format(
                 name=self.name, num_classes=self.num_classes))
     if self.num_classes == 2:
         self.label_encoder = encoders.LabelEncoder()
     elif self.num_classes > 2:
         self.label_encoder = encoders.OneHotEncoder()
     self.label_encoder.fit(dataset)
Beispiel #3
0
def test_wrong_num_classes_error():
    """Fitting two distinct labels with `num_classes=3` raises ValueError."""
    targets = np.array(["a", "b", "a"])
    one_hot_encoder = encoders.OneHotEncoder(num_classes=3)

    with pytest.raises(ValueError) as exc_info:
        one_hot_encoder.fit(targets)

    message = str(exc_info.value)
    assert "Expect 3 classes in the training targets" in message
Beispiel #4
0
def test_one_hot_encoder_deserialize_transforms_to_np():
    """An encoder restored from its serialized form still one-hot encodes."""
    fitted = encoders.OneHotEncoder()
    fitted.fit(np.array(["a", "b", "a"]))

    restored = encoders.deserialize(encoders.serialize(fitted))
    encoded = restored.encode(np.array(["a"]))

    # Either column may stand for "a"; both one-hot layouts are accepted.
    acceptable = ([[1, 0]], [[0, 1]])
    assert any(np.array_equal(encoded, layout) for layout in acceptable)
Beispiel #5
0
def test_one_hot_encoder_decode_to_same_string():
    """Decoding an encoded label gives back the original string."""
    one_hot_encoder = encoders.OneHotEncoder()
    one_hot_encoder.fit(np.array(["a", "b", "a"]))

    encoded = one_hot_encoder.encode(np.array(["a"]))
    decoded = one_hot_encoder.decode(encoded)

    assert decoded[0] == "a"