def test_preprocessor_encode_drop(self):
    """
    Check the DROP encoding strategy on label batches.

    Half of the columns are declared categorical; the test asserts that the
    transformed output is 2D with only the remaining columns, and that
    `inverse_transform` raises `NotImplementedError`.
    """
    dataset = new_labels(random_seed=0)
    total_columns = 10
    categorical_columns = list(range(total_columns // 2))
    preprocessor = StandardPreprocessor(
        categorical=categorical_columns,
        normalization=None,
        encoding=EncodingStrategy.DROP,
        threshold=-1,
    )
    expected_columns = total_columns - len(categorical_columns)

    batch_count = 10
    for _ in range(batch_count):
        X, y = dataset.input_fn()

        # Both features and labels come out of the dataset as 1D arrays
        self.assertEqual(1, len(shape_of_array(X)))
        self.assertEqual(1, len(shape_of_array(y)))

        # Repeat the same labels once per column to build the columnar input
        stacked_labels = [y for _ in range(total_columns)]
        fitted = preprocessor.fit(stacked_labels)
        y_ = fitted.transform(stacked_labels)
        self.assertEqual(2, y_.ndim)
        self.assertEqual(expected_columns, y_.shape[0])

        # Dropped columns cannot be recovered, so the reverse transformation must fail
        with self.assertRaises(NotImplementedError):
            preprocessor.inverse_transform(y_)
def test_preprocessor_encode_onehot(self):
    """
    Check the ONEHOT encoding strategy on label batches.

    The test asserts that the encoded output is 2D with one entry per unique
    label along the first axis, that its dtype is `DTYPE_UINT8[0]`, and that
    `inverse_transform` gives back the original labels.
    """
    dataset = new_labels(random_seed=0)
    preprocessor = StandardPreprocessor(
        categorical=[0],
        normalization=None,
        encoding=EncodingStrategy.ONEHOT,
        threshold=-1,
    )

    batch_count = 10
    for _ in range(batch_count):
        X, y = dataset.input_fn()

        # Both features and labels come out of the dataset as 1D arrays
        self.assertEqual(1, len(shape_of_array(X)))
        self.assertEqual(1, len(shape_of_array(y)))

        # Encode the labels
        fitted = preprocessor.fit(y)
        encoded = fitted.transform(y)
        encoded_shape = shape_of_array(encoded)
        self.assertEqual(2, len(encoded_shape))
        self.assertEqual(len(unique(y)), shape_of_array(encoded)[0])
        self.assertEqual(DTYPE_UINT8[0], encoded.dtype)

        # Decode and compare against the original labels
        decoded = preprocessor.inverse_transform(encoded)
        self.assertEqual(1, len(shape_of_array(decoded)))
        self.assertListEqual(y.tolist(), decoded.tolist())
def test_preprocessor_normalization(self):
    """
    Check default preprocessing of continuous features and targets.

    Each batch is rescaled to a wide range first; the test then asserts that
    the transformed data has a mean that rounds to zero and that neither its
    minimum nor its maximum is close to zero.
    """
    dataset = new_line(random_seed=0)
    target_preprocessor = StandardPreprocessor(continuous=[0], threshold=-1)
    feature_preprocessor = StandardPreprocessor(continuous=[0], threshold=-1)

    batch_count = 8
    for _ in range(batch_count):
        X, y = dataset.input_fn()

        # Both features and targets come out of the dataset as 1D arrays
        self.assertEqual(1, len(shape_of_array(X)))
        self.assertEqual(1, len(shape_of_array(y)))

        # Widen the value range so that normalization has real work to do
        X = (X - 0.5) * 100
        y = (y - 0.5) * 100
        self.assertGreaterEqual(X.max() - X.min(), 1)
        self.assertGreaterEqual(y.max() - y.min(), 1)

        # Features
        X_ = feature_preprocessor.fit(X).transform(X)
        self.assertEqual(1, X_.ndim)
        feature_mean = X_.mean()
        self.assertEqual(round(feature_mean), 0, X_.mean())
        self.assertNotAlmostEqual(X_.min(), 0)
        self.assertNotAlmostEqual(X_.max(), 0)

        # Targets
        y_ = target_preprocessor.fit(y).transform(y)
        target_mean = y_.mean()
        self.assertEqual(round(target_mean), 0, y_.mean())
        self.assertNotAlmostEqual(y_.min(), 0)
        self.assertNotAlmostEqual(y_.max(), 0)
def input_to_tensor(self, arr, flatten: bool = True) -> Tensor:
    """
    Turn an n-dim input array into a float tensor.

    Every entry of `arr` must have the same trailing shape as the first one.
    Inputs with at most two dimensions are transposed (column-first to
    sample-first). The tensor is moved to CUDA when `self._is_cuda` is set,
    and passed through `self._flattener` when `flatten` is requested, the
    learner is not a `HighDimensionalMixin` and the input has more than two
    dimensions.
    """
    # Matrix multiplication performs better with sample-first data, hence the
    # transpose further down; ideally the NN framework would accept columnar input.
    expected_shape = shape_of_array(arr[0])[1:]
    for column in arr[1:]:
        feature_shape = shape_of_array(column)[1:]
        assert expected_shape == feature_shape, (
            "Consistent shape required for all input features. "
            f"Found {feature_shape}, expected {expected_shape}, input {shape_of_array(arr)}."
        )

    matrix = numpy.asarray(arr, dtype=DTYPE_FLOAT[0])
    if matrix.ndim <= 2:
        matrix = numpy.transpose(matrix)

    tensor = from_numpy(matrix).float()
    if self._is_cuda:
        tensor = tensor.cuda()

    # Nothing left to do unless flattening applies to this learner and input
    if not flatten or isinstance(self, HighDimensionalMixin) or matrix.ndim <= 2:
        return tensor

    # TODO: this should be done at the base library instead of by PyTorch
    return self._flattener(tensor)
def test_get_shape_of_objects_ndarray(self):
    """
    Check `shape_of_array` on an object ndarray whose items are arrays.

    The test asserts that the reported shape includes the length of the
    nested arrays, which differs from what the `.shape` attribute gives.
    """
    expected = (10, 2)
    row_count, row_width = expected

    nested = numpy.empty(row_count, dtype=object)
    for row in range(row_count):
        nested[row] = numpy.array(list(range(row_width)))

    self.assertEqual(expected, shape_of_array(nested))
    self.assertNotEqual(expected, nested.shape)
def test_concat_arrays(self):
    """
    Check `concat_arrays` on 1D, 2D, one-hot and image inputs.

    For each kind of input the test asserts that the trailing shape of the
    result matches the first input array and that its length equals
    `sample_size * array_count`.
    """
    array_count = 4
    sample_size = 128

    def check_concat(arrays):
        # Shared assertions: trailing shape is preserved and lengths add up
        merged = concat_arrays(*arrays)
        self.assertEqual(
            shape_of_array(arrays[0])[1:], shape_of_array(merged)[1:])
        self.assertEqual(len(merged), sample_size * array_count)

    # 1D arrays
    check_concat([
        generate_array_ints(n=sample_size) for _ in range(array_count)
    ])

    # 2D arrays (2 columns)
    check_concat([
        generate_array_ints(n=sample_size * 2).reshape(-1, 2)
        for _ in range(array_count)
    ])

    # N-D arrays (array of vectors)
    check_concat(generate_onehot_matrix(n=sample_size, ndim=array_count))

    # N-D arrays (array of images)
    check_concat([
        generate_images(n=sample_size) for _ in range(array_count)
    ])
def input_to_tensor(self, arr: numpy.ndarray, flatten: bool = False):
    """
    Convert a batch of RGB images into a tensor via the parent implementation.

    Each image is cast to float, cropped to 224x224 and normalized before
    the whole batch is handed to `super().input_to_tensor` with flattening
    disabled.

    :param arr: batch of images; the per-sample shape must have three
        dimensions with 3 as the first one (channels first).
    :param flatten: must be falsy, this learner needs high dimensional input.
    :return: whatever the parent `input_to_tensor` returns for the batch.
    :raises AssertionError: if `flatten` is truthy or the per-sample shape is
        not three-dimensional with 3 leading channels.
    """
    assert not flatten, "Input for this learner must be high dimensional"

    # This learner ONLY accepts RGB images, since most models are trained with that
    input_shape = shape_of_array(arr)[1:]
    assert len(input_shape) == 3 and input_shape[0] == 3, (
        "%s supports only 3x224x224 features (2D matrix + RGB channels), found %r" %
        (self.__class__.__name__, input_shape))

    # Apply transformations that put input images in the expected format
    # https://github.com/pytorch/examples/blob/42e5b996718797e45c46a25c55b031e6768f8440/imagenet/main.py#L89-L101
    # NOTE: the builtin `float` is used because the `numpy.float` alias (which
    # pointed to it) was deprecated in NumPy 1.20 and removed in NumPy 1.24.
    arr_transformed = [
        normalize(crop(numpy.array(img).astype(float), height=224, width=224))
        for img in arr
    ]

    # Use super's implementation to convert the numpy array to a tensor
    return super().input_to_tensor(arr_transformed, flatten=False)
def test_get_shape_of_ndarray(self):
    """Check that `shape_of_array` returns the shape an ndarray was allocated with."""
    candidate_shapes = [(10, ), (10, 10), (10, 10, 10), (20, 10, 10)]
    for expected in candidate_shapes:
        allocated = numpy.empty(expected)
        self.assertEqual(expected, shape_of_array(allocated))
def test_get_shape_of_list_2d(self):
    """Check that `shape_of_array` returns (rows, columns) for a list of equally sized lists."""
    row_count, column_count = 10, 2

    nested = []
    for _ in range(row_count):
        nested.append(list(range(column_count)))

    self.assertEqual((row_count, column_count), shape_of_array(nested))
def test_get_shape_of_list_1d(self):
    """Check that `shape_of_array` returns a one-element shape for a flat list."""
    length = 10
    flat = [*range(length)]
    self.assertEqual((length, ), shape_of_array(flat))