def test_transform(self):
    """Spot-check that both scalers produce matching transformed values.

    Fits ``SkMinMaxScaler`` and ``RaslMinMaxScaler`` with default settings
    on three columns of ``self.go_sales[0]``, transforms the same data with
    each, and compares all three columns at rows 0, 10 and 20.
    """
    selected = ["Product number", "Quantity", "Retailer code"]
    table = self.go_sales[0][selected]

    reference = SkMinMaxScaler()
    candidate = RaslMinMaxScaler()
    reference_fitted = reference.fit(table)
    candidate_fitted = candidate.fit(table)

    expected = reference_fitted.transform(table)
    actual = candidate_fitted.transform(table)

    # The reference result is indexed positionally, the RASL result via .iloc.
    for row in (0, 10, 20):
        for col in range(3):
            self.assertAlmostEqual(expected[row, col], actual.iloc[row, col])
def test_transform_range(self):
    """Spot-check transformed values with ``feature_range=(-5, 5)``.

    ``SkMinMaxScaler`` is fitted and applied on the original data, while
    ``RaslMinMaxScaler`` is fitted and applied on the result of
    ``lale.datasets.pandas2spark``. The RASL output is converted back with
    ``toPandas()`` and compared at rows 0, 10 and 20 over all three columns.
    """
    selected = ["Product number", "Quantity", "Retailer code"]
    table = self.go_sales[0][selected]
    spark_table = lale.datasets.pandas2spark(table)

    reference = SkMinMaxScaler(feature_range=(-5, 5))
    candidate = RaslMinMaxScaler(feature_range=(-5, 5))
    reference_fitted = reference.fit(table)
    candidate_fitted = candidate.fit(spark_table)

    expected = reference_fitted.transform(table)
    actual = candidate_fitted.transform(spark_table)
    # Convert back so the result can be indexed positionally via .iloc.
    actual = actual.toPandas()

    for row in (0, 10, 20):
        for col in range(3):
            self.assertAlmostEqual(expected[row, col], actual.iloc[row, col])
def test_fit(self):
    """Fit both default scalers on the same data and compare their state.

    The comparison itself is delegated to ``self._check_trained``.
    """
    selected = ["Product number", "Quantity", "Retailer code"]
    table = self.go_sales[0][selected]

    reference = SkMinMaxScaler()
    candidate = RaslMinMaxScaler()
    reference_fitted = reference.fit(table)
    candidate_fitted = candidate.fit(table)

    self._check_trained(reference_fitted, candidate_fitted)
def test_fit_range(self):
    """Fit both scalers with ``feature_range=(-5, 5)`` and compare state.

    ``SkMinMaxScaler`` is fitted on the original data and
    ``RaslMinMaxScaler`` on the output of ``lale.datasets.pandas2spark``;
    the comparison is delegated to ``self._check_trained``.
    """
    selected = ["Product number", "Quantity", "Retailer code"]
    table = self.go_sales[0][selected]
    spark_table = lale.datasets.pandas2spark(table)

    reference = SkMinMaxScaler(feature_range=(-5, 5))
    candidate = RaslMinMaxScaler(feature_range=(-5, 5))
    reference_fitted = reference.fit(table)
    candidate_fitted = candidate.fit(spark_table)

    self._check_trained(reference_fitted, candidate_fitted)
def test_partial_fit(self):
    """Feed both scalers the data in three batches, comparing after each.

    The batches are rows ``[:10]``, ``[10:100]`` and ``[100:]``. After every
    ``partial_fit`` call the two returned objects are compared with
    ``self._check_trained``.
    """
    selected = ["Product number", "Quantity", "Retailer code"]
    table = self.go_sales[0][selected]
    batches = (table[:10], table[10:100], table[100:])

    reference = SkMinMaxScaler()
    candidate = RaslMinMaxScaler()

    for batch in batches:
        reference_fitted = reference.partial_fit(batch)
        candidate_fitted = candidate.partial_fit(batch)
        self._check_trained(reference_fitted, candidate_fitted)
def test_fit_transform(self, feature_range):
    """Tests fit_transform against scikit-learn.

    Builds 1500 blob samples, applies a fixed linear transformation, and
    scales the result with ``SkMinMaxScaler`` (on the NumPy array) and with
    ``MinMaxScaler`` (on a ``ds.array`` with 300x2 blocks). The scaled
    values, the learned ``data_min_``/``data_max_`` and the block layout of
    the output are then compared.

    Args:
        feature_range: Value forwarded as ``feature_range`` to both scalers.
    """
    n_samples = 1500
    # The labels returned by make_blobs are not needed here.
    x, _ = make_blobs(n_samples=n_samples, random_state=170)
    transformation = [[0.6, -0.6], [-0.4, 0.8]]
    x = np.dot(x, transformation)
    ds_arr = ds.array(x, block_size=(300, 2))

    sc1 = SkMinMaxScaler(feature_range=feature_range)
    scaled_x = sc1.fit_transform(x)
    sc2 = MinMaxScaler(feature_range=feature_range)
    ds_scaled = sc2.fit_transform(ds_arr)

    self.assertTrue(np.allclose(scaled_x, ds_scaled.collect()))
    self.assertTrue(np.allclose(sc1.data_min_, sc2.data_min_.collect()))
    self.assertTrue(np.allclose(sc1.data_max_, sc2.data_max_.collect()))
    # Synchronise the first block before reading its shape, as the sibling
    # test_irregular does, so the check does not rely on the block already
    # being a concrete array.
    self.assertEqual(
        ds_scaled._top_left_shape,
        compss_wait_on(ds_scaled._blocks[0][0]).shape,
    )
    self.assertEqual(ds_arr._reg_shape, ds_scaled._reg_shape)
    self.assertEqual(ds_arr._top_left_shape, ds_scaled._top_left_shape)
    self.assertEqual(ds_arr.shape, ds_scaled.shape)
    self.assertEqual(ds_arr._n_blocks, ds_scaled._n_blocks)
def test_irregular(self, feature_range):
    """Test fit_transform on an irregular array.

    Same setup as the regular case, but both the ``ds.array`` and the NumPy
    array are sliced to rows 297..601 before scaling, so the slice does not
    line up with the 300-row block size.

    Args:
        feature_range: Value forwarded as ``feature_range`` to both scalers.
    """
    n_samples = 1500
    window = slice(297, 602)

    points, _ = make_blobs(n_samples=n_samples, random_state=170)
    points = np.dot(points, [[0.6, -0.6], [-0.4, 0.8]])
    ds_arr = ds.array(points, block_size=(300, 2))
    ds_arr = ds_arr[window]
    points = points[window]

    sk_scaler = SkMinMaxScaler(feature_range=feature_range)
    expected = sk_scaler.fit_transform(points)
    ds_scaler = MinMaxScaler(feature_range=feature_range)
    ds_scaled = ds_scaler.fit_transform(ds_arr)

    self.assertTrue(np.allclose(expected, ds_scaled.collect()))
    for fitted_attr in ("data_min_", "data_max_"):
        self.assertTrue(
            np.allclose(
                getattr(sk_scaler, fitted_attr),
                getattr(ds_scaler, fitted_attr).collect(),
            )
        )

    first_block = compss_wait_on(ds_scaled._blocks[0][0])
    self.assertEqual(ds_scaled._top_left_shape, first_block.shape)

    # Input and output must agree on these layout attributes.
    for layout_attr in ("_reg_shape", "_top_left_shape", "shape", "_n_blocks"):
        self.assertEqual(
            getattr(ds_arr, layout_attr), getattr(ds_scaled, layout_attr)
        )
def test_get_params(self):
    """Check that the RASL scaler reports every scikit-learn parameter.

    Each key returned by ``SkMinMaxScaler().get_params()`` must be present
    in ``RaslMinMaxScaler().get_params()`` with an equal value; extra keys
    on the RASL side are allowed.
    """
    sk_scaler = SkMinMaxScaler()
    rasl_scaler = RaslMinMaxScaler()
    sk_params = sk_scaler.get_params()
    rasl_params = rasl_scaler.get_params()
    # assertDictContainsSubset was deprecated in Python 3.2 and removed in
    # 3.12. Overlaying sk_params on rasl_params leaves the dict unchanged
    # exactly when sk_params is a subset of rasl_params.
    self.assertEqual(rasl_params, {**rasl_params, **sk_params})