def test_weighting(self):
    """Run a PCA pipeline on two weighted samples and compare statistics.

    After each fit/transform round the column-wise mean and the sample
    standard deviation (``ddof=1``) are collected, then compared element
    by element with ``assertAlmostEqual``.
    """
    pipeline = Pipeline()
    pipeline.add_step('pca', PCA(standardize=False))

    first_X = np.array([[1., 2.], [2., 4.], [3., 6.], [4., 8.], [5., 10.]])
    first_w = np.array([0., 1., 2., 1., 1.])
    first_y = np.array([0, 0, 0, 1, 2])

    second_X = np.array([[2., 4.], [3., 6.], [3., 6.], [4., 8.], [5., 10.]])
    second_w = np.array([0., 1., 2., 1., 1.])
    second_y = np.array([0, 0, 0, 1, 2])

    pipeline.fit(first_X, first_y, first_w)
    first_Xt, _, _ = pipeline.transform(first_X, first_y, first_w)
    first_means = first_Xt.mean(axis=0)
    first_stds = first_Xt.std(axis=0, ddof=1)

    pipeline.fit(second_X, second_y, second_w)
    second_Xt, _, _ = pipeline.transform(second_X, second_y, second_w)
    # NOTE(review): both "second" statistics are taken from the FIRST
    # transformed array, so the assertions below compare that array with
    # itself and `second_Xt` is never inspected. This looks like a
    # copy-paste slip, but the two weighted samples are not obviously
    # equivalent either -- confirm the intended comparison before changing.
    second_means = first_Xt.mean(axis=0)
    second_stds = first_Xt.std(axis=0, ddof=1)

    stat_pairs = ((first_means, second_means), (first_stds, second_stds))
    for reference, candidate in stat_pairs:
        for ref_value, cand_value in zip(reference, candidate):
            self.assertAlmostEqual(ref_value, cand_value)
def test_indices_wildcard(self):
    """Overwrite(value=0) added with '*' must zero the whole output.

    '*' is passed as the third ``add_step`` argument; the transformed
    fixture is expected to equal the fixture with every entry set to 0.
    """
    wildcard_pipeline = Pipeline()
    wildcard_pipeline.add_step('overwrite', Overwrite(value=0), '*')

    self.fit(wildcard_pipeline)
    transformed = self.transform(wildcard_pipeline)

    # The expectation is built by zeroing the fixture in place, so
    # self.X stays modified after this test body has run.
    self.X[:, :] = 0
    matches = np.allclose(self.X, transformed)
    self.assertTrue(matches)
def test_unit_step(self):
    """A plain BaseStep is registered and leaves the data unchanged.

    The step is added under the name 'unit' for indices [2, 4]; the
    pipeline must report it via ``has_step`` and the transformed fixture
    must be numerically equal to the fixture itself.
    """
    step_name = 'unit'
    unit_pipeline = Pipeline()
    unit_pipeline.add_step(step_name, BaseStep('unit'), [2, 4])

    registered = unit_pipeline.has_step(step_name)
    self.assertTrue(registered)

    self.fit(unit_pipeline)
    transformed = self.transform(unit_pipeline)
    unchanged = np.allclose(self.X, transformed)
    self.assertTrue(unchanged)
def test_Overwriter(self):
    """Overwrite(value=0) on indices [2, 4] zeroes exactly those columns.

    The expected array is the fixture with columns 2 and 4 set to 0.
    """
    target_columns = [2, 4]
    overwrite_pipeline = Pipeline()
    overwrite_pipeline.add_step(
        'overwrite', Overwrite(value=0), target_columns)

    self.fit(overwrite_pipeline)
    transformed = self.transform(overwrite_pipeline)

    # The fixture is modified in place to form the expected output.
    self.X[:, target_columns] = 0
    matches = np.allclose(self.X, transformed)
    self.assertTrue(matches)
def test_PCA(self):
    """Check the PCA step on a tiny, perfectly correlated sample.

    Verifies that the input columns are strongly correlated, that the
    fitted ``R`` attribute satisfies ``R @ R.T == I``, that labels and
    weights pass through ``transform`` unchanged, and that the covariance
    of the transformed data is ``[[5, 0], [0, 0]]``.
    """
    def abs_pearson(a, b):
        # Absolute value of the Pearson correlation coefficient of a and b.
        return abs(np.corrcoef(a, b)[0][1])

    pipeline = Pipeline()
    pipeline.add_step('pca', PCA(standardize=False))

    X = np.array([[1., 2.], [2., 4.], [3., 6.]])
    w = np.array([1., 1., 1.])
    y = np.zeros(3)
    self.assertGreater(abs_pearson(X[:, 0], X[:, 1]), .99)

    pipeline.fit(X, y, w)
    fitted_step = pipeline.get_step('pca')
    R = fitted_step.R
    gram = R.dot(R.T)
    self.assertTrue(np.allclose(gram, np.identity(R.shape[0])))

    Xt, yt, wt = pipeline.transform(X, y, w)
    self.assertTrue(np.allclose(y, yt))
    self.assertTrue(np.allclose(w, wt))

    cov = np.cov(Xt.T)
    expected_entries = (
        ((0, 0), 5.),
        ((1, 0), 0.),
        ((0, 1), 0.),
        ((1, 1), 0.),
    )
    for (row, col), expected in expected_entries:
        self.assertAlmostEqual(cov[row, col], expected)
def test_ordering(self):
    """Check BinaryWPCA output on two shifted, correlated Gaussian classes.

    Two classes of 10000 points each are drawn with the same covariance
    and opposite means. After fit/transform the covariance of the output
    must be diagonal, and the Wasserstein distance computed for column 0
    must be smaller than the one computed for column 1.

    No random seed is set inside this method.
    """
    def off_diagonal_vanishes(matrix):
        # True when everything outside the main diagonal is close to 0.
        residual = matrix - np.diag(np.diagonal(matrix))
        return np.allclose(residual, np.zeros(matrix.shape))

    pipeline = Pipeline()
    pipeline.add_step('wpca', BinaryWPCA())

    n_per_class = 10000
    class0_mean = [.5, -.5]
    class1_mean = [-.5, .5]
    cov = [[1., .9], [.9, 1.]]
    class0_X = np.random.multivariate_normal(class0_mean, cov, n_per_class)
    class1_X = np.random.multivariate_normal(class1_mean, cov, n_per_class)

    X = np.vstack((class0_X, class1_X))
    y = np.append(np.zeros(n_per_class), np.ones(n_per_class))
    w = np.ones(2 * n_per_class)

    pipeline.fit(X, y, w)
    Xt, _, _ = pipeline.transform(X, y, w)
    self.assertTrue(off_diagonal_vanishes(np.cov(Xt.T)))

    in_class0 = (y == 0)
    in_class1 = (y == 1)
    # NOTE(review): the first sample is taken from the UNtransformed X
    # while the second comes from Xt -- confirm whether Xt was intended
    # on both sides.
    distances = [
        wasserstein_distance(
            u_values=X[in_class0, column],
            v_values=Xt[in_class1, column],
            u_weights=w[in_class0],
            v_weights=w[in_class1])
        for column in (0, 1)
    ]
    self.assertLess(distances[0], distances[1])
def test_fit_decoupling(self):
    """Values learned in ``fit`` are applied to different data in ``transform``.

    A Center step on indices [2, 4] is fitted on data whose columns 2 and
    4 have means 2 and 4. Transforming an all-ones array must then give
    column means of 1 for columns 0, 1 and 3, and -1 / -3 for columns 2
    and 4.
    """
    centered_columns = [2, 4]
    pipeline = Pipeline()
    pipeline.add_step('center', Center(), centered_columns)

    X = np.random.rand(5, 10)
    ramp = np.arange(5).astype(float)
    X[:, centered_columns[0]] = ramp
    X[:, centered_columns[1]] = np.arange(5).astype(float) * 2

    # Sanity check of the fitting data before it reaches the pipeline.
    fit_means = np.mean(X[:, centered_columns], axis=0)
    for position, expected in enumerate((2., 4.)):
        self.assertAlmostEqual(fit_means[position], expected)

    y = np.arange(5)
    w = np.arange(5)
    pipeline.fit(X, y, w)

    Xt, _, _ = pipeline.transform(np.ones_like(X), y, w)
    transformed_means = Xt.mean(axis=0)
    # Only the first five of the ten columns are checked.
    for column, expected in enumerate((1., 1., -1., 1., -3.)):
        self.assertAlmostEqual(transformed_means[column], expected)
def test_skipping(self):
    """``first_step`` / ``last_step`` restrict which scalers are applied.

    The pipeline multiplies by 2, 3, (5, 7 in a nested pipeline), 11 and
    13. Each scenario names a start and/or end step (a list addresses a
    step inside the nested pipeline) and lists the factors that are
    expected to be applied to the fixture.
    """
    pipeline = Pipeline()
    (
        pipeline
        .add_step('scaler1', Scale(factor=2))
        .add_step('scaler2', Scale(factor=3))
        .add_step(
            'scaler3_4',
            Pipeline()
            .add_step('scaler3', Scale(5))
            .add_step('scaler4', Scale(7)))
        .add_step('scaler5', Scale(11))
        .add_step('scaler6', Scale(13))
    )
    self.fit(pipeline)

    scenarios = (
        ({}, (2, 3, 5, 7, 11, 13)),
        ({'first_step': 'scaler2'}, (3, 5, 7, 11, 13)),
        ({'first_step': 'scaler3_4'}, (5, 7, 11, 13)),
        ({'first_step': ['scaler3_4', 'scaler4']}, (7, 11, 13)),
        ({'last_step': 'scaler5'}, (2, 3, 5, 7, 11)),
        ({'last_step': 'scaler3_4'}, (2, 3, 5, 7)),
        ({'last_step': ['scaler3_4', 'scaler3']}, (2, 3, 5)),
        ({'first_step': 'scaler2',
          'last_step': ['scaler3_4', 'scaler3']}, (3, 5)),
    )
    for step_window, factors in scenarios:
        Xt = self.transform(pipeline, **step_window)
        # Multiply one factor at a time, in the order the steps run.
        expected = self.X
        for factor in factors:
            expected = expected * factor
        self.assertTrue(np.allclose(expected, Xt))
def test_standardize(self):
    """PCA(standardize=True) output has zero mean and unit sample std.

    Every column of the transformed data is checked for mean 0 and
    standard deviation 1 (``ddof=1``).
    """
    # NOTE(review): this file contains another method that is also named
    # test_standardize; if both are defined in the same class the later
    # definition replaces the earlier one -- confirm they live in
    # different classes.
    pipeline = Pipeline()
    pipeline.add_step('pca', PCA(standardize=True))

    X = np.array([[1., 2.], [2., 4.], [3., 6.], [4., 8.], [5., 10.]])
    y = np.array([0, 0, 0, 1, 2])
    w = np.array([1., 1., 1., 1., 1.])

    pipeline.fit(X, y, w)
    Xt, _, _ = pipeline.transform(X, y, w)

    column_means = Xt.mean(axis=0)
    column_stds = Xt.std(axis=0, ddof=1)
    for column_mean, column_std in zip(column_means, column_stds):
        self.assertAlmostEqual(column_mean, 0.)
        self.assertAlmostEqual(column_std, 1.)
def test_PCA_ignore(self):
    """Check PCA(ignore=[1, 2]) on rows whose label is neither 1 nor 2.

    The covariance of the transformed rows with labels other than 1 and 2
    must be ``[[5, 0], [0, 0]]``.
    """
    pipeline = Pipeline()
    # Presumably `ignore` lists labels left out of the fit -- confirm
    # against the PCA step's definition.
    pipeline.add_step('pca', PCA(ignore=[1, 2], standardize=False))

    X = np.array([[1., 2.], [2., 4.], [3., 6.], [4., 8.], [5., 10.]])
    y = np.array([0, 0, 0, 1, 2])
    w = np.array([1., 1., 1., 1., 1.])

    pipeline.fit(X, y, w)
    Xt, _, _ = pipeline.transform(X, y, w)

    kept_rows = ~((y == 1) | (y == 2))
    cov = np.cov(Xt[kept_rows].T)
    expected_entries = (
        ((0, 0), 5.),
        ((1, 0), 0.),
        ((0, 1), 0.),
        ((1, 1), 0.),
    )
    for (row, col), expected in expected_entries:
        self.assertAlmostEqual(cov[row, col], expected)
def test_compose(self):
    """Steps that target overlapping indices compose in order.

    Column 2 is overwritten with 1 and column 4 with 2, then both are
    scaled by 3. Stopping at 'overwrite2' must give 1 and 2; running the
    whole pipeline must give 3 and 6. Columns 0, 1 and 3 must match the
    fixture in both cases.
    """
    pipeline = Pipeline()
    (
        pipeline
        .add_step('overwrite1', Overwrite(value=1), [2, ])
        .add_step('overwrite2', Overwrite(value=2), [4, ])
        .add_step('scale', Scale(3), [2, 4])
    )
    self.fit(pipeline)

    untouched = [0, 1, 3]
    scenarios = (
        ({'last_step': 'overwrite2'}, 1, 2),
        ({}, 3, 6),
    )
    for step_window, column2_value, column4_value in scenarios:
        Xt = self.transform(pipeline, **step_window)
        self.assertTrue(np.all(Xt[:, 2] == column2_value))
        self.assertTrue(np.all(Xt[:, 4] == column4_value))
        self.assertTrue(np.allclose(self.X[:, untouched], Xt[:, untouched]))
def test_set_params(self):
    """Parameters set through ``set_step_params`` are used by the steps.

    Two default-constructed Scale steps get factors 2 and 3 afterwards;
    the transformed fixture must equal the fixture multiplied by 6.
    """
    pipeline = Pipeline()
    (
        pipeline
        .add_step('scaler1', Scale())
        .add_step('scaler2', Scale())
    )

    for step_name, factor in (('scaler1', 2), ('scaler2', 3)):
        pipeline.set_step_params(step_name, {'factor': factor})

    self.fit(pipeline)
    transformed = self.transform(pipeline)
    expected = self.X * 6
    self.assertTrue(np.allclose(expected, transformed))
def test_standardize(self):
    """Check the Standardizer's fitted mean/std and its transformed output.

    With weights [2, 3, 3, 4] the fitted means must be 2.75 and 5.5 and
    the fitted stds sqrt(14.25)/3 and sqrt(57)/3; the transformed data
    must equal the input shifted and scaled by those values.
    """
    # NOTE(review): this file contains another method that is also named
    # test_standardize; if both are defined in the same class the later
    # definition replaces the earlier one -- confirm they live in
    # different classes.
    pipeline = Pipeline()
    pipeline.add_step('std', Standardizer())

    X = np.array([[1., 2.], [2., 4.], [3., 6.], [4., 8.]])
    y = np.zeros(4)
    w = np.array([2., 3., 3., 4.])
    pipeline.fit(X, y, w)

    fitted = pipeline.get_step('std')
    self.assertAlmostEqual(fitted.mean[0], 2.75)
    self.assertAlmostEqual(fitted.mean[1], 5.5)
    self.assertAlmostEqual(fitted.std[0], np.sqrt(14.25) / 3.)
    self.assertAlmostEqual(fitted.std[1], np.sqrt(57.) / 3.)

    Xt, _, _ = pipeline.transform(X, y, w)

    # Expected output, written out column by column from literal values.
    expected_col0 = (np.array([1., 2., 3., 4.]) - 2.75) / (np.sqrt(14.25) / 3.)
    expected_col1 = (np.array([2., 4., 6., 8.]) - 5.5) / (np.sqrt(57.) / 3.)
    expected = np.array([expected_col0, expected_col1]).T
    self.assertTrue(np.allclose(expected, Xt))
# NOTE(review): this line starts inside a definition whose header is not
# visible here; the leading `self.mean = None` is the tail of that
# definition (presumably the initializer of the `TestStep` class that the
# script section below instantiates -- confirm).
#
# fit(X, y, w):       stores X.mean() in self.mean; y and w are not used.
# transform(X, y, w): replaces X by an array shaped like X with every entry
#                     equal to the stored mean; returns (X, y, w) with y and
#                     w passed through untouched.
# generate_data(m=10, n=5):
#                     returns (X, y, w) where X is an (m, n) array from
#                     np.random.rand, y holds m integers drawn with
#                     low=0, high=2, and w holds m values from
#                     np.random.rand. No seed is set here.
# __main__ section:   builds a Pipeline with two TestStep steps (indices
#                     [1, 3] and [2, 4]), fits it on one generated data set,
#                     transforms a second one and prints the transformed X.
self.mean = None def fit(self, X, y, w): self.mean = X.mean() def transform(self, X, y, w): X = np.ones_like(X) * self.mean return X, y, w def generate_data(m=10, n=5): X = np.random.rand(m, n) y = np.random.randint(low=0, high=2, size=m) w = np.random.rand(m) return X, y, w if __name__ == '__main__': pipeline = Pipeline() pipeline \ .add_step('step1', TestStep(), indices=[1, 3]) \ .add_step('step2', TestStep(), indices=[2, 4]) X1, y1, w1 = generate_data() pipeline.fit(X1, y1, w1) X2, y2, w2 = generate_data() Xt, yt, wt = pipeline.transform(X2, y2, w2) print(Xt)