def test_hstack(self):
    """HStack should place the fields of two inputs side by side.

    Two three-row structured arrays (fields f0/f1 and f2/f3) are read
    into a pipeline, passed through ``HStack(2)``, and the written
    result is compared against one array holding fields f0..f3.
    """
    left_fields = [('f0', float), ('f1', float)]
    right_fields = [('f2', float), ('f3', float)]
    left = np.array(
        [(0.0, 0.1), (1.0, 1.1), (2.0, 2.1)],
        dtype=left_fields)
    right = np.array(
        [(0.2, 0.3), (1.2, 1.3), (2.2, 2.3)],
        dtype=right_fields)
    # Expected output: same rows, with the right-hand fields appended.
    expected = np.array(
        [(0.0, 0.1, 0.2, 0.3),
         (1.0, 1.1, 1.2, 1.3),
         (2.0, 2.1, 2.2, 2.3)],
        dtype=left_fields + right_fields)

    pipeline = Pipeline()
    left_reader = pipeline.add(NumpyRead(left))
    right_reader = pipeline.add(NumpyRead(right))

    stacker = pipeline.add(HStack(2))
    stacker(left_reader, right_reader)

    writer = pipeline.add(NumpyWrite())
    writer(stacker)

    pipeline.run()

    actual = writer.get_stage().result
    self.assertTrue(np.array_equal(expected, actual))
def test_hstack(self):
    """Run two NumpyRead nodes through HStack(2) and check the result.

    The inputs carry fields (f0, f1) and (f2, f3); the control array
    carries all four fields with the same three rows.
    """
    rows_a = [(0.0, 0.1), (1.0, 1.1), (2.0, 2.1)]
    rows_b = [(0.2, 0.3), (1.2, 1.3), (2.2, 2.3)]
    rows_ctrl = [
        (0.0, 0.1, 0.2, 0.3),
        (1.0, 1.1, 1.2, 1.3),
        (2.0, 2.1, 2.2, 2.3),
    ]
    table_a = np.array(rows_a, dtype=[('f0', float), ('f1', float)])
    table_b = np.array(rows_b, dtype=[('f2', float), ('f3', float)])
    table_ctrl = np.array(
        rows_ctrl,
        dtype=[('f0', float), ('f1', float), ('f2', float), ('f3', float)])

    pipe = Pipeline()
    node_a = pipe.add(NumpyRead(table_a))
    node_b = pipe.add(NumpyRead(table_b))
    node_stack = pipe.add(HStack(2))
    node_stack(node_a, node_b)
    node_out = pipe.add(NumpyWrite())
    node_out(node_stack)
    pipe.run()

    matches = np.array_equal(table_ctrl, node_out.get_stage().result)
    self.assertTrue(matches)
def test_feature_importance(self):
    """Check the 'feature_importances' output of a wrapped estimator.

    A RandomForestClassifier (random_state=0) is wrapped as a pipeline
    stage and trained on a small binary table. The 'col_name' column
    of its 'feature_importances' output must equal ['f2', 'f0', 'f1'].
    """
    # Per-column predictability, left to right: 50% 20% 100%
    features = np.array([
        [1, 0, 1],
        [1, 0, 0],
        [1, 0, 1],
        [0, 0, 1],
        [0, 0, 0],
        [0, 0, 1],
        [1, 0, 1],
        [0, 0, 1],
    ])
    labels = np.array([1, 0, 1, 1, 0, 1, 1, 1])

    pipeline = Pipeline()
    features_in = pipeline.add(NumpyRead(features))
    labels_in = pipeline.add(NumpyRead(labels))

    forest = pipeline.add(
        wrap_and_make_instance(
            'sklearn.ensemble.RandomForestClassifier',
            random_state=0))
    forest(X_train=features_in, y_train=labels_in)

    writer = pipeline.add(NumpyWrite())
    writer(forest['feature_importances'])

    pipeline.run()

    # NOTE(review): presumably ordered most- to least-important; confirm.
    expected_names = np.array(['f2', 'f0', 'f1'])
    reported_names = writer.get_stage().result['col_name']
    self.assertTrue(np.array_equal(expected_names, reported_names))
def test_generate_feature(self):
    """Check GenerateFeature with one-column and two-column outputs.

    Case 1: a function returning a single column built from f1 and f3,
    with no output column names given; the control array names the
    resulting field 'f0'.

    Case 2: a function returning two columns built from f1, with
    explicit output column names 'times10' and 'add10'.
    """
    in_array = np.array(
        [(0.0, 0.1, 0.2, 0.3),
         (1.0, 1.1, 1.2, 1.3),
         (2.0, 2.1, 2.2, 2.3)],
        dtype=[('f0', float), ('f1', float), ('f2', float), ('f3', float)])

    def run_generate_feature(*stage_args):
        # Build a fresh pipeline NumpyRead -> GenerateFeature -> NumpyWrite
        # and return the array captured by the write stage.
        p = Pipeline()
        np_in = p.add(NumpyRead(in_array))
        gen_feat = p.add(GenerateFeature(*stage_args))
        gen_feat(np_in)
        out = p.add(NumpyWrite())
        out(gen_feat)
        p.run()
        return out.get_stage().result

    # Case 1: single generated column.
    def sum_plus_10(tab):
        return tab['f1'] + tab['f3'] + 10

    ctrl = np.array(
        [(10.4,), (12.4,), (14.4,)],
        dtype=[('f0', float)])
    result = run_generate_feature(sum_plus_10, ['f1', 'f3'])
    self.assertTrue(np.array_equal(ctrl, result))

    # Case 2: two generated columns with explicit names.
    def times10_and_add10(tab):
        # list() is required: on Python 3 zip() returns an iterator and
        # np.array() would wrap it in a 0-d object array instead of
        # building an (n, 2) array.
        return np.array(list(zip(tab['f1'] * 10, tab['f1'] + 10)))

    ctrl = np.array(
        [(1, 10.1), (11, 11.1), (21, 12.1)],
        dtype=[('times10', float), ('add10', float)])
    result = run_generate_feature(
        times10_and_add10, ['f1'], ['times10', 'add10'])
    self.assertTrue(np.array_equal(ctrl, result))
def test_generate_feature(self):
    """Exercise GenerateFeature for single- and multi-column functions.

    First a function of columns f1 and f3 producing one column is run
    and compared with a one-field control array. Then a function of f1
    producing two columns is run with output names 'times10' and
    'add10' and compared with a two-field control array.
    """
    in_array = np.array(
        [(0.0, 0.1, 0.2, 0.3),
         (1.0, 1.1, 1.2, 1.3),
         (2.0, 2.1, 2.2, 2.3)],
        dtype=[('f0', float), ('f1', float), ('f2', float), ('f3', float)])

    # --- one output column, default output name ---
    ctrl = np.array([(10.4, ), (12.4, ), (14.4, )], dtype=[('f0', float)])
    cols = ['f1', 'f3']
    f = lambda tab: tab['f1'] + tab['f3'] + 10

    p = Pipeline()
    np_in = p.add(NumpyRead(in_array))
    gen_feat = p.add(GenerateFeature(f, cols))
    gen_feat(np_in)
    out = p.add(NumpyWrite())
    out(gen_feat)
    p.run()

    self.assertTrue(np.array_equal(ctrl, out.get_stage().result))

    # --- two output columns, explicit output names ---
    ctrl = np.array(
        [(1, 10.1), (11, 11.1), (21, 12.1)],
        dtype=[('times10', float), ('add10', float)])
    cols = ['f1']
    # Materialize the zip: on Python 3 zip() is lazy and np.array(zip(...))
    # would yield a 0-d object array rather than an (n, 2) float array.
    f = lambda tab: np.array(list(zip(tab['f1'] * 10, tab['f1'] + 10)))
    out_col_names = ['times10', 'add10']

    p = Pipeline()
    np_in = p.add(NumpyRead(in_array))
    gen_feat = p.add(GenerateFeature(f, cols, out_col_names))
    gen_feat(np_in)
    out = p.add(NumpyWrite())
    out(gen_feat)
    p.run()

    self.assertTrue(np.array_equal(ctrl, out.get_stage().result))
def test_feature_importance(self):
    """Train a wrapped RandomForestClassifier and check its column order.

    The estimator's "feature_importances" output is written out and its
    "col_name" column is compared with ["f2", "f0", "f1"].
    """
    # 50% 20% 100% predictability
    rows = [
        [1, 0, 1],
        [1, 0, 0],
        [1, 0, 1],
        [0, 0, 1],
        [0, 0, 0],
        [0, 0, 1],
        [1, 0, 1],
        [0, 0, 1],
    ]
    X = np.array(rows)
    y = np.array([1, 0, 1, 1, 0, 1, 1, 1])

    pipe = Pipeline()
    node_X = pipe.add(NumpyRead(X))
    node_y = pipe.add(NumpyRead(y))
    classifier_stage = wrap_and_make_instance(
        "sklearn.ensemble.RandomForestClassifier", random_state=0)
    node_est = pipe.add(classifier_stage)
    node_est(X_train=node_X, y_train=node_y)
    node_out = pipe.add(NumpyWrite())
    node_out(node_est["feature_importances"])
    pipe.run()

    col_names = node_out.get_stage().result["col_name"]
    expected = np.array(["f2", "f0", "f1"])
    self.assertTrue(np.array_equal(expected, col_names))
def test_wrap_cross_validation(self):
    """Check train/test splits produced by a wrapped ByWindow iterator.

    X (fields id, year, fine) and y (field category) are partitioned
    by the "year" column of X. For every fold the four outputs
    (train/test for each of the two arrays) are written out and
    compared with the rows of X and y selected by ``ctrl_inds``.
    """
    X = np.array(
        [
            (0, 2001, 12.31),
            (1, 1999, 14.32),
            (2, 1999, 120.76),
            (3, 2002, 32.12),
            (4, 2004, 98.64),
            (5, 2005, 32.21),
            (6, 2002, 100.23),
            (7, 2006, 123.40),
            (8, 2000, 72.21),
        ],
        dtype=[("id", int), ("year", int), ("fine", float)],
    )
    y = np.array(
        [(0,), (1,), (0,), (1,), (0,), (1,), (0,), (1,), (0,)],
        dtype=[("category", int)],
    )
    # Expected (train row indices, test row indices), one pair per fold.
    # NOTE(review): these match two-year windows sliding by two years
    # (train 1999-2000 / test 2001-2002, and so on); presumably that is
    # what the by_window_ranges calls below describe. Confirm.
    ctrl_inds = [([1, 2, 8], [0, 3, 6]), ([0, 3, 6], [4]), ([4], [5, 7])]

    p = Pipeline()
    node_X_in = p.add(NumpyRead(X))
    node_y_in = p.add(NumpyRead(y))
    node_just_time = p.add(SplitColumns(["year"]))
    node_just_time(node_X_in)

    training_windows = by_window_ranges(1999, 2000, 2004, 2)
    testing_windows = by_window_ranges(2001, 2002, 2006, 2)
    mode = ByWindowMode.SLIDING
    node_cv = p.add(
        wrap_and_make_instance(
            "upsg.transform.partition_iterators.ByWindow",
            n_arrays=2,
            training_windows=training_windows,
            testing_windows=testing_windows,
            mode=mode,
        )
    )
    node_cv(input0=node_X_in, input1=node_y_in, y=node_just_time)

    # 2 arrays x (train, test) x number of folds
    self.assertEqual(len(node_cv.output_keys), 2 * 2 * len(ctrl_inds))

    out_nodes = []
    for i in range(len(ctrl_inds)):
        train_node_X = p.add(NumpyWrite())
        train_node_X(node_cv["train0_{}".format(i)])
        train_node_y = p.add(NumpyWrite())
        train_node_y(node_cv["train1_{}".format(i)])
        test_node_X = p.add(NumpyWrite())
        test_node_X(node_cv["test0_{}".format(i)])
        test_node_y = p.add(NumpyWrite())
        test_node_y(node_cv["test1_{}".format(i)])
        out_nodes.append(
            (train_node_X, train_node_y, test_node_X, test_node_y))

    p.run()

    for fold_nodes, (train_inds, test_inds) in zip(out_nodes, ctrl_inds):
        train_node_X, train_node_y, test_node_X, test_node_y = fold_nodes
        self.assertTrue(
            np.array_equal(train_node_X.get_stage().result, X[train_inds]))
        self.assertTrue(
            np.array_equal(train_node_y.get_stage().result, y[train_inds]))
        self.assertTrue(
            np.array_equal(test_node_X.get_stage().result, X[test_inds]))
        self.assertTrue(
            np.array_equal(test_node_y.get_stage().result, y[test_inds]))
def test_wrap_cross_validation(self):
    """Verify per-fold outputs of ByWindow wrapped as a pipeline stage.

    The stage receives X as input0, y as input1 and the 'year' column
    of X as ``y``. Each fold's train0/train1/test0/test1 outputs are
    captured with NumpyWrite and compared with X and y indexed by the
    expected row indices in ``ctrl_inds``.
    """
    X = np.array(
        [(0, 2001, 12.31),
         (1, 1999, 14.32),
         (2, 1999, 120.76),
         (3, 2002, 32.12),
         (4, 2004, 98.64),
         (5, 2005, 32.21),
         (6, 2002, 100.23),
         (7, 2006, 123.40),
         (8, 2000, 72.21)],
        dtype=[('id', int), ('year', int), ('fine', float)])
    y = np.array(
        [(0, ), (1, ), (0, ), (1, ), (0, ), (1, ), (0, ), (1, ), (0, )],
        dtype=[('category', int)])
    # One (train indices, test indices) pair per expected fold.
    ctrl_inds = [([1, 2, 8], [0, 3, 6]), ([0, 3, 6], [4]), ([4], [5, 7])]
    n_folds = len(ctrl_inds)

    p = Pipeline()
    node_X_in = p.add(NumpyRead(X))
    node_y_in = p.add(NumpyRead(y))
    node_just_time = p.add(SplitColumns(['year']))
    node_just_time(node_X_in)

    training_windows = by_window_ranges(1999, 2000, 2004, 2)
    testing_windows = by_window_ranges(2001, 2002, 2006, 2)
    mode = ByWindowMode.SLIDING
    node_cv = p.add(
        wrap_and_make_instance(
            'upsg.transform.partition_iterators.ByWindow',
            n_arrays=2,
            training_windows=training_windows,
            testing_windows=testing_windows,
            mode=mode))
    node_cv(input0=node_X_in, input1=node_y_in, y=node_just_time)

    # Two arrays, each with a train and a test output, for every fold.
    self.assertEqual(len(node_cv.output_keys), 2 * 2 * n_folds)

    out_nodes = []
    for i in range(n_folds):
        train_node_X = p.add(NumpyWrite())
        train_node_X(node_cv['train0_{}'.format(i)])
        train_node_y = p.add(NumpyWrite())
        train_node_y(node_cv['train1_{}'.format(i)])
        test_node_X = p.add(NumpyWrite())
        test_node_X(node_cv['test0_{}'.format(i)])
        test_node_y = p.add(NumpyWrite())
        test_node_y(node_cv['test1_{}'.format(i)])
        out_nodes.append(
            (train_node_X, train_node_y, test_node_X, test_node_y))

    p.run()

    for nodes, (train_inds, test_inds) in zip(out_nodes, ctrl_inds):
        train_node_X, train_node_y, test_node_X, test_node_y = nodes
        # Each written node is paired with the source array and the
        # row indices it is expected to contain.
        checks = [
            (train_node_X, X, train_inds),
            (train_node_y, y, train_inds),
            (test_node_X, X, test_inds),
            (test_node_y, y, test_inds),
        ]
        for node, source, inds in checks:
            self.assertTrue(
                np.array_equal(node.get_stage().result, source[inds]))