def test_hstack(self):
        """Feed two 2-field structured arrays through HStack(2).

        The pipeline output is expected to equal a control array that
        carries fields f0..f3, with each row formed from the matching rows
        of the two inputs.
        """
        left_fields = [('f0', float), ('f1', float)]
        right_fields = [('f2', float), ('f3', float)]

        left = np.array([(0.0, 0.1), (1.0, 1.1), (2.0, 2.1)],
                        dtype=left_fields)
        right = np.array([(0.2, 0.3), (1.2, 1.3), (2.2, 2.3)],
                         dtype=right_fields)
        # Control array: same rows, fields of both inputs side by side.
        expected = np.array(
            [(0.0, 0.1, 0.2, 0.3),
             (1.0, 1.1, 1.2, 1.3),
             (2.0, 2.1, 2.2, 2.3)],
            dtype=left_fields + right_fields)

        pipeline = Pipeline()
        reader_left = pipeline.add(NumpyRead(left))
        reader_right = pipeline.add(NumpyRead(right))

        # Both readers are connected to the stacking node, left first.
        stacker = pipeline.add(HStack(2))
        stacker(reader_left, reader_right)

        writer = pipeline.add(NumpyWrite())
        writer(stacker)

        pipeline.run()

        actual = writer.get_stage().result
        self.assertTrue(np.array_equal(expected, actual))
Exemple #2
0
    def test_hstack(self):
        """Verify that HStack(2) output matches the 4-field control array.

        Two structured arrays (fields f0/f1 and f2/f3) are read into the
        pipeline, connected to an HStack node, and the written result is
        compared against the control array with np.array_equal.
        """
        rows_a = [(0.0, 0.1), (1.0, 1.1), (2.0, 2.1)]
        rows_b = [(0.2, 0.3), (1.2, 1.3), (2.2, 2.3)]
        rows_ctrl = [
            (0.0, 0.1, 0.2, 0.3),
            (1.0, 1.1, 1.2, 1.3),
            (2.0, 2.1, 2.2, 2.3),
        ]

        first = np.array(rows_a, dtype=[('f0', float), ('f1', float)])
        second = np.array(rows_b, dtype=[('f2', float), ('f3', float)])
        control = np.array(
            rows_ctrl,
            dtype=[('f0', float), ('f1', float), ('f2', float),
                   ('f3', float)])

        pipe = Pipeline()

        # Source nodes are added in the same order they are stacked.
        src_first = pipe.add(NumpyRead(first))
        src_second = pipe.add(NumpyRead(second))

        stack_node = pipe.add(HStack(2))
        stack_node(src_first, src_second)

        sink = pipe.add(NumpyWrite())
        sink(stack_node)

        pipe.run()

        stacked = sink.get_stage().result
        self.assertTrue(np.array_equal(control, stacked))
Exemple #3
0
    def test_feature_importance(self):
        """Check the feature ranking reported by a wrapped random forest.

        In the training data, column 2 equals y row for row, column 1 is
        all zeros, and column 0 agrees with y in half of the rows.  The
        'col_name' field of the estimator's 'feature_importances' output
        is expected to read f2, f0, f1.
        """
        features = np.array([
            [1, 0, 1],
            [1, 0, 0],
            [1, 0, 1],
            [0, 0, 1],
            [0, 0, 0],
            [0, 0, 1],
            [1, 0, 1],
            [0, 0, 1],
        ])
        labels = np.array([1, 0, 1, 1, 0, 1, 1, 1])
        expected_order = np.array(['f2', 'f0', 'f1'])

        pipeline = Pipeline()

        features_node = pipeline.add(NumpyRead(features))
        labels_node = pipeline.add(NumpyRead(labels))

        # random_state is pinned so the forest is built the same way each run.
        forest = wrap_and_make_instance(
            'sklearn.ensemble.RandomForestClassifier', random_state=0)
        forest_node = pipeline.add(forest)
        forest_node(X_train=features_node, y_train=labels_node)

        writer = pipeline.add(NumpyWrite())
        writer(forest_node['feature_importances'])

        pipeline.run()

        reported_order = writer.get_stage().result['col_name']
        self.assertTrue(np.array_equal(expected_order, reported_order))
    def test_generate_feature(self):
        """Exercise GenerateFeature with one-column and two-column output.

        Case 1: the function adds columns f1 and f3 plus 10; the result is
        expected in a single field named 'f0'.

        Case 2: the function returns two values per row (f1 * 10 and
        f1 + 10) and explicit output column names are supplied; the result
        is expected in fields 'times10' and 'add10'.
        """
        in_array = np.array(
                [(0.0, 0.1, 0.2, 0.3), (1.0, 1.1, 1.2, 1.3),
                 (2.0, 2.1, 2.2, 2.3)],
                dtype=[('f0', float), ('f1', float), ('f2', float),
                       ('f3', float)])

        def run_generate_feature(*gen_feat_args):
            # Build a fresh read -> GenerateFeature -> write pipeline, run
            # it, and hand back whatever the writer stage captured.
            p = Pipeline()

            np_in = p.add(NumpyRead(in_array))

            gen_feat = p.add(GenerateFeature(*gen_feat_args))
            gen_feat(np_in)

            out = p.add(NumpyWrite())
            out(gen_feat)

            p.run()

            return out.get_stage().result

        # Case 1: single derived column, default output naming.
        ctrl = np.array(
                [(10.4,), (12.4,), (14.4,)],
                dtype=[('f0', float)])
        cols = ['f1', 'f3']
        f = lambda tab: tab['f1'] + tab['f3'] + 10

        self.assertTrue(np.array_equal(ctrl, run_generate_feature(f, cols)))

        # Case 2: two derived columns with caller-chosen names.
        ctrl = np.array(
                [(1, 10.1), (11, 11.1), (21, 12.1)],
                dtype=[('times10', float), ('add10', float)])
        cols = ['f1']
        # zip() is lazy on Python 3, and np.array() of a bare iterator
        # yields a 0-d object array; materialise it so the function returns
        # a 2-D array on both Python 2 and Python 3.
        f = lambda tab: np.array(list(zip(tab['f1'] * 10, tab['f1'] + 10)))
        out_col_names = ['times10', 'add10']

        self.assertTrue(
            np.array_equal(ctrl,
                           run_generate_feature(f, cols, out_col_names)))
Exemple #5
0
    def test_generate_feature(self):
        """Check GenerateFeature for single-column and multi-column results.

        First pipeline: f1 + f3 + 10 is computed from columns f1 and f3 and
        the output is compared with a one-field ('f0') control array.

        Second pipeline: (f1 * 10, f1 + 10) is computed from column f1 with
        output names 'times10' and 'add10', and compared with a two-field
        control array.
        """
        in_array = np.array([(0.0, 0.1, 0.2, 0.3), (1.0, 1.1, 1.2, 1.3),
                             (2.0, 2.1, 2.2, 2.3)],
                            dtype=[('f0', float), ('f1', float), ('f2', float),
                                   ('f3', float)])

        # --- one output column -------------------------------------------
        ctrl = np.array([(10.4, ), (12.4, ), (14.4, )], dtype=[('f0', float)])
        cols = ['f1', 'f3']
        f = lambda tab: tab['f1'] + tab['f3'] + 10

        p = Pipeline()

        np_in = p.add(NumpyRead(in_array))

        gen_feat = p.add(GenerateFeature(f, cols))
        gen_feat(np_in)

        out = p.add(NumpyWrite())
        out(gen_feat)

        p.run()

        self.assertTrue(np.array_equal(ctrl, out.get_stage().result))

        # --- two output columns with explicit names ----------------------
        ctrl = np.array([(1, 10.1), (11, 11.1), (21, 12.1)],
                        dtype=[('times10', float), ('add10', float)])
        cols = ['f1']
        # On Python 3 zip() returns an iterator, which np.array() would wrap
        # as a 0-d object array. list() forces the row tuples so the result
        # is a 2-D array under both Python 2 and Python 3.
        f = lambda tab: np.array(list(zip(tab['f1'] * 10, tab['f1'] + 10)))
        out_col_names = ['times10', 'add10']

        # A new pipeline is built so the two cases do not share nodes.
        p = Pipeline()

        np_in = p.add(NumpyRead(in_array))

        gen_feat = p.add(GenerateFeature(f, cols, out_col_names))
        gen_feat(np_in)

        out = p.add(NumpyWrite())
        out(gen_feat)

        p.run()

        self.assertTrue(np.array_equal(ctrl, out.get_stage().result))
Exemple #6
0
    def test_feature_importance(self):
        """Train a wrapped RandomForestClassifier and check its ranking.

        The third column of X is a copy of y, the second column is constant
        zero, and the first column matches y in four of the eight rows.  The
        test expects the "col_name" field of the "feature_importances"
        output to be f2, f0, f1 in that order.
        """
        rows = [
            [1, 0, 1], [1, 0, 0], [1, 0, 1], [0, 0, 1],
            [0, 0, 0], [0, 0, 1], [1, 0, 1], [0, 0, 1],
        ]
        targets = [1, 0, 1, 1, 0, 1, 1, 1]
        X = np.array(rows)
        y = np.array(targets)

        pipe = Pipeline()

        node_X = pipe.add(NumpyRead(X))
        node_y = pipe.add(NumpyRead(y))

        # Fixed seed: random_state=0 is passed straight to the estimator.
        classifier_stage = wrap_and_make_instance(
            "sklearn.ensemble.RandomForestClassifier",
            random_state=0,
        )
        classifier = pipe.add(classifier_stage)
        classifier(X_train=node_X, y_train=node_y)

        sink = pipe.add(NumpyWrite())
        sink(classifier["feature_importances"])

        pipe.run()

        importances = sink.get_stage().result
        ranked_names = importances["col_name"]
        wanted = np.array(["f2", "f0", "f1"])
        self.assertTrue(np.array_equal(wanted, ranked_names))
Exemple #7
0
    def test_wrap_cross_validation(self):
        """Check the train/test splits emitted by a wrapped ByWindow node.

        X and y are partitioned using the "year" column of X.  For every
        split i the node's "train0_i"/"train1_i"/"test0_i"/"test1_i"
        outputs are captured and compared with X and y indexed by the
        expected row indices in ``ctrl_inds``.
        """
        X = np.array(
            [
                (0, 2001, 12.31),
                (1, 1999, 14.32),
                (2, 1999, 120.76),
                (3, 2002, 32.12),
                (4, 2004, 98.64),
                (5, 2005, 32.21),
                (6, 2002, 100.23),
                (7, 2006, 123.40),
                (8, 2000, 72.21),
            ],
            dtype=[("id", int), ("year", int), ("fine", float)],
        )
        y = np.array([(0,), (1,), (0,), (1,), (0,), (1,), (0,), (1,), (0,)], dtype=[("category", int)])
        # One (train row indices, test row indices) pair per expected split.
        ctrl_inds = [([1, 2, 8], [0, 3, 6]), ([0, 3, 6], [4]), ([4], [5, 7])]
        p = Pipeline()

        node_X_in = p.add(NumpyRead(X))

        node_y_in = p.add(NumpyRead(y))

        # Only the "year" column is handed to the partitioner as its y input.
        node_just_time = p.add(SplitColumns(["year"]))
        node_just_time(node_X_in)

        training_windows = by_window_ranges(1999, 2000, 2004, 2)
        testing_windows = by_window_ranges(2001, 2002, 2006, 2)
        node_cv = p.add(
            wrap_and_make_instance(
                "upsg.transform.partition_iterators.ByWindow",
                n_arrays=2,
                training_windows=training_windows,
                testing_windows=testing_windows,
                mode=ByWindowMode.SLIDING,
            )
        )
        node_cv(input0=node_X_in, input1=node_y_in, y=node_just_time)

        # Two arrays (X, y) x two roles (train, test) per split.
        self.assertEqual(len(node_cv.output_keys), 2 * 2 * len(ctrl_inds))

        # Output-key templates, in the order the writers are attached:
        # train X, train y, test X, test y.
        key_templates = ("train0_{}", "train1_{}", "test0_{}", "test1_{}")
        out_nodes = []
        # range() rather than xrange() so the test also runs on Python 3.
        for i in range(len(ctrl_inds)):
            writers = []
            for template in key_templates:
                writer = p.add(NumpyWrite())
                writer(node_cv[template.format(i)])
                writers.append(writer)
            out_nodes.append(tuple(writers))
        p.run()

        for (train_inds, test_inds), writers in zip(ctrl_inds, out_nodes):
            # Expected arrays, in the same order as key_templates.
            expected = (X[train_inds], y[train_inds], X[test_inds], y[test_inds])
            for writer, ctrl in zip(writers, expected):
                self.assertTrue(np.array_equal(writer.get_stage().result, ctrl))
Exemple #8
0
    def test_wrap_cross_validation(self):
        """Verify the partitions produced by a wrapped ByWindow iterator.

        The node receives X as input0, y as input1 and the 'year' column of
        X as its y input.  Each split's train/test outputs for both arrays
        are written out and compared with X and y indexed by the expected
        row indices listed in ``ctrl_inds``.
        """
        X = np.array([(0, 2001, 12.31), (1, 1999, 14.32), (2, 1999, 120.76),
                      (3, 2002, 32.12), (4, 2004, 98.64), (5, 2005, 32.21),
                      (6, 2002, 100.23), (7, 2006, 123.40), (8, 2000, 72.21)],
                     dtype=[('id', int), ('year', int), ('fine', float)])
        y = np.array([(0, ), (1, ), (0, ), (1, ), (0, ), (1, ), (0, ), (1, ),
                      (0, )],
                     dtype=[('category', int)])
        # Expected (train rows, test rows) for each split, in split order.
        ctrl_inds = [([1, 2, 8], [0, 3, 6]), ([0, 3, 6], [4]), ([4], [5, 7])]
        p = Pipeline()

        node_X_in = p.add(NumpyRead(X))

        node_y_in = p.add(NumpyRead(y))

        node_just_time = p.add(SplitColumns(['year']))
        node_just_time(node_X_in)

        training_windows = by_window_ranges(1999, 2000, 2004, 2)
        testing_windows = by_window_ranges(2001, 2002, 2006, 2)
        node_cv = p.add(
            wrap_and_make_instance(
                'upsg.transform.partition_iterators.ByWindow',
                n_arrays=2,
                training_windows=training_windows,
                testing_windows=testing_windows,
                mode=ByWindowMode.SLIDING))
        node_cv(input0=node_X_in, input1=node_y_in, y=node_just_time)

        # 2 arrays x (train, test) outputs for every expected split.
        self.assertEqual(len(node_cv.output_keys), 2 * 2 * len(ctrl_inds))
        out_nodes = []
        # range() works on Python 2 and 3; xrange() exists only on Python 2.
        for i in range(len(ctrl_inds)):
            train_node_X = p.add(NumpyWrite())
            train_node_X(node_cv['train0_{}'.format(i)])

            train_node_y = p.add(NumpyWrite())
            train_node_y(node_cv['train1_{}'.format(i)])

            test_node_X = p.add(NumpyWrite())
            test_node_X(node_cv['test0_{}'.format(i)])

            test_node_y = p.add(NumpyWrite())
            test_node_y(node_cv['test1_{}'.format(i)])

            out_nodes.append(
                (train_node_X, train_node_y, test_node_X, test_node_y))
        p.run()

        # Pair each split's expected indices with its four writer nodes.
        for (train_inds, test_inds), nodes in zip(ctrl_inds, out_nodes):
            train_node_X, train_node_y, test_node_X, test_node_y = nodes
            self.assertTrue(
                np.array_equal(train_node_X.get_stage().result,
                               X[train_inds]))
            self.assertTrue(
                np.array_equal(train_node_y.get_stage().result,
                               y[train_inds]))
            self.assertTrue(
                np.array_equal(test_node_X.get_stage().result,
                               X[test_inds]))
            self.assertTrue(
                np.array_equal(test_node_y.get_stage().result,
                               y[test_inds]))