Code Example #1
File: test_model.py  Project: Najah-lshanableh/UPSG
    def test_cross_validation_score(self):
        rows = 100
        folds = 10

        X = np.random.random((rows, 10))
        y = np.random.randint(0, 2, (rows))
        
        p = Pipeline()

        np_in_X = p.add(NumpyRead(X))
        np_in_y = p.add(NumpyRead(y))

        cv_score = p.add(CrossValidationScore(wrap(SVC), 'score', {}, folds,
                                              random_state=0))               
        np_in_X['output'] > cv_score['X_train']
        np_in_y['output'] > cv_score['y_train']

        score_out = p.add(CSVWrite(self._tmp_files('out.csv')))
        cv_score['score'] > score_out['input']

        self.run_pipeline(p)

        result = self._tmp_files.csv_read('out.csv')['f0']

        ctrl_kf = SKKFold(rows, folds, random_state=0)
        ctrl = np.mean(cross_val_score(SVC(), X, y, cv=ctrl_kf))

        self.assertTrue(np.allclose(ctrl, result))
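
Note: the excerpt above omits the test module's imports. For reference, the scikit-learn control computation it checks the pipeline output against can be reproduced on its own roughly as follows. This targets the old sklearn.cross_validation API that these tests were written for, and the import of KFold as SKKFold is an assumption based on how the name is used in the excerpt.

import numpy as np
from sklearn.svm import SVC
from sklearn.cross_validation import KFold as SKKFold
from sklearn.cross_validation import cross_val_score

rows, folds = 100, 10
X = np.random.random((rows, 10))
y = np.random.randint(0, 2, rows)

# 10-fold cross-validation score that the pipeline's CSV output is compared to
ctrl_kf = SKKFold(rows, folds, random_state=0)
ctrl = np.mean(cross_val_score(SVC(), X, y, cv=ctrl_kf))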
Code Example #2
File: test_model.py  Project: dssg/UPSG
    def test_grid_search(self):
        """

        Simulates behavior of example in:
        http://scikit-learn.org/stable/modules/generated/sklearn.grid_search.GridSearchCV.html#sklearn.grid_search.GridSearchCV

        """
        folds = 2

        parameters = {
            'kernel': ('rbf', 'linear'),
            'C': [1, 10, 100],
            'random_state': [0]}
        iris = datasets.load_iris()
        iris_data = iris.data
        iris_target = iris.target

        p = Pipeline()

        node_data = p.add(NumpyRead(iris_data))
        node_target = p.add(NumpyRead(iris_target))
        node_split = p.add(SplitTrainTest(2, random_state=1))
        node_search = p.add(GridSearch(
            wrap(SVC), 
            parameters, 
            'score', 
            cv_stage_kwargs={'n_folds': folds}))
        node_params_out = p.add(CSVWrite(self._tmp_files.get('out.csv')))

        node_data['output'] > node_split['input0']
        node_target['output'] > node_split['input1']
        node_split['train0'] > node_search['X_train']
        node_split['train1'] > node_search['y_train']
        node_split['test0'] > node_search['X_test']
        node_split['test1'] > node_search['y_test']
        node_search['params_out'] > node_params_out['input']

        self.run_pipeline(p)

        result = self._tmp_files.csv_read('out.csv')

        ctrl_X_train, _, ctrl_y_train, _ = train_test_split(
            iris_data, iris_target, random_state=1)
        ctrl_cv = SKKFold(ctrl_y_train.size, folds)
        ctrl_search = grid_search.GridSearchCV(SVC(), parameters, cv=ctrl_cv)
        ctrl_search.fit(ctrl_X_train, ctrl_y_train)
        control = ctrl_search.best_params_

        # TODO a number of configurations tie here, and sklearn picks a different
        # best configuration than upsg does (although they have the same score)
        # ideally, we want to find some parameters where there is a clear 
        # winner
        control = {'C': 10, 'kernel': 'linear', 'random_state': 0}

        self.assertEqual(np_sa_to_dict(np.array([result])), control)
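
The np_sa_to_dict call above converts the single CSV row, read back as a NumPy structured array, into a plain dict so it can be compared with GridSearchCV.best_params_. A rough, hypothetical illustration of that kind of conversion (not the project's implementation):

import numpy as np

row = np.array([(10, 'linear', 0)],
               dtype=[('C', int), ('kernel', 'U10'), ('random_state', int)])
as_dict = {name: row[name][0] for name in row.dtype.names}
# as_dict compares equal to {'C': 10, 'kernel': 'linear', 'random_state': 0}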
Code Example #3
File: test_stage.py  Project: dssg/UPSG
    def test_pickle(self):
        # TODO this just makes sure the object can be pickled. It doesn't
        # verify that the unpickled object is correct
        uo = UObject(UObjectPhase.Write)
        np_array = np.array([[0]])
        uo.from_np(np_array)
        self.__pickle('upsg.export.csv.CSVWrite', path_of_data('_out.csv'))
        self.__pickle('upsg.fetch.csv.CSVRead', path_of_data('mixed_csv.csv'))
        self.__pickle('upsg.fetch.np.NumpyRead', np.array([[0]]))
        self.__pickle('upsg.transform.split.SplitTrainTest')
        self.__pickle('upsg.transform.split.SplitY', 0)
        self.__pickle('upsg.transform.rename_cols.RenameCols',
                      {'name': 'rename'})
        self.__pickle(wrap('sklearn.preprocessing.Imputer'), strategy='mean',
                      missing_values='NaN')
        self.__pickle(wrap('sklearn.svm.SVC'), gamma=0.1)
        self.__pickle(wrap('sklearn.metrics.roc_curve'))
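
The __pickle helper is private to the test class and is not shown in this excerpt. As a hedged sketch, a pickle round-trip check of this kind usually amounts to something like the following (hypothetical helper, not the project's code):

import pickle

def pickle_roundtrip(stage):
    # Serialize and restore the stage, then confirm the type survived.
    restored = pickle.loads(pickle.dumps(stage))
    assert type(restored) is type(stage)
    return restored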
Code Example #4
    def test_apply_to_selected_cols(self):
        rows = 100
        cols = 10
        random_data = np.random.rand(rows, cols)
        # enough NaNs so that there /has/ to be a NaN in 1 of our 3 selected cols
        nans = 701
        with_nans = np.copy(random_data)
        for r, c in zip(np.random.randint(0, rows, nans), 
                        np.random.randint(0, cols, nans)):
            with_nans[r,c] = np.NaN
        trials = ((wrap('sklearn.preprocessing.StandardScaler'), 
                   (), 
                   'X_train', 
                   'X_new',
                   np_nd_to_sa(random_data)), 
                  (FillNA, 
                   (0,), 
                   'input', 
                   'output',
                   np_nd_to_sa(with_nans)))
        sel_cols = ('f2', 'f3', 'f4')
        trials = trials[1:]

        for trans_cls, args, in_key, out_key, in_data in trials:
            p = Pipeline()

            node_in = p.add(NumpyRead(in_data))
            node_selected = p.add(
                ApplyToSelectedCols(sel_cols, trans_cls, *args))
            node_in['output'] > node_selected[in_key]

            node_out = p.add(NumpyWrite())
            node_selected[out_key] > node_out['input']

            node_ctrl_split = p.add(SplitColumns(sel_cols))
            node_in['output'] > node_ctrl_split['input']

            node_ctrl_trans = p.add(trans_cls(*args))
            node_ctrl_split['output'] > node_ctrl_trans[in_key]

            node_ctrl_out = p.add(NumpyWrite())
            node_ctrl_trans[out_key] > node_ctrl_out['input']

            self.run_pipeline(p)

            result = node_out.get_stage().result
            ctrl = node_ctrl_out.get_stage().result

            for col in in_data.dtype.names:
                if col in sel_cols:
                    self.assertTrue(np.allclose(result[col], ctrl[col]))
                else:
                    self.assertTrue(np.allclose(
                        np.nan_to_num(result[col]), 
                        np.nan_to_num(in_data[col])))
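
The column names 'f2', 'f3', 'f4' above come from np_nd_to_sa, which presumably converts a plain 2-D array into a structured array with autogenerated field names 'f0', 'f1', and so on (NumPy's default naming convention). A small plain-NumPy illustration of that naming, not the project's helper:

import numpy as np

nd = np.random.rand(4, 3)
sa = np.rec.fromarrays(nd.T, names='f0,f1,f2')
print(sa.dtype.names)   # ('f0', 'f1', 'f2')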
Code Example #5
File: test_model.py  Project: dssg/UPSG
    def test_cross_validation_score(self):
        rows = 100
        folds = 10

        X = np.random.random((rows, 10))
        y = np.random.randint(0, 2, (rows))

        trials = ((SKKFold, 
                   {'random_state': 0, 'n_folds': folds}, 
                   {'n': rows, 'n_folds': folds, 'random_state': 0}),
                  (StratifiedKFold, 
                   {'random_state': 0, 'n_folds': folds}, 
                   {'y': y, 'n_folds': folds, 'random_state': 0}))

        
        for PartIter, res_kwargs, ctrl_kwargs in trials:

            p = Pipeline()

            np_in_X = p.add(NumpyRead(X))
            np_in_y = p.add(NumpyRead(y))

            cv_score = p.add(CrossValidationScore(
                wrap(SVC), 
                {},
                'score', 
                wrap(PartIter),
                res_kwargs))
            np_in_X['output'] > cv_score['X_train']
            np_in_y['output'] > cv_score['y_train']

            score_out = p.add(CSVWrite(self._tmp_files('out.csv')))
            cv_score['score'] > score_out['input']

            self.run_pipeline(p)

            result = self._tmp_files.csv_read('out.csv')['f0']

            ctrl_kf = PartIter(**ctrl_kwargs)
            ctrl = np.mean(cross_val_score(SVC(), X, y, cv=ctrl_kf))

            self.assertTrue(np.allclose(ctrl, result))
Code Example #6
File: test_wrap.py  Project: Najah-lshanableh/UPSG
    def test_from_string(self):
        WrappedImputer = wrap('sklearn.preprocessing.Imputer')
        impute_stage = WrappedImputer()
Code Example #7
File: test_wrap.py  Project: Najah-lshanableh/UPSG
    def test_from_module(self):
        WrappedImputer = wrap(Imputer)
        impute_stage = WrappedImputer()