def test_cross_validation_score(self):
    rows = 100
    folds = 10

    X = np.random.random((rows, 10))
    y = np.random.randint(0, 2, (rows))

    p = Pipeline()

    np_in_X = p.add(NumpyRead(X))
    np_in_y = p.add(NumpyRead(y))

    cv_score = p.add(
        CrossValidationScore(wrap(SVC), 'score', {}, folds, random_state=0))
    np_in_X['output'] > cv_score['X_train']
    np_in_y['output'] > cv_score['y_train']

    score_out = p.add(CSVWrite(self._tmp_files('out.csv')))
    cv_score['score'] > score_out['input']

    self.run_pipeline(p)

    result = self._tmp_files.csv_read('out.csv')['f0']

    ctrl_kf = SKKFold(rows, folds, random_state=0)
    ctrl = np.mean(cross_val_score(SVC(), X, y, cv=ctrl_kf))

    self.assertTrue(np.allclose(ctrl, result))

def test_feature_importance(self):
    # columns with 50%, 20%, and 100% predictability
    X = np.array([[1, 0, 1],
                  [1, 0, 0],
                  [1, 0, 1],
                  [0, 0, 1],
                  [0, 0, 0],
                  [0, 0, 1],
                  [1, 0, 1],
                  [0, 0, 1]])
    y = np.array([1, 0, 1, 1, 0, 1, 1, 1])

    p = Pipeline()

    X_in = p.add(NumpyRead(X))
    y_in = p.add(NumpyRead(y))

    est = p.add(
        wrap_and_make_instance('sklearn.ensemble.RandomForestClassifier',
                               random_state=0))
    est(X_train=X_in, y_train=y_in)

    out = p.add(NumpyWrite())
    out(est['feature_importances'])

    p.run()

    result = out.get_stage().result['col_name']
    ctrl = np.array(['f2', 'f0', 'f1'])
    self.assertTrue(np.array_equal(ctrl, result))

def test_merge(self):
    a1 = np.array([(0, 'Lisa', 2),
                   (1, 'Bill', 1),
                   (2, 'Fred', 2),
                   (3, 'Samantha', 2),
                   (4, 'Augustine', 1),
                   (5, 'William', 0)],
                  dtype=[('id', int), ('name', 'S64'), ('dept_id', int)])
    a2 = np.array([(0, 'accts receivable'),
                   (1, 'accts payable'),
                   (2, 'shipping')],
                  dtype=[('id', int), ('name', 'S64')])
    kwargs = {}

    p = Pipeline()

    a1_in = p.add(NumpyRead(a1))
    a2_in = p.add(NumpyRead(a2))

    merge = p.add(Merge('dept_id', 'id', **kwargs))
    out = p.add(NumpyWrite())
    out(merge(a1_in, a2_in))

    self.run_pipeline(p)

    result = out.get_stage().result
    ctrl = obj_to_str(
        pd.DataFrame(a1).merge(pd.DataFrame(a2),
                               left_on='dept_id',
                               right_on='id').to_records(index=False))
    assert np.array_equal(result, ctrl)

def test_grid_search(self):
    """
    Simulates behavior of the example in:
    http://scikit-learn.org/stable/modules/generated/sklearn.grid_search.GridSearchCV.html#sklearn.grid_search.GridSearchCV
    """
    folds = 2

    parameters = {'kernel': ('rbf', 'linear'),
                  'C': [1, 10, 100],
                  'random_state': [0]}

    iris = datasets.load_iris()
    iris_data = iris.data
    iris_target = iris.target

    p = Pipeline()

    node_data = p.add(NumpyRead(iris_data))
    node_target = p.add(NumpyRead(iris_target))
    node_split = p.add(SplitTrainTest(2, random_state=1))
    node_search = p.add(GridSearch(
        wrap(SVC), parameters, 'score', cv_stage_kwargs={'n_folds': folds}))
    node_params_out = p.add(CSVWrite(self._tmp_files.get('out.csv')))

    node_data['output'] > node_split['input0']
    node_target['output'] > node_split['input1']

    node_split['train0'] > node_search['X_train']
    node_split['train1'] > node_search['y_train']
    node_split['test0'] > node_search['X_test']
    node_split['test1'] > node_search['y_test']

    node_search['params_out'] > node_params_out['input']

    self.run_pipeline(p)

    result = self._tmp_files.csv_read('out.csv')

    ctrl_X_train, _, ctrl_y_train, _ = train_test_split(
        iris_data, iris_target, random_state=1)
    ctrl_cv = SKKFold(ctrl_y_train.size, folds)
    ctrl_search = grid_search.GridSearchCV(SVC(), parameters, cv=ctrl_cv)
    ctrl_search.fit(ctrl_X_train, ctrl_y_train)
    control = ctrl_search.best_params_

    # TODO: a number of configurations tie here, and sklearn picks a
    # different best configuration than UPSG does (although they have the
    # same score). Ideally, we want to find parameters where there is a
    # clear winner.
    control = {'C': 10, 'kernel': 'linear', 'random_state': 0}

    self.assertEqual(np_sa_to_dict(np.array([result])), control)

def test_multiclassify(self):
    samples = 150
    features = 3
    folds = 2

    X = np.random.random((samples, features))
    y = np.random.randint(0, 2, (samples))

    p = Pipeline()

    np_in_X = p.add(NumpyRead(X))
    np_in_y = p.add(NumpyRead(y))

    split_train_test = p.add(SplitTrainTest(2))
    np_in_X['output'] > split_train_test['input0']
    np_in_y['output'] > split_train_test['input1']

    multi = p.add(
        Multiclassify('score', self._tmp_files('report.html'), None, folds))
    split_train_test['train0'] > multi['X_train']
    split_train_test['test0'] > multi['X_test']
    split_train_test['train1'] > multi['y_train']
    split_train_test['test1'] > multi['y_test']

    self.run_pipeline(p)

    self.assertTrue(os.path.isfile(self._tmp_files('report.html')))

def test_hstack(self):
    a = np.array([(0.0, 0.1), (1.0, 1.1), (2.0, 2.1)],
                 dtype=[('f0', float), ('f1', float)])
    b = np.array([(0.2, 0.3), (1.2, 1.3), (2.2, 2.3)],
                 dtype=[('f2', float), ('f3', float)])
    ctrl = np.array([(0.0, 0.1, 0.2, 0.3),
                     (1.0, 1.1, 1.2, 1.3),
                     (2.0, 2.1, 2.2, 2.3)],
                    dtype=[('f0', float), ('f1', float),
                           ('f2', float), ('f3', float)])

    p = Pipeline()

    np_in_a = p.add(NumpyRead(a))
    np_in_b = p.add(NumpyRead(b))

    hstack = p.add(HStack(2))
    hstack(np_in_a, np_in_b)

    out = p.add(NumpyWrite())
    out(hstack)

    p.run()

    self.assertTrue(np.array_equal(ctrl, out.get_stage().result))

def test_moving_params(self):
    digits = datasets.load_digits()
    digits_data = digits.data
    digits_target = digits.target

    p = Pipeline()

    node_data = p.add(NumpyRead(digits_data))
    node_target = p.add(NumpyRead(digits_target))
    node_split = p.add(SplitTrainTest(2, random_state=0))

    # parameters from
    # http://scikit-learn.org/stable/auto_examples/plot_classifier_comparison.html
    node_clf1 = p.add(
        wrap_and_make_instance(RandomForestClassifier, max_depth=5,
                               n_estimators=10, max_features=1,
                               random_state=0))
    node_clf2 = p.add(
        wrap_and_make_instance(RandomForestClassifier, max_depth=12,
                               n_estimators=100, max_features=1000))

    node_params_out_1 = p.add(
        CSVWrite(self._tmp_files.get('out_params_1.csv')))
    node_params_out_2 = p.add(
        CSVWrite(self._tmp_files.get('out_params_2.csv')))

    node_pred_out_1 = p.add(CSVWrite(
        self._tmp_files.get('out_pred_1.csv')))
    node_pred_out_2 = p.add(CSVWrite(
        self._tmp_files.get('out_pred_2.csv')))

    node_data['output'] > node_split['input0']
    node_target['output'] > node_split['input1']

    node_split['train0'] > node_clf1['X_train']
    node_split['train1'] > node_clf1['y_train']
    node_split['test0'] > node_clf1['X_test']

    node_split['train0'] > node_clf2['X_train']
    node_split['train1'] > node_clf2['y_train']
    node_split['test0'] > node_clf2['X_test']

    node_clf1['params_out'] > node_clf2['params_in']

    node_clf1['params_out'] > node_params_out_1['input']
    node_clf2['params_out'] > node_params_out_2['input']

    node_clf1['y_pred'] > node_pred_out_1['input']
    node_clf2['y_pred'] > node_pred_out_2['input']

    self.run_pipeline(p)

    params_1 = self._tmp_files.csv_read('out_params_1.csv')
    params_2 = self._tmp_files.csv_read('out_params_2.csv')
    self.assertTrue(np.array_equal(params_1, params_2))

    y_pred_1 = self._tmp_files.csv_read('out_pred_1.csv')
    y_pred_2 = self._tmp_files.csv_read('out_pred_2.csv')
    self.assertTrue(np.array_equal(y_pred_1, y_pred_2))

def test_lambda(self):
    # Test output key generation
    l1 = LambdaStage(lambda x, y: 0)
    self.assertEqual(l1.input_keys, ['x', 'y'])
    self.assertEqual(l1.output_keys, ['output0'])

    l2 = LambdaStage(lambda: 0, n_outputs=3)
    self.assertEqual(l2.input_keys, [])
    self.assertEqual(l2.output_keys,
                     ['output{}'.format(i) for i in xrange(3)])

    # Test running in a pipeline
    in_data = np_nd_to_sa(np.random.random((100, 10)))
    scale = np_nd_to_sa(np.array(3))
    out_keys = ['augmented', 'log_col', 'sqrt_col', 'scale_col']

    def log1_sqrt2_scale3(A, scale):
        names = A.dtype.names
        log_col = np.log(A[names[0]])
        sqrt_col = np.sqrt(A[names[1]])
        scale_col = A[names[2]] * scale[0][0]
        return (append_fields(A, ['log1', 'sqrt2', 'scale3'],
                              (log_col, sqrt_col, scale_col)),
                log_col, sqrt_col, scale_col)

    p = Pipeline()

    np_in = p.add(NumpyRead(in_data))
    scale_in = p.add(NumpyRead(scale))

    lambda_stage = p.add(LambdaStage(log1_sqrt2_scale3, out_keys))
    np_in['output'] > lambda_stage['A']
    scale_in['output'] > lambda_stage['scale']

    csv_out_stages = []
    for key in out_keys:
        stage = p.add(CSVWrite(self._tmp_files('out_{}.csv'.format(key))))
        csv_out_stages.append(stage)
        lambda_stage[key] > stage['input']

    self.run_pipeline(p)

    controls = log1_sqrt2_scale3(in_data, scale)

    for i, key in enumerate(out_keys):
        control = controls[i]
        if is_sa(control):
            control = np_sa_to_nd(control)[0]
        result = self._tmp_files.csv_read('out_{}.csv'.format(key),
                                          as_nd=True)
        self.assertTrue(np.allclose(control, result))

def test_multimetric(self):
    samples = 150
    features = 3

    metrics = (
        VisualMetricSpec(
            'sklearn.metrics.precision_recall_curve',  # metric
            'recall',                   # output key corresponding to x-axis
            'precision',                # output key corresponding to y-axis
            'Precision/Recall Curve',   # graph title
            'recall',                   # x-label
            'precision'),               # y-label
        VisualMetricSpec(
            'sklearn.metrics.roc_curve',
            None,
            ('tpr', 'fpr'),
            'ROC Curve',
            'Results tagged positive',
            'Rate',
            ('FPR', 'TPR')),
        NumericMetricSpec(
            'sklearn.metrics.roc_auc_score',
            'auc',
            'ROC AUC Score'))

    X = np.random.random((samples, features))
    y = np.random.randint(0, 2, (samples))

    p = Pipeline()

    np_in_X = p.add(NumpyRead(X))
    np_in_y = p.add(NumpyRead(y))

    split_train_test = p.add(SplitTrainTest(2))
    np_in_X['output'] > split_train_test['input0']
    np_in_y['output'] > split_train_test['input1']

    clf = p.add(wrap_and_make_instance(SVC, kernel='linear'))
    split_train_test['train0'] > clf['X_train']
    split_train_test['test0'] > clf['X_test']
    split_train_test['train1'] > clf['y_train']
    split_train_test['test1'] > clf['y_test']

    node_proba_cat_1 = p.add(SplitY(-1))
    clf['pred_proba'] > node_proba_cat_1['input']

    multi = p.add(Multimetric(
        metrics, 'SVC', self._tmp_files('report.html')))
    node_proba_cat_1['y'] > multi['pred_proba']
    split_train_test['test1'] > multi['y_true']
    clf['params_out'] > multi['params']

    self.run_pipeline(p)

    self.assertTrue(os.path.isfile(self._tmp_files('report.html')))

def test_numpy_write(self):
    in_data = np.random.rand(10, 10)

    p = Pipeline()

    np_in = p.add(NumpyRead(in_data))
    np_out = p.add(NumpyWrite())
    np_in['output'] > np_out['input']

    self.run_pipeline(p)

    self.assertTrue(
        np.allclose(in_data, np_sa_to_nd(np_out.get_stage().result)[0]))

def test_generate_feature(self):
    in_array = np.array([(0.0, 0.1, 0.2, 0.3),
                         (1.0, 1.1, 1.2, 1.3),
                         (2.0, 2.1, 2.2, 2.3)],
                        dtype=[('f0', float), ('f1', float),
                               ('f2', float), ('f3', float)])

    # single, default-named output column
    ctrl = np.array([(10.4,), (12.4,), (14.4,)], dtype=[('f0', float)])
    cols = ['f1', 'f3']
    f = lambda tab: tab['f1'] + tab['f3'] + 10

    p = Pipeline()
    np_in = p.add(NumpyRead(in_array))
    gen_feat = p.add(GenerateFeature(f, cols))
    gen_feat(np_in)
    out = p.add(NumpyWrite())
    out(gen_feat)
    p.run()
    self.assertTrue(np.array_equal(ctrl, out.get_stage().result))

    # multiple output columns with explicit names
    ctrl = np.array([(1, 10.1), (11, 11.1), (21, 12.1)],
                    dtype=[('times10', float), ('add10', float)])
    cols = ['f1']
    f = lambda tab: np.array(zip(tab['f1'] * 10, tab['f1'] + 10))
    out_col_names = ['times10', 'add10']

    p = Pipeline()
    np_in = p.add(NumpyRead(in_array))
    gen_feat = p.add(GenerateFeature(f, cols, out_col_names))
    gen_feat(np_in)
    out = p.add(NumpyWrite())
    out(gen_feat)
    p.run()
    self.assertTrue(np.array_equal(ctrl, out.get_stage().result))

def test_cross_validation_score(self):
    rows = 100
    folds = 10

    X = np.random.random((rows, 10))
    y = np.random.randint(0, 2, (rows))

    trials = ((SKKFold,
               {'random_state': 0, 'n_folds': folds},
               {'n': rows, 'n_folds': folds, 'random_state': 0}),
              (StratifiedKFold,
               {'random_state': 0, 'n_folds': folds},
               {'y': y, 'n_folds': folds, 'random_state': 0}))

    for PartIter, res_kwargs, ctrl_kwargs in trials:
        p = Pipeline()

        np_in_X = p.add(NumpyRead(X))
        np_in_y = p.add(NumpyRead(y))

        cv_score = p.add(CrossValidationScore(
            wrap(SVC), {}, 'score', wrap(PartIter), res_kwargs))
        np_in_X['output'] > cv_score['X_train']
        np_in_y['output'] > cv_score['y_train']

        score_out = p.add(CSVWrite(self._tmp_files('out.csv')))
        cv_score['score'] > score_out['input']

        self.run_pipeline(p)

        result = self._tmp_files.csv_read('out.csv')['f0']

        ctrl_kf = PartIter(**ctrl_kwargs)
        ctrl = np.mean(cross_val_score(SVC(), X, y, cv=ctrl_kf))

        self.assertTrue(np.allclose(ctrl, result))

def test_plot_roc(self):
    # based on
    # http://scikit-learn.org/stable/auto_examples/plot_roc_crossval.html
    from sklearn.svm import SVC
    from sklearn.metrics import roc_curve
    from sklearn import datasets

    iris = datasets.load_iris()
    iris_data = iris.data[iris.target != 2]
    iris_target = iris.target[iris.target != 2]

    p = Pipeline()

    node_data = p.add(NumpyRead(iris_data))
    node_target = p.add(NumpyRead(iris_target))
    node_split = p.add(SplitTrainTest(2, random_state=0))
    node_clf = p.add(wrap_and_make_instance(SVC, random_state=0))
    node_select = p.add(SplitY(1))
    node_roc = p.add(wrap_and_make_instance(roc_curve))
    node_plot = p.add(
        Plot(self._tmp_files('result.png'), 'co-',
             title='ROC Curve', xlabel='FPR', ylabel='TPR'))

    node_data['output'] > node_split['input0']
    node_target['output'] > node_split['input1']

    node_split['train0'] > node_clf['X_train']
    node_split['train1'] > node_clf['y_train']
    node_split['test0'] > node_clf['X_test']

    node_clf['pred_proba'] > node_select['input']
    node_select['y'] > node_roc['y_score']
    node_split['test1'] > node_roc['y_true']

    node_roc['fpr'] > node_plot['x']
    node_roc['tpr'] > node_plot['y']

    self.run_pipeline(p)

    self.assertTrue(os.path.isfile(self._tmp_files('result.png')))

def test_kfold(self):
    folds = 3
    rows = 6

    X = np.random.randint(0, 1000, (rows, 3))
    y = np.random.randint(0, 1000, (rows, 1))

    p = Pipeline()

    np_in_X = p.add(NumpyRead(X))
    np_in_y = p.add(NumpyRead(y))

    kfold = p.add(KFold(2, folds, random_state=0))
    np_in_X['output'] > kfold['input0']
    np_in_y['output'] > kfold['input1']

    ctrl_kf = SKKFold(rows, n_folds=folds, random_state=0)

    out_files = []
    expected_folds = []
    arrays = (X, y)
    for fold_i, train_test_inds in enumerate(ctrl_kf):
        for array_i, array in enumerate(arrays):
            for select_i, selection in enumerate(('train', 'test')):
                out_key = '{}{}_{}'.format(selection, array_i, fold_i)
                out_file = out_key + '.csv'
                out_files.append(out_file)

                stage = p.add(CSVWrite(self._tmp_files(out_file)))
                kfold[out_key] > stage['input']

                slice_inds = train_test_inds[select_i]
                expected_folds.append(
                    np_nd_to_sa(arrays[array_i][slice_inds]))

    self.run_pipeline(p)

    for out_file, expected_fold in zip(out_files, expected_folds):
        self.assertTrue(
            np.array_equal(self._tmp_files.csv_read(out_file),
                           expected_fold))

def test_apply_to_selected_cols(self):
    rows = 100
    cols = 10
    random_data = np.random.rand(rows, cols)

    # enough NaNs so that there /has/ to be a NaN in one of our 3 selected
    # columns
    nans = 701
    with_nans = np.copy(random_data)
    for r, c in zip(np.random.randint(0, rows, nans),
                    np.random.randint(0, cols, nans)):
        with_nans[r, c] = np.NaN

    trials = ((wrap('sklearn.preprocessing.StandardScaler'), (),
               'X_train', 'X_new', np_nd_to_sa(random_data)),
              (FillNA, (0,), 'input', 'output', np_nd_to_sa(with_nans)))
    sel_cols = ('f2', 'f3', 'f4')
    # currently only the FillNA trial is exercised
    trials = trials[1:]

    for trans_cls, args, in_key, out_key, in_data in trials:
        p = Pipeline()

        node_in = p.add(NumpyRead(in_data))

        node_selected = p.add(
            ApplyToSelectedCols(sel_cols, trans_cls, *args))
        node_in['output'] > node_selected[in_key]

        node_out = p.add(NumpyWrite())
        node_selected[out_key] > node_out['input']

        node_ctrl_split = p.add(SplitColumns(sel_cols))
        node_in['output'] > node_ctrl_split['input']

        node_ctrl_trans = p.add(trans_cls(*args))
        node_ctrl_split['output'] > node_ctrl_trans[in_key]

        node_ctrl_out = p.add(NumpyWrite())
        node_ctrl_trans[out_key] > node_ctrl_out['input']

        self.run_pipeline(p)

        result = node_out.get_stage().result
        ctrl = node_ctrl_out.get_stage().result

        for col in in_data.dtype.names:
            if col in sel_cols:
                self.assertTrue(np.allclose(result[col], ctrl[col]))
            else:
                self.assertTrue(
                    np.allclose(np.nan_to_num(result[col]),
                                np.nan_to_num(in_data[col])))

def test_identity(self):
    trials = [(('input0', 'input1'),
               ('output0', 'output1'),
               {'input0': 'output0', 'input1': 'output1'},
               True),
              (('input0', 'input1', 'input2'),
               ('input0_out', 'input1_out', 'input2_out'),
               ('input0', 'input1', 'input2'),
               True),
              (('input0', 'input1'),
               ('output0', 'output1'),
               {'output0': 'input0', 'output1': 'input1'},
               False),
              (('output0_in', 'output1_in', 'output2_in'),
               ('output0', 'output1', 'output2'),
               ('output0', 'output1', 'output2'),
               False)]

    for input_keys, output_keys, arg, specify_input in trials:
        in_data_arrays = []
        out_nodes = []

        p = Pipeline()

        if specify_input:
            node_id = p.add(Identity(arg))
        else:
            node_id = p.add(Identity(output_keys=arg))

        for input_key, output_key in zip(input_keys, output_keys):
            in_data = np_nd_to_sa(np.random.random((100, 10)))
            node_in = p.add(NumpyRead(in_data))
            node_in['output'] > node_id[input_key]

            node_out = p.add(NumpyWrite())
            node_id[output_key] > node_out['input']

            in_data_arrays.append(in_data)
            out_nodes.append(node_out)

        self.run_pipeline(p)

        for in_data, out_node in zip(in_data_arrays, out_nodes):
            self.assertTrue(
                np.array_equal(in_data, out_node.get_stage().result))

def test_query_dates(self):
    p = Pipeline()

    dates = np.array([(np.datetime64('2012-01-01'),),
                      (np.datetime64('2013-04-05'),),
                      (np.datetime64('2014-03-11'),),
                      (np.datetime64('2015-01-01'),)],
                     dtype=[('dt', 'M8[D]')])
    inds = np.array([(i,) for i in xrange(dates.size)],
                    dtype=[('f0', int)])

    np_in = p.add(NumpyRead(dates))

    q2_node = p.add(Query("dt <= DT('2014-01-01')"))
    np_in['output'] > q2_node['input']

    np_out = p.add(NumpyWrite())
    q2_node['output'] > np_out['input']

    np_complement = p.add(NumpyWrite())
    q2_node['complement'] > np_complement['input']

    np_out_inds = p.add(NumpyWrite())
    q2_node['output_inds'] > np_out_inds['input']

    np_complement_inds = p.add(NumpyWrite())
    q2_node['complement_inds'] > np_complement_inds['input']

    self.run_pipeline(p)

    self.assertTrue(np.array_equal(np_out.get_stage().result, dates[:2]))
    self.assertTrue(
        np.array_equal(np_complement.get_stage().result, dates[2:]))
    self.assertTrue(
        np.array_equal(np_out_inds.get_stage().result, inds[:2]))
    self.assertTrue(
        np.array_equal(np_complement_inds.get_stage().result, inds[2:]))

def test_split_by_inds(self):
    in_data = np.array([(0, 0), (1, 1), (2, 0), (3, 1)],
                       dtype=[('id', int), ('include', int)])

    p = Pipeline()

    np_in = p.add(NumpyRead(in_data))

    query = p.add(Query('include != 0'))
    query(np_in)

    split_inds = p.add(SplitByInds())
    split_inds(np_in, query['output_inds'])

    out = p.add(NumpyWrite())
    out(split_inds)

    self.run_pipeline(p)

    ctrl = np.array([(1, 1), (3, 1)],
                    dtype=[('id', int), ('include', int)])
    self.assertTrue(np.array_equal(ctrl, out.get_stage().result))

def test_tutorial(self):
    """
    Verifies we can do what sklearn does here:
    http://scikit-learn.org/stable/tutorial/basic/tutorial.html
    """
    digits = datasets.load_digits()
    digits_data = digits.data
    # for now, we need a column vector rather than an array
    digits_target = digits.target

    p = Pipeline()

    # load data from a numpy dataset
    stage_data = NumpyRead(digits_data)
    stage_target = NumpyRead(digits_target)

    # train/test split
    stage_split_data = SplitTrainTest(2, test_size=1, random_state=0)

    # build a classifier
    stage_clf = wrap_and_make_instance(SVC, gamma=0.001, C=100.)

    # output to a csv
    stage_csv = CSVWrite(self._tmp_files('out.csv'))

    node_data, node_target, node_split, node_clf, node_csv = map(
        p.add,
        [stage_data, stage_target, stage_split_data, stage_clf, stage_csv])

    # connect the pipeline stages together
    node_data['output'] > node_split['input0']
    node_target['output'] > node_split['input1']
    node_split['train0'] > node_clf['X_train']
    node_split['train1'] > node_clf['y_train']
    node_split['test0'] > node_clf['X_test']
    node_clf['y_pred'] > node_csv['input']

    self.run_pipeline(p)

    result = self._tmp_files.csv_read('out.csv', True)

    # make sure we get the same result as sklearn
    clf = SVC(gamma=0.001, C=100.)
    # The tutorial just splits using array slicing, but we need to make
    # sure that both UPSG and sklearn are splitting the same way, so we
    # do something more sophisticated
    train_X, test_X, train_y, test_y = train_test_split(
        digits_data, digits_target, test_size=1, random_state=0)
    clf.fit(train_X, np.ravel(train_y))
    control = clf.predict(test_X)[0]

    self.assertAlmostEqual(result, control)

    # model persistence
    s = pickle.dumps(stage_clf)
    stage_clf2 = pickle.loads(s)
    self.assertEqual(stage_clf.get_params(), stage_clf2.get_params())

def __metric_pipeline(self, metric, params={}, in_data=None):
    X_in, y_in = self.__process_in_data(in_data)

    metric_stage = wrap_and_make_instance(metric, **params)
    in_keys = metric_stage.input_keys
    out_keys = metric_stage.output_keys

    p = Pipeline()

    node_X_in = p.add(NumpyRead(X_in))
    node_y_in = p.add(NumpyRead(y_in))

    node_split = p.add(SplitTrainTest(2, random_state=0))
    node_X_in['output'] > node_split['input0']
    node_y_in['output'] > node_split['input1']

    ctrl_X_train, ctrl_X_test, ctrl_y_train, ctrl_y_test = (
        train_test_split(X_in, y_in, random_state=0))

    node_clf = p.add(wrap_and_make_instance(SVC, random_state=0))
    node_split['train0'] > node_clf['X_train']
    node_split['train1'] > node_clf['y_train']
    node_split['test0'] > node_clf['X_test']

    ctrl_clf = SVC(random_state=0, probability=True)
    ctrl_clf.fit(ctrl_X_train, ctrl_y_train)

    node_proba_1 = p.add(SplitY(1))
    node_clf['pred_proba'] > node_proba_1['input']

    ctrl_y_score = ctrl_clf.predict_proba(ctrl_X_test)[:, 1]

    node_metric = p.add(metric_stage)

    ctrl_metric_args = {}
    if 'y_true' in in_keys:
        node_split['test1'] > node_metric['y_true']
        ctrl_metric_args['y_true'] = ctrl_y_test
    if 'y_score' in in_keys:
        node_proba_1['y'] > node_metric['y_score']
        ctrl_metric_args['y_score'] = ctrl_y_score
    if 'probas_pred' in in_keys:
        node_proba_1['y'] > node_metric['probas_pred']
        ctrl_metric_args['probas_pred'] = ctrl_y_score

    out_nodes = [
        p.add(CSVWrite(self._tmp_files('out_{}.csv'.format(out_key))))
        for out_key in out_keys]
    for i, out_key in enumerate(out_keys):
        node_metric[out_key] > out_nodes[i]['input']

    self.run_pipeline(p)

    ctrl_returns = metric(**ctrl_metric_args)
    if len(out_keys) == 1:
        ctrl_returns = (ctrl_returns,)

    for i, out_key in enumerate(out_keys):
        control = ctrl_returns[i]
        result = self._tmp_files.csv_read('out_{}.csv'.format(out_key),
                                          as_nd=True)
        self.assertTrue(result.shape == control.shape and
                        np.allclose(result, control))

def __simple_pipeline(self, sk_cls, sk_method_name, upsg_out_key,
                      init_kwargs={}, in_data=None):
    X_in, y_in = self.__process_in_data(in_data)

    ctrl_sk_inst = sk_cls(**init_kwargs)
    est_params = ctrl_sk_inst.get_params()
    try:
        random_state = est_params['random_state']
        if random_state is None:
            # This has to be fixed. Set a state and try again
            init_kwargs['random_state'] = 0
            ctrl_sk_inst = sk_cls(**init_kwargs)
    except KeyError:
        pass

    p = Pipeline()

    sk_stage = p.add(wrap_and_make_instance(sk_cls, **init_kwargs))

    X_in_stage = p.add(NumpyRead(X_in))
    y_in_stage = p.add(NumpyRead(y_in))

    if sk_method_name == 'predict':
        train_test = p.add(SplitTrainTest(2, random_state=0))
        X_in_stage['output'] > train_test['input0']
        y_in_stage['output'] > train_test['input1']

        input_keys = sk_stage.get_stage().input_keys
        if 'X_train' in input_keys:
            train_test['train0'] > sk_stage['X_train']
        if 'X_test' in input_keys:
            train_test['test0'] > sk_stage['X_test']
        if 'y_train' in input_keys:
            train_test['train1'] > sk_stage['y_train']
    else:
        X_in_stage['output'] > sk_stage['X_train']
        y_in_stage['output'] > sk_stage['y_train']

    csv_out = p.add(CSVWrite(self._tmp_files.get('out.csv')))
    sk_stage[upsg_out_key] > csv_out['input']

    self.run_pipeline(p)

    if sk_method_name == 'predict':
        ctrl_X_train, ctrl_X_test, ctrl_y_train, ctrl_y_test = (
            train_test_split(X_in, y_in, random_state=0))
        ctrl_sk_inst.fit(ctrl_X_train, ctrl_y_train)
        control = ctrl_sk_inst.predict(ctrl_X_test)
    else:
        control = ctrl_sk_inst.fit_transform(X_in, y_in)

    result = self._tmp_files.csv_read('out.csv', as_nd=True)
    if result.ndim != control.ndim and result.ndim == 1:
        result = result.reshape(result.size, 1)

    self.assertTrue(result.shape == control.shape and
                    np.allclose(result, control))

def test_wrap_cross_validation(self):
    X = np.array([(0, 2001, 12.31),
                  (1, 1999, 14.32),
                  (2, 1999, 120.76),
                  (3, 2002, 32.12),
                  (4, 2004, 98.64),
                  (5, 2005, 32.21),
                  (6, 2002, 100.23),
                  (7, 2006, 123.40),
                  (8, 2000, 72.21)],
                 dtype=[('id', int), ('year', int), ('fine', float)])
    y = np.array([(0,), (1,), (0,), (1,), (0,), (1,), (0,), (1,), (0,)],
                 dtype=[('category', int)])
    ctrl_inds = [([1, 2, 8], [0, 3, 6]),
                 ([0, 3, 6], [4]),
                 ([4], [5, 7])]

    p = Pipeline()

    node_X_in = p.add(NumpyRead(X))
    node_y_in = p.add(NumpyRead(y))

    node_just_time = p.add(SplitColumns(['year']))
    node_just_time(node_X_in)

    training_windows = by_window_ranges(1999, 2000, 2004, 2)
    testing_windows = by_window_ranges(2001, 2002, 2006, 2)
    mode = ByWindowMode.SLIDING
    node_cv = p.add(
        wrap_and_make_instance(
            'upsg.transform.partition_iterators.ByWindow',
            n_arrays=2,
            training_windows=training_windows,
            testing_windows=testing_windows,
            mode=mode))
    node_cv(input0=node_X_in, input1=node_y_in, y=node_just_time)

    self.assertEqual(len(node_cv.output_keys), 2 * 2 * len(ctrl_inds))

    out_nodes = []
    for i in xrange(len(ctrl_inds)):
        train_node_X = p.add(NumpyWrite())
        train_node_X(node_cv['train0_{}'.format(i)])

        train_node_y = p.add(NumpyWrite())
        train_node_y(node_cv['train1_{}'.format(i)])

        test_node_X = p.add(NumpyWrite())
        test_node_X(node_cv['test0_{}'.format(i)])

        test_node_y = p.add(NumpyWrite())
        test_node_y(node_cv['test1_{}'.format(i)])

        out_nodes.append(
            (train_node_X, train_node_y, test_node_X, test_node_y))

    p.run()

    for i, (train_node_X, train_node_y, test_node_X, test_node_y) in \
            enumerate(out_nodes):
        self.assertTrue(
            np.array_equal(train_node_X.get_stage().result,
                           X[ctrl_inds[i][0]]))
        self.assertTrue(
            np.array_equal(train_node_y.get_stage().result,
                           y[ctrl_inds[i][0]]))
        self.assertTrue(
            np.array_equal(test_node_X.get_stage().result,
                           X[ctrl_inds[i][1]]))
        self.assertTrue(
            np.array_equal(test_node_y.get_stage().result,
                           y[ctrl_inds[i][1]]))