def test_moving_params(self):
    """Feed one classifier's 'params_out' into another's 'params_in'.

    Two RandomForestClassifier stages are created with different
    constructor arguments and the first stage's parameters are wired into
    the second. The test passes when both stages write identical
    parameter files and identical prediction files.
    """
    digits = datasets.load_digits()
    features = digits.data
    labels = digits.target

    p = Pipeline()

    data_node = p.add(NumpyRead(features))
    target_node = p.add(NumpyRead(labels))
    split_node = p.add(SplitTrainTest(2, random_state=0))

    # parameters from
    # http://scikit-learn.org/stable/auto_examples/plot_classifier_comparison.html
    source_clf = wrap_and_make_instance(
        RandomForestClassifier,
        max_depth=5,
        n_estimators=10,
        max_features=1,
        random_state=0)
    clf_a = p.add(source_clf)
    receiving_clf = wrap_and_make_instance(
        RandomForestClassifier,
        max_depth=12,
        n_estimators=100,
        max_features=1000)
    clf_b = p.add(receiving_clf)

    params_sink_a = p.add(
        CSVWrite(self._tmp_files.get('out_params_1.csv')))
    params_sink_b = p.add(
        CSVWrite(self._tmp_files.get('out_params_2.csv')))
    pred_sink_a = p.add(
        CSVWrite(self._tmp_files.get('out_pred_1.csv')))
    pred_sink_b = p.add(
        CSVWrite(self._tmp_files.get('out_pred_2.csv')))

    data_node['output'] > split_node['input0']
    target_node['output'] > split_node['input1']

    # Both classifiers receive the same train/test partitions.
    for clf_node in (clf_a, clf_b):
        split_node['train0'] > clf_node['X_train']
        split_node['train1'] > clf_node['y_train']
        split_node['test0'] > clf_node['X_test']

    clf_a['params_out'] > clf_b['params_in']

    clf_a['params_out'] > params_sink_a['input']
    clf_b['params_out'] > params_sink_b['input']

    clf_a['y_pred'] > pred_sink_a['input']
    clf_b['y_pred'] > pred_sink_b['input']

    self.run_pipeline(p)

    written_params = [
        self._tmp_files.csv_read(file_name)
        for file_name in ('out_params_1.csv', 'out_params_2.csv')]
    self.assertTrue(np.array_equal(*written_params))

    written_preds = [
        self._tmp_files.csv_read(file_name)
        for file_name in ('out_pred_1.csv', 'out_pred_2.csv')]
    self.assertTrue(np.array_equal(*written_preds))
def test_query_complex(self):
    """Run a compound Query and check both of its outputs.

    The rows written from the stage's 'output' and 'complement' keys are
    compared against the stored control files.
    """
    query_text = ("((id == value) and not (use_this_col == 'no'))"
                  "or name == 'fish'")

    p = Pipeline()

    source = p.add(CSVRead(path_of_data('query.csv')))
    query_node = p.add(Query(query_text))
    matched_sink = p.add(CSVWrite(self._tmp_files('out.csv')))
    unmatched_sink = p.add(CSVWrite(self._tmp_files('out_comp.csv')))

    source['output'] > query_node['input']
    query_node['output'] > matched_sink['input']
    query_node['complement'] > unmatched_sink['input']

    self.run_pipeline(p)

    # (file written by the pipeline, control file it must equal)
    checks = (
        ('out.csv', 'query_ctrl.csv'),
        ('out_comp.csv', 'query_ctrl_comp.csv'),
    )
    for written_name, ctrl_name in checks:
        result = self._tmp_files.csv_read(written_name)
        ctrl = csv_read(path_of_data(ctrl_name))
        self.assertTrue(np.array_equal(result, ctrl))
def test_cross_validation_score(self):
    """Compare CrossValidationScore with sklearn's cross_val_score.

    Random features and binary labels are scored with an SVC through the
    pipeline stage; the value written to CSV must be close to the mean of
    sklearn's per-fold scores computed with an SKKFold control.
    """
    n_rows = 100
    n_folds = 10

    # Draw X before y so the random stream is consumed in a fixed order.
    X = np.random.random((n_rows, 10))
    y = np.random.randint(0, 2, n_rows)

    p = Pipeline()

    x_source = p.add(NumpyRead(X))
    y_source = p.add(NumpyRead(y))

    scorer = p.add(
        CrossValidationScore(wrap(SVC), 'score', {}, n_folds,
                             random_state=0))
    x_source['output'] > scorer['X_train']
    y_source['output'] > scorer['y_train']

    score_sink = p.add(CSVWrite(self._tmp_files('out.csv')))
    scorer['score'] > score_sink['input']

    self.run_pipeline(p)

    written = self._tmp_files.csv_read('out.csv')
    result = written['f0']

    control_folds = SKKFold(n_rows, n_folds, random_state=0)
    per_fold_scores = cross_val_score(SVC(), X, y, cv=control_folds)
    ctrl = np.mean(per_fold_scores)

    self.assertTrue(np.allclose(ctrl, result))
def test_grid_search(self):
    """
    Simulates behavior of example in:
    http://scikit-learn.org/stable/modules/generated/sklearn.grid_search.GridSearchCV.html#sklearn.grid_search.GridSearchCV

    The parameters written by the GridSearch stage are compared against a
    fixed expected configuration (see the TODO below for why sklearn's
    own answer is not used).
    """
    n_folds = 2
    search_space = {
        'kernel': ('rbf', 'linear'),
        'C': [1, 10, 100],
        'random_state': [0],
    }

    iris = datasets.load_iris()
    features = iris.data
    labels = iris.target

    p = Pipeline()

    data_node = p.add(NumpyRead(features))
    target_node = p.add(NumpyRead(labels))
    split_node = p.add(SplitTrainTest(2, random_state=1))
    search_node = p.add(GridSearch(
        wrap(SVC),
        search_space,
        'score',
        cv_stage_kwargs={'n_folds': n_folds}))
    params_sink = p.add(CSVWrite(self._tmp_files.get('out.csv')))

    data_node['output'] > split_node['input0']
    target_node['output'] > split_node['input1']

    split_node['train0'] > search_node['X_train']
    split_node['train1'] > search_node['y_train']
    split_node['test0'] > search_node['X_test']
    split_node['test1'] > search_node['y_test']

    search_node['params_out'] > params_sink['input']

    self.run_pipeline(p)

    result = self._tmp_files.csv_read('out.csv')

    # Run the equivalent search directly in sklearn.
    sk_X_train, _, sk_y_train, _ = train_test_split(
        features, labels, random_state=1)
    sk_folds = SKKFold(sk_y_train.size, n_folds)
    sk_search = grid_search.GridSearchCV(SVC(), search_space, cv=sk_folds)
    sk_search.fit(sk_X_train, sk_y_train)
    control = sk_search.best_params_

    # TODO a number of configurations tie here, and sklearn picks a
    # different best configuration than upsg does (although they have the
    # same score) ideally, we want to find some parameters where there is
    # a clear winner
    control = {'C': 10, 'kernel': 'linear', 'random_state': 0}

    self.assertEqual(np_sa_to_dict(np.array([result])), control)
def test_3_stage(self):
    """Run a read -> impute -> write pipeline and compare with sklearn.

    A CSV with missing values is passed through a wrapped Imputer stage
    and written back out; the written data must be close to what
    Imputer.fit_transform produces on the same file.
    """
    from sklearn.preprocessing import Imputer

    infile_name = path_of_data('missing_vals.csv')

    p = Pipeline()

    csv_read_node = p.add(CSVRead(infile_name))
    csv_write_node = p.add(CSVWrite(self._tmp_files.get('out.csv')))
    impute_node = p.add(wrap_and_make_instance(Imputer))

    csv_read_node['output'] > impute_node['X_train']
    impute_node['X_new'] > csv_write_node['input']

    self.run_pipeline(p)

    ctrl_imputer = Imputer()
    ctrl_X_sa = np.genfromtxt(infile_name, dtype=None, delimiter=",",
                              names=True)
    # np_sa_to_nd returns a pair; only its first element (the converted
    # array) is needed for the control computation.
    ctrl_X_nd = np_sa_to_nd(ctrl_X_sa)[0]
    control = ctrl_imputer.fit_transform(ctrl_X_nd)

    result = self._tmp_files.csv_read('out.csv', True)

    self.assertTrue(np.allclose(result, control))
def test_lambda(self):
    """Check LambdaStage key generation and its use inside a pipeline.

    The first half asserts the input/output keys reported for two small
    lambdas. The second half wraps a four-output function in a
    LambdaStage, writes every output to CSV and compares each file with
    the value returned by calling the function directly.
    """
    # Test output key generation
    two_arg_stage = LambdaStage(lambda x, y: 0)
    self.assertEqual(two_arg_stage.input_keys, ['x', 'y'])
    self.assertEqual(two_arg_stage.output_keys, ['output0'])

    no_arg_stage = LambdaStage(lambda: 0, n_outputs=3)
    self.assertEqual(no_arg_stage.input_keys, [])
    expected_keys = ['output{}'.format(i) for i in xrange(3)]
    self.assertEqual(no_arg_stage.output_keys, expected_keys)

    # Test running in pipeline
    in_data = np_nd_to_sa(np.random.random((100, 10)))
    scale = np_nd_to_sa(np.array(3))
    out_keys = ['augmented', 'log_col', 'sqrt_col', 'scale_col']

    # The parameter names 'A' and 'scale' are also used below as the
    # stage's input keys, so they must not be renamed.
    def log1_sqrt2_scale3(A, scale):
        col_names = A.dtype.names
        log_col = np.log(A[col_names[0]])
        sqrt_col = np.sqrt(A[col_names[1]])
        scale_col = A[col_names[2]] * scale[0][0]
        augmented = append_fields(
            A,
            ['log1', 'sqrt2', 'scale3'],
            (log_col, sqrt_col, scale_col))
        return (augmented, log_col, sqrt_col, scale_col)

    p = Pipeline()

    np_in = p.add(NumpyRead(in_data))
    scale_in = p.add(NumpyRead(scale))

    lambda_stage = p.add(LambdaStage(log1_sqrt2_scale3, out_keys))

    np_in['output'] > lambda_stage['A']
    scale_in['output'] > lambda_stage['scale']

    # One CSV writer per output key of the lambda stage.
    csv_out_stages = []
    for out_key in out_keys:
        out_path = self._tmp_files('out_{}.csv'.format(out_key))
        writer = p.add(CSVWrite(out_path))
        csv_out_stages.append(writer)
        lambda_stage[out_key] > writer['input']

    self.run_pipeline(p)

    controls = log1_sqrt2_scale3(in_data, scale)

    for out_key, control in zip(out_keys, controls):
        if is_sa(control):
            control = np_sa_to_nd(control)[0]
        result = self._tmp_files.csv_read(
            'out_{}.csv'.format(out_key),
            as_nd=True)
        self.assertTrue(np.allclose(control, result))
def test_sql(self):
    """Chain three RunSQL stages and compare the joined table with a CSV.

    Two stages copy the 'employees' and 'hours' tables into temporary
    tables, a third joins them, and the joined rows are written to CSV
    and compared with the stored control file.
    """
    # Make sure we don't accidentally corrupt our test database
    db_path, _ = self._tmp_files.tmp_copy(path_of_data('small.db'))
    db_url = 'sqlite:///{}'.format(db_path)

    q_sel_employees = 'CREATE TABLE {tmp_emp} AS SELECT * FROM employees;'
    # We have to be careful about the datetime type in sqlite3. It will
    # forget if we don't keep reminding it, and if it forgets sqlalchemy
    # will be unhappy. Hence, we can't use CREATE TABLE AS if our table
    # has a DATETIME
    q_sel_hours = ('CREATE TABLE {tmp_hrs} '
                   '(id INT, employee_id INT, time DATETIME, '
                   ' event_type TEXT); '
                   'INSERT INTO {tmp_hrs} SELECT * FROM hours;')
    q_join = ('CREATE TABLE {joined} '
              '(id INT, last_name TEXT, salary REAL, time DATETIME, '
              ' event_type TEXT); '
              'INSERT INTO {joined} '
              'SELECT {tmp_emp}.id, last_name, salary, time, event_type '
              'FROM {tmp_emp} JOIN {tmp_hrs} ON '
              '{tmp_emp}.id = {tmp_hrs}.employee_id;')

    p = Pipeline()

    employees_node = p.add(
        RunSQL(db_url, q_sel_employees, [], ['tmp_emp'], {}))
    hours_node = p.add(
        RunSQL(db_url, q_sel_hours, [], ['tmp_hrs'], {}))
    join_node = p.add(
        RunSQL(db_url, q_join, ['tmp_emp', 'tmp_hrs'], ['joined'], {}))
    sink = p.add(CSVWrite(self._tmp_files('out.csv')))

    employees_node['tmp_emp'] > join_node['tmp_emp']
    hours_node['tmp_hrs'] > join_node['tmp_hrs']
    join_node['joined'] > sink['input']

    self.run_pipeline(p)

    ctrl = csv_read(path_of_data('test_transform_test_sql_ctrl.csv'))
    result = self._tmp_files.csv_read('out.csv')

    # Because Numpy insists on printing times with local offsets, but
    # not every computer has the same offset, we have to force it back
    # into UTC
    for row_idx, stamp in enumerate(result['time']):
        # .item() makes a datetime, which we can format correctly later
        # http://stackoverflow.com/questions/25134639/how-to-force-python-print-numpy-datetime64-with-specified-timezone
        as_datetime = np.datetime64(stamp).item()
        result['time'][row_idx] = as_datetime.strftime('%Y-%m-%dT%H:%M:%S')

    # Then we have to make the string field smaller
    recast_cols = [
        result[col_name].astype(ctrl.dtype[col_name])
        for col_name in result.dtype.names]
    result = merge_arrays(recast_cols, flatten=True)
    result.dtype.names = ctrl.dtype.names

    self.assertTrue(np.array_equal(result, ctrl))
def test_split_columns(self):
    """Split columns 'F1' and 'F3' out of a CSV and check both halves.

    The stage's 'output' and 'complement' are written to separate files
    and each is compared with its stored control file.
    """
    p = Pipeline()

    source = p.add(CSVRead(path_of_data('numbers.csv')))
    splitter = p.add(SplitColumns(('F1', 'F3')))
    selected_sink = p.add(CSVWrite(self._tmp_files('out_sel.csv')))
    rest_sink = p.add(CSVWrite(self._tmp_files('out_rest.csv')))

    source['output'] > splitter['input']
    splitter['output'] > selected_sink['input']
    splitter['complement'] > rest_sink['input']

    self.run_pipeline(p)

    # (file written by the pipeline, control file it must equal)
    checks = (
        ('out_sel.csv', 'test_split_columns_ctrl_selected.csv'),
        ('out_rest.csv', 'test_split_columns_ctrl_rest.csv'),
    )
    for written_name, ctrl_name in checks:
        result = self._tmp_files.csv_read(written_name)
        ctrl = csv_read(path_of_data(ctrl_name))
        self.assertTrue(np.array_equal(result, ctrl))
def test_fill_na(self):
    """Pass a CSV through FillNA(-1) and compare with the control file."""
    source_path = path_of_data('missing_vals_mixed.csv')
    ctrl_path = path_of_data('test_transform_test_fill_na_ctrl.csv')

    p = Pipeline()

    source = p.add(CSVRead(source_path))
    filler = p.add(FillNA(-1))
    sink = p.add(CSVWrite(self._tmp_files('out.csv')))

    source['output'] > filler['input']
    filler['output'] > sink['input']

    self.run_pipeline(p)

    written = self._tmp_files.csv_read('out.csv')
    expected = csv_read(ctrl_path)

    self.assertTrue(np.array_equal(written, expected))
def test_label_encode(self):
    """Pass a CSV through LabelEncode and compare with the control file."""
    source_path = path_of_data('categories.csv')
    ctrl_path = path_of_data('test_transform_test_label_encode_ctrl.csv')

    p = Pipeline()

    source = p.add(CSVRead(source_path))
    encoder = p.add(LabelEncode())
    sink = p.add(CSVWrite(self._tmp_files('out.csv')))

    source['output'] > encoder['input']
    encoder['output'] > sink['input']

    self.run_pipeline(p)

    written = self._tmp_files.csv_read('out.csv')
    expected = csv_read(ctrl_path)

    self.assertTrue(np.array_equal(written, expected))
def test_rename_cols(self):
    """Rename two columns with RenameCols and check the written header.

    'name' and 'height' are renamed to 'designation' and 'tallness'; the
    set of column names read back from the output CSV must equal the
    expected set.
    """
    infile_name = path_of_data('mixed_csv.csv')
    rename_dict = {'name': 'designation', 'height': 'tallness'}

    p = Pipeline()

    csv_read_node = p.add(CSVRead(infile_name))
    trans_node = p.add(RenameCols(rename_dict))
    csv_write_node = p.add(CSVWrite(self._tmp_files('out.csv')))

    csv_read_node['output'] > trans_node['input']
    trans_node['output'] > csv_write_node['input']

    self.run_pipeline(p)

    control = {'id', 'designation', 'tallness'}
    result = set(self._tmp_files.csv_read('out.csv').dtype.names)

    # Both sides are plain Python sets, so compare them directly rather
    # than routing them through an array comparison; assertEqual also
    # reports which names differ when the check fails.
    self.assertEqual(result, control)
def test_rw(self):
    """Read a CSV and write it straight back out.

    The file written by the pipeline must equal the input file as loaded
    directly with np.genfromtxt.
    """
    source_path = path_of_data('mixed_csv.csv')

    p = Pipeline()

    reader = p.add(CSVRead(source_path))
    writer = p.add(CSVWrite(self._tmp_files.get('out.csv')))

    reader['output'] > writer['input']

    self.run_pipeline(p)

    expected = np.genfromtxt(
        source_path,
        dtype=None,
        delimiter=",",
        names=True)
    written = self._tmp_files.csv_read('out.csv')

    self.assertTrue(np.array_equal(written, expected))
def test_cross_validation_score(self):
    """Compare CrossValidationScore with sklearn for two partitioners.

    For SKKFold and StratifiedKFold in turn, an SVC is scored through the
    pipeline stage and the value written to CSV must be close to the mean
    of sklearn's cross_val_score using the same partition iterator.
    """
    n_rows = 100
    n_folds = 10

    # Draw X before y so the random stream is consumed in a fixed order.
    X = np.random.random((n_rows, 10))
    y = np.random.randint(0, 2, n_rows)

    # Each trial: (partition iterator class,
    #              kwargs handed to the pipeline stage,
    #              kwargs used to build the sklearn control iterator)
    trials = (
        (SKKFold,
         {'random_state': 0, 'n_folds': n_folds},
         {'n': n_rows, 'n_folds': n_folds, 'random_state': 0}),
        (StratifiedKFold,
         {'random_state': 0, 'n_folds': n_folds},
         {'y': y, 'n_folds': n_folds, 'random_state': 0}),
    )

    for PartIter, stage_kwargs, control_kwargs in trials:
        p = Pipeline()

        x_source = p.add(NumpyRead(X))
        y_source = p.add(NumpyRead(y))

        scorer = p.add(CrossValidationScore(
            wrap(SVC), {}, 'score', wrap(PartIter), stage_kwargs))
        x_source['output'] > scorer['X_train']
        y_source['output'] > scorer['y_train']

        score_sink = p.add(CSVWrite(self._tmp_files('out.csv')))
        scorer['score'] > score_sink['input']

        self.run_pipeline(p)

        written = self._tmp_files.csv_read('out.csv')
        result = written['f0']

        control_iter = PartIter(**control_kwargs)
        per_fold_scores = cross_val_score(SVC(), X, y, cv=control_iter)
        ctrl = np.mean(per_fold_scores)

        self.assertTrue(np.allclose(ctrl, result))
def test_kfold(self):
    """Check every train/test output of the KFold stage against SKKFold.

    For each fold, each input array and each of 'train'/'test', the
    matching stage output is written to its own CSV. After the run, every
    file must equal the slice selected by the control SKKFold indices.
    """
    n_folds = 3
    n_rows = 6

    # Draw X before y so the random stream is consumed in a fixed order.
    X = np.random.randint(0, 1000, (n_rows, 3))
    y = np.random.randint(0, 1000, (n_rows, 1))

    p = Pipeline()

    x_source = p.add(NumpyRead(X))
    y_source = p.add(NumpyRead(y))

    kfold = p.add(KFold(2, n_folds, random_state=0))
    x_source['output'] > kfold['input0']
    y_source['output'] > kfold['input1']

    control_kf = SKKFold(n_rows, n_folds=n_folds, random_state=0)

    # Collected (file name, expected contents) pairs, checked after the
    # pipeline has run.
    expectations = []
    inputs = (X, y)
    for fold_idx, index_pair in enumerate(control_kf):
        for input_idx, input_array in enumerate(inputs):
            for side_idx, side in enumerate(('train', 'test')):
                out_key = '{}{}_{}'.format(side, input_idx, fold_idx)
                out_file = out_key + '.csv'

                writer = p.add(CSVWrite(self._tmp_files(out_file)))
                kfold[out_key] > writer['input']

                row_inds = index_pair[side_idx]
                expected = np_nd_to_sa(input_array[row_inds])
                expectations.append((out_file, expected))

    self.run_pipeline(p)

    for out_file, expected in expectations:
        written = self._tmp_files.csv_read(out_file)
        self.assertTrue(np.array_equal(written, expected))
def test_tutorial(self):
    """
    Verifies we can do what sklearn does here:
    http://scikit-learn.org/stable/tutorial/basic/tutorial.html

    Also round-trips the classifier stage through pickle and checks that
    its parameters are unchanged.
    """
    digits = datasets.load_digits()
    features = digits.data
    # for now, we need a column vector rather than an array
    labels = digits.target

    p = Pipeline()

    # load data from a numpy dataset
    stage_data = NumpyRead(features)
    stage_target = NumpyRead(labels)

    # train/test split
    stage_split_data = SplitTrainTest(2, test_size=1, random_state=0)

    # build a classifier
    stage_clf = wrap_and_make_instance(SVC, gamma=0.001, C=100.)

    # output to a csv
    stage_csv = CSVWrite(self._tmp_files('out.csv'))

    # Register the stages in order; one node comes back per stage.
    stages = [stage_data, stage_target, stage_split_data, stage_clf,
              stage_csv]
    node_data, node_target, node_split, node_clf, node_csv = [
        p.add(stage) for stage in stages]

    # connect the pipeline stages together
    node_data['output'] > node_split['input0']
    node_target['output'] > node_split['input1']
    node_split['train0'] > node_clf['X_train']
    node_split['train1'] > node_clf['y_train']
    node_split['test0'] > node_clf['X_test']
    node_clf['y_pred'] > node_csv['input']

    self.run_pipeline(p)

    result = self._tmp_files.csv_read('out.csv', True)

    # making sure we get the same result as sklearn
    sk_clf = SVC(gamma=0.001, C=100.)
    # The tutorial just splits using array slicing, but we need to make
    # sure that both UPSG and sklearn are splitting the same way, so we
    # do something more sophisticated
    train_X, test_X, train_y, test_y = train_test_split(
        features, labels, test_size=1, random_state=0)
    sk_clf.fit(train_X, np.ravel(train_y))
    predictions = sk_clf.predict(test_X)
    control = predictions[0]

    self.assertAlmostEqual(result, control)

    # model persistence: the bytes unpickled here were produced by this
    # test a line earlier
    pickled = pickle.dumps(stage_clf)
    restored_clf = pickle.loads(pickled)
    self.assertEqual(stage_clf.get_params(), restored_clf.get_params())
def __simple_pipeline(self, sk_cls, sk_method_name, upsg_out_key,
                      init_kwargs=None, in_data=None):
    """Run a wrapped estimator in a pipeline and compare with sklearn.

    Parameters
    ----------
    sk_cls : class
        Estimator class. It is instantiated with ``**init_kwargs`` for
        the control and passed to ``wrap_and_make_instance`` for the
        pipeline stage.
    sk_method_name : str
        ``'predict'`` routes the data through a train/test split and
        compares against ``fit`` + ``predict``. Any other value feeds the
        whole data set in and compares against ``fit_transform``.
    upsg_out_key : str
        Output key of the wrapped stage that is written to CSV.
    init_kwargs : dict or None
        Constructor keyword arguments. The dict is copied, so neither the
        caller's dict nor a shared default is modified when a
        ``random_state`` is injected below.
    in_data : object
        Forwarded unchanged to ``self.__process_in_data``, which supplies
        ``(X_in, y_in)``.
    """
    # Work on a private copy: 'random_state' may be added below, and that
    # must not leak into later calls or into the caller's dict.
    init_kwargs = dict(init_kwargs) if init_kwargs else {}

    X_in, y_in = self.__process_in_data(in_data)

    ctrl_sk_inst = sk_cls(**init_kwargs)
    est_params = ctrl_sk_inst.get_params()
    if 'random_state' in est_params and est_params['random_state'] is None:
        # This has to be fixed. Set a state and try again
        init_kwargs['random_state'] = 0
        ctrl_sk_inst = sk_cls(**init_kwargs)

    p = Pipeline()

    sk_stage = p.add(wrap_and_make_instance(sk_cls, **init_kwargs))

    X_in_stage = p.add(NumpyRead(X_in))
    y_in_stage = p.add(NumpyRead(y_in))

    if sk_method_name == 'predict':
        train_test = p.add(SplitTrainTest(2, random_state=0))
        X_in_stage['output'] > train_test['input0']
        y_in_stage['output'] > train_test['input1']

        # Only connect the inputs that the wrapped stage declares.
        input_keys = sk_stage.get_stage().input_keys
        if 'X_train' in input_keys:
            train_test['train0'] > sk_stage['X_train']
        if 'X_test' in input_keys:
            train_test['test0'] > sk_stage['X_test']
        if 'y_train' in input_keys:
            train_test['train1'] > sk_stage['y_train']
    else:
        X_in_stage['output'] > sk_stage['X_train']
        y_in_stage['output'] > sk_stage['y_train']

    csv_out = p.add(CSVWrite(self._tmp_files.get('out.csv')))
    sk_stage[upsg_out_key] > csv_out['input']

    self.run_pipeline(p)

    if sk_method_name == 'predict':
        ctrl_X_train, ctrl_X_test, ctrl_y_train, ctrl_y_test = (
            train_test_split(X_in, y_in, random_state=0))
        ctrl_sk_inst.fit(ctrl_X_train, ctrl_y_train)
        control = ctrl_sk_inst.predict(ctrl_X_test)
    else:
        control = ctrl_sk_inst.fit_transform(X_in, y_in)

    result = self._tmp_files.csv_read('out.csv', as_nd=True)
    # Promote a 1-d result to a single column so that its shape can be
    # compared with a control of a different rank.
    if result.ndim != control.ndim and result.ndim == 1:
        result = result.reshape(result.size, 1)

    self.assertTrue(result.shape == control.shape and
                    np.allclose(result, control))
def __metric_pipeline(self, metric, params=None, in_data=None):
    """Run a wrapped metric in a pipeline and compare with a direct call.

    Parameters
    ----------
    metric : callable
        Metric function. It is wrapped with ``wrap_and_make_instance``
        for the pipeline and called directly with keyword arguments for
        the control.
    params : dict or None
        Keyword arguments forwarded to ``wrap_and_make_instance`` when
        building the metric stage.
    in_data : object
        Forwarded unchanged to ``self.__process_in_data``, which supplies
        ``(X_in, y_in)``.

    Each output key of the metric stage is written to its own CSV and
    compared (shape and values) with the matching control return value.
    """
    X_in, y_in = self.__process_in_data(in_data)

    metric_stage = wrap_and_make_instance(metric, **(params or {}))
    in_keys = metric_stage.input_keys
    out_keys = metric_stage.output_keys

    p = Pipeline()

    node_X_in = p.add(NumpyRead(X_in))
    node_y_in = p.add(NumpyRead(y_in))

    node_split = p.add(SplitTrainTest(2, random_state=0))
    node_X_in['output'] > node_split['input0']
    node_y_in['output'] > node_split['input1']

    ctrl_X_train, ctrl_X_test, ctrl_y_train, ctrl_y_test = (
        train_test_split(X_in, y_in, random_state=0))

    node_clf = p.add(wrap_and_make_instance(SVC, random_state=0))
    node_split['train0'] > node_clf['X_train']
    node_split['train1'] > node_clf['y_train']
    node_split['test0'] > node_clf['X_test']

    ctrl_clf = SVC(random_state=0, probability=True)
    ctrl_clf.fit(ctrl_X_train, ctrl_y_train)

    node_proba_1 = p.add(SplitY(1))
    node_clf['pred_proba'] > node_proba_1['input']

    # Column 1 of the control's predict_proba output is used as the score.
    ctrl_y_score = ctrl_clf.predict_proba(ctrl_X_test)[:, 1]

    node_metric = p.add(metric_stage)

    # Connect only the inputs this metric declares and collect the
    # matching keyword arguments for the control call.
    ctrl_metric_args = {}
    if 'y_true' in in_keys:
        node_split['test1'] > node_metric['y_true']
        ctrl_metric_args['y_true'] = ctrl_y_test
    if 'y_score' in in_keys:
        node_proba_1['y'] > node_metric['y_score']
        ctrl_metric_args['y_score'] = ctrl_y_score
    if 'probas_pred' in in_keys:
        node_proba_1['y'] > node_metric['probas_pred']
        ctrl_metric_args['probas_pred'] = ctrl_y_score

    out_nodes = [
        p.add(CSVWrite(self._tmp_files('out_{}.csv'.format(out_key))))
        for out_key in out_keys]
    # Plain loop: the connections are made for their side effect only.
    for out_key, out_node in zip(out_keys, out_nodes):
        node_metric[out_key] > out_node['input']

    self.run_pipeline(p)

    ctrl_returns = metric(**ctrl_metric_args)
    if len(out_keys) == 1:
        # A single-output metric returns a bare value; wrap it so that it
        # can be indexed like the multi-output case.
        ctrl_returns = (ctrl_returns, )

    for i, out_key in enumerate(out_keys):
        control = ctrl_returns[i]
        result = self._tmp_files.csv_read(
            'out_{}.csv'.format(out_key),
            as_nd=True)
        self.assertTrue(result.shape == control.shape and
                        np.allclose(result, control))