def test_query_complex(self):
    """Run a compound Query and check both of its output branches.

    The Query node's 'output' and 'complement' keys are each written to a
    CSV file and compared against the control files located through
    path_of_data.
    """
    pipeline = Pipeline()
    reader = pipeline.add(CSVRead(path_of_data('query.csv')))
    # Adjacent literals are concatenated verbatim into one query string.
    query_text = ("((id == value) and not (use_this_col == 'no'))"
                  "or name == 'fish'")
    query = pipeline.add(Query(query_text))
    matched_writer = pipeline.add(CSVWrite(self._tmp_files('out.csv')))
    complement_writer = pipeline.add(
        CSVWrite(self._tmp_files('out_comp.csv')))

    reader['output'] > query['input']
    query['output'] > matched_writer['input']
    query['complement'] > complement_writer['input']

    self.run_pipeline(pipeline)

    # (file written by the pipeline, control file it must match)
    comparisons = (
        ('out.csv', 'query_ctrl.csv'),
        ('out_comp.csv', 'query_ctrl_comp.csv'),
    )
    for produced_name, expected_name in comparisons:
        produced = self._tmp_files.csv_read(produced_name)
        expected = csv_read(path_of_data(expected_name))
        self.assertTrue(np.array_equal(produced, expected))
def test_3_stage(self):
    """Chain CSVRead -> wrapped Imputer -> CSVWrite and verify the output.

    The pipeline result is compared (with np.allclose) against the array
    obtained by calling Imputer.fit_transform directly on the same input.
    """
    # Imported locally, as in the original, so sklearn is only required
    # when this test actually runs.
    from sklearn.preprocessing import Imputer

    infile_name = path_of_data('missing_vals.csv')

    p = Pipeline()
    csv_read_node = p.add(CSVRead(infile_name))
    csv_write_node = p.add(CSVWrite(self._tmp_files.get('out.csv')))
    impute_node = p.add(wrap_and_make_instance(Imputer))

    csv_read_node['output'] > impute_node['X_train']
    impute_node['X_new'] > csv_write_node['input']

    self.run_pipeline(p)

    # Control: run the same imputation outside the pipeline.
    ctrl_X_sa = np.genfromtxt(infile_name, dtype=None, delimiter=",",
                              names=True)
    # np_sa_to_nd returns a pair; only the first element is needed here.
    ctrl_X_nd, _ = np_sa_to_nd(ctrl_X_sa)
    control = Imputer().fit_transform(ctrl_X_nd)

    # NOTE(review): the second positional argument to csv_read is passed
    # through unchanged; presumably it requests a plain ndarray -- confirm.
    result = self._tmp_files.csv_read('out.csv', True)
    self.assertTrue(np.allclose(result, control))
def test_fill_na(self):
    """Pass a CSV through FillNA(-1) and compare with the control file."""
    pipeline = Pipeline()
    source = pipeline.add(CSVRead(path_of_data('missing_vals_mixed.csv')))
    filler = pipeline.add(FillNA(-1))
    sink = pipeline.add(CSVWrite(self._tmp_files('out.csv')))

    source['output'] > filler['input']
    filler['output'] > sink['input']

    self.run_pipeline(pipeline)

    produced = self._tmp_files.csv_read('out.csv')
    expected_path = path_of_data('test_transform_test_fill_na_ctrl.csv')
    expected = csv_read(expected_path)
    arrays_match = np.array_equal(produced, expected)
    self.assertTrue(arrays_match)
def test_label_encode(self):
    """Pass a CSV through LabelEncode and compare with the control file."""
    pipeline = Pipeline()
    source = pipeline.add(CSVRead(path_of_data('categories.csv')))
    encoder = pipeline.add(LabelEncode())
    sink = pipeline.add(CSVWrite(self._tmp_files('out.csv')))

    source['output'] > encoder['input']
    encoder['output'] > sink['input']

    self.run_pipeline(pipeline)

    produced = self._tmp_files.csv_read('out.csv')
    expected_path = path_of_data('test_transform_test_label_encode_ctrl.csv')
    expected = csv_read(expected_path)
    arrays_match = np.array_equal(produced, expected)
    self.assertTrue(arrays_match)
def test_rename_cols(self):
    """Rename two columns with RenameCols and check the output field names.

    Only the set of column names in the written CSV is verified; column
    order and row contents are not checked.
    """
    infile_name = path_of_data('mixed_csv.csv')
    rename_dict = {'name': 'designation', 'height': 'tallness'}

    p = Pipeline()
    csv_read_node = p.add(CSVRead(infile_name))
    trans_node = p.add(RenameCols(rename_dict))
    csv_write_node = p.add(CSVWrite(self._tmp_files('out.csv')))

    csv_read_node['output'] > trans_node['input']
    trans_node['output'] > csv_write_node['input']

    self.run_pipeline(p)

    control = {'id', 'designation', 'tallness'}
    result = set(self._tmp_files.csv_read('out.csv').dtype.names)
    # Compare the sets directly: np.array_equal would wrap each set in a
    # 0-d object array, and assertEqual reports the differing members.
    self.assertEqual(result, control)
def test_rw(self):
    """Read a CSV and write it straight back out; contents must match."""
    source_path = path_of_data('mixed_csv.csv')

    pipeline = Pipeline()
    reader = pipeline.add(CSVRead(source_path))
    writer = pipeline.add(CSVWrite(self._tmp_files.get('out.csv')))
    reader['output'] > writer['input']

    self.run_pipeline(pipeline)

    # Control: the input file loaded directly as a structured array.
    expected = np.genfromtxt(
        source_path,
        dtype=None,
        delimiter=",",
        names=True)
    produced = self._tmp_files.csv_read('out.csv')
    arrays_match = np.array_equal(produced, expected)
    self.assertTrue(arrays_match)
def test_timify(self):
    """Run Timify on a CSV and compare dtype and data with a control read.

    The control is the same file re-read with every column whose name
    contains 'dt' forced to the '<M8[D]' dtype.
    """
    source_path = path_of_data('with_dates.csv')

    pipeline = Pipeline()
    reader = pipeline.add(CSVRead(source_path))
    timify_node = pipeline.add(Timify())
    reader['output'] > timify_node['input']
    collector = pipeline.add(NumpyWrite())
    timify_node['output'] > collector['input']

    self.run_pipeline(pipeline)
    produced = collector.get_stage().result

    # Build the control dtype from the raw read, swapping in a datetime
    # format for the 'dt' columns and keeping every other format as read.
    raw = csv_read(source_path)
    fields = []
    for name, fmt in raw.dtype.descr:
        if 'dt' in name:
            fields.append((name, '<M8[D]'))
        else:
            fields.append((name, fmt))
    expected_dtype = np.dtype(fields)
    expected = csv_read(source_path, dtype=expected_dtype)

    self.assertEqual(produced.dtype, expected.dtype)
    self.assertTrue(np.array_equal(produced, expected))
def test_split_columns(self):
    """Split columns 'F1' and 'F3' from the rest and check both outputs.

    The SplitColumns node's 'output' and 'complement' keys are each written
    to a CSV file and compared against their control files.
    """
    pipeline = Pipeline()
    reader = pipeline.add(CSVRead(path_of_data('numbers.csv')))
    splitter = pipeline.add(SplitColumns(('F1', 'F3')))
    selected_writer = pipeline.add(CSVWrite(self._tmp_files('out_sel.csv')))
    rest_writer = pipeline.add(CSVWrite(self._tmp_files('out_rest.csv')))

    reader['output'] > splitter['input']
    splitter['output'] > selected_writer['input']
    splitter['complement'] > rest_writer['input']

    self.run_pipeline(pipeline)

    # (file written by the pipeline, control file it must match)
    comparisons = (
        ('out_sel.csv', 'test_split_columns_ctrl_selected.csv'),
        ('out_rest.csv', 'test_split_columns_ctrl_rest.csv'),
    )
    for produced_name, expected_name in comparisons:
        produced = self._tmp_files.csv_read(produced_name)
        expected = csv_read(path_of_data(expected_name))
        self.assertTrue(np.array_equal(produced, expected))