Exemple #1
0
    def test_query_complex(self):

        p = Pipeline()

        csv_in = p.add(CSVRead(path_of_data('query.csv')))
        q1_node = p.add(
            Query("((id == value) and not (use_this_col == 'no'))"
                  "or name == 'fish'"))
        csv_out = p.add(CSVWrite(self._tmp_files('out.csv')))
        csv_comp = p.add(CSVWrite(self._tmp_files('out_comp.csv')))

        csv_in['output'] > q1_node['input']
        q1_node['output'] > csv_out['input']
        q1_node['complement'] > csv_comp['input']

        self.run_pipeline(p)

        result = self._tmp_files.csv_read('out.csv')
        ctrl = csv_read(path_of_data('query_ctrl.csv'))

        self.assertTrue(np.array_equal(result, ctrl))

        result = self._tmp_files.csv_read('out_comp.csv')
        ctrl = csv_read(path_of_data('query_ctrl_comp.csv'))

        self.assertTrue(np.array_equal(result, ctrl))
Exemple #2
0
    def test_3_stage(self):
        from sklearn.preprocessing import Imputer

        infile_name = path_of_data('missing_vals.csv')

        p = Pipeline()

        csv_read_node = p.add(CSVRead(infile_name))
        csv_write_node = p.add(CSVWrite(self._tmp_files.get('out.csv')))
        impute_node = p.add(wrap_and_make_instance(Imputer))

        csv_read_node['output'] > impute_node['X_train']
        impute_node['X_new'] > csv_write_node['input']

        self.run_pipeline(p)

        ctrl_imputer = Imputer()
        ctrl_X_sa = np.genfromtxt(infile_name,
                                  dtype=None,
                                  delimiter=",",
                                  names=True)
        num_type = ctrl_X_sa[0][0].dtype
        ctrl_X_nd, ctrl_X_sa_type = np_sa_to_nd(ctrl_X_sa)
        ctrl_X_new_nd = ctrl_imputer.fit_transform(ctrl_X_nd)
        control = ctrl_X_new_nd

        result = self._tmp_files.csv_read('out.csv', True)

        self.assertTrue(np.allclose(result, control))
Exemple #3
0
    def test_fill_na(self):

        p = Pipeline()

        csv_in = p.add(CSVRead(path_of_data('missing_vals_mixed.csv')))
        fill_na = p.add(FillNA(-1))
        csv_out = p.add(CSVWrite(self._tmp_files('out.csv')))

        csv_in['output'] > fill_na['input']
        fill_na['output'] > csv_out['input']

        self.run_pipeline(p)

        result = self._tmp_files.csv_read('out.csv')
        ctrl = csv_read(path_of_data('test_transform_test_fill_na_ctrl.csv'))

        self.assertTrue(np.array_equal(result, ctrl))
Exemple #4
0
    def test_label_encode(self):

        p = Pipeline()

        csv_in = p.add(CSVRead(path_of_data('categories.csv')))
        le = p.add(LabelEncode())
        csv_out = p.add(CSVWrite(self._tmp_files('out.csv')))

        csv_in['output'] > le['input']
        le['output'] > csv_out['input']

        self.run_pipeline(p)

        result = self._tmp_files.csv_read('out.csv')
        ctrl = csv_read(
            path_of_data('test_transform_test_label_encode_ctrl.csv'))

        self.assertTrue(np.array_equal(result, ctrl))
Exemple #5
0
    def test_rename_cols(self):
        infile_name = path_of_data('mixed_csv.csv')
        rename_dict = {'name': 'designation', 'height': 'tallness'}

        p = Pipeline()

        csv_read_node = p.add(CSVRead(infile_name))
        trans_node = p.add(RenameCols(rename_dict))
        csv_write_node = p.add(CSVWrite(self._tmp_files('out.csv')))

        csv_read_node['output'] > trans_node['input']
        trans_node['output'] > csv_write_node['input']

        self.run_pipeline(p)

        control = {'id', 'designation', 'tallness'}
        result = set(self._tmp_files.csv_read('out.csv').dtype.names)

        self.assertTrue(np.array_equal(result, control))
Exemple #6
0
    def test_rw(self):
        infile_name = path_of_data('mixed_csv.csv')

        p = Pipeline()

        csv_read_node = p.add(CSVRead(infile_name))
        csv_write_node = p.add(CSVWrite(self._tmp_files.get('out.csv')))

        csv_read_node['output'] > csv_write_node['input']

        self.run_pipeline(p)

        control = np.genfromtxt(infile_name,
                                dtype=None,
                                delimiter=",",
                                names=True)
        result = self._tmp_files.csv_read('out.csv')

        self.assertTrue(np.array_equal(result, control))
Exemple #7
0
    def test_timify(self):
        in_file = path_of_data('with_dates.csv')

        p = Pipeline()

        csv_in = p.add(CSVRead(in_file))

        timify = p.add(Timify())
        csv_in['output'] > timify['input']

        np_out = p.add(NumpyWrite())
        timify['output'] > np_out['input']

        self.run_pipeline(p)
        result = np_out.get_stage().result

        ctrl_raw = csv_read(in_file)
        ctrl_dtype = np.dtype([(name, '<M8[D]') if 'dt' in name else
                               (name, fmt)
                               for name, fmt in ctrl_raw.dtype.descr])
        ctrl_better = csv_read(in_file, dtype=ctrl_dtype)

        self.assertEqual(result.dtype, ctrl_better.dtype)
        self.assertTrue(np.array_equal(result, ctrl_better))
Exemple #8
0
    def test_split_columns(self):

        p = Pipeline()

        csv_in = p.add(CSVRead(path_of_data('numbers.csv')))
        split = p.add(SplitColumns(('F1', 'F3')))
        csv_out_sel = p.add(CSVWrite(self._tmp_files('out_sel.csv')))
        csv_out_rest = p.add(CSVWrite(self._tmp_files('out_rest.csv')))

        csv_in['output'] > split['input']
        split['output'] > csv_out_sel['input']
        split['complement'] > csv_out_rest['input']

        self.run_pipeline(p)

        result = self._tmp_files.csv_read('out_sel.csv')
        ctrl = csv_read(path_of_data('test_split_columns_ctrl_selected.csv'))

        self.assertTrue(np.array_equal(result, ctrl))

        result = self._tmp_files.csv_read('out_rest.csv')
        ctrl = csv_read(path_of_data('test_split_columns_ctrl_rest.csv'))

        self.assertTrue(np.array_equal(result, ctrl))