Ejemplo n.º 1
0
    def test_3_stage(self):
        """Run a 3-node pipeline (CSV read -> Imputer -> CSV write) and
        check its output against sklearn's Imputer applied directly."""
        from sklearn.preprocessing import Imputer

        infile_name = path_of_data('missing_vals.csv')

        p = Pipeline()

        csv_read_node = p.add(CSVRead(infile_name))
        csv_write_node = p.add(CSVWrite(self._tmp_files.get('out.csv')))
        impute_node = p.add(wrap_and_make_instance(Imputer))

        # Wire: CSV -> Imputer -> CSV
        csv_read_node['output'] > impute_node['X_train']
        impute_node['X_new'] > csv_write_node['input']

        self.run_pipeline(p)

        # Control: impute the same data directly with sklearn.
        ctrl_imputer = Imputer()
        ctrl_X_sa = np.genfromtxt(infile_name, dtype=None, delimiter=",",
                                  names=True)
        # np_sa_to_nd returns (ndarray, dtype); only the array is needed here.
        ctrl_X_nd, _ = np_sa_to_nd(ctrl_X_sa)
        control = ctrl_imputer.fit_transform(ctrl_X_nd)

        result = self._tmp_files.csv_read('out.csv', True)

        self.assertTrue(np.allclose(result, control))
Ejemplo n.º 2
0
    def test_3_stage(self):
        """Run a 3-node pipeline (CSV read -> Imputer -> CSV write) and
        check its output against sklearn's Imputer applied directly."""
        from sklearn.preprocessing import Imputer

        infile_name = path_of_data('missing_vals.csv')

        p = Pipeline()

        csv_read_node = p.add(CSVRead(infile_name))
        csv_write_node = p.add(CSVWrite(self._tmp_files.get('out.csv')))
        impute_node = p.add(wrap_and_make_instance(Imputer))

        # Wire: CSV -> Imputer -> CSV
        csv_read_node['output'] > impute_node['X_train']
        impute_node['X_new'] > csv_write_node['input']

        self.run_pipeline(p)

        # Control: impute the same data directly with sklearn.
        ctrl_imputer = Imputer()
        ctrl_X_sa = np.genfromtxt(infile_name,
                                  dtype=None,
                                  delimiter=",",
                                  names=True)
        # np_sa_to_nd returns (ndarray, dtype); only the array is needed here.
        ctrl_X_nd, _ = np_sa_to_nd(ctrl_X_sa)
        control = ctrl_imputer.fit_transform(ctrl_X_nd)

        result = self._tmp_files.csv_read('out.csv', True)

        self.assertTrue(np.allclose(result, control))
Ejemplo n.º 3
0
 def __process_in_data(self, in_data):
     """Normalize *in_data* into an ``(X, y)`` pair.

     ``None`` yields random binary-classification data; a ``*.csv`` path is
     loaded from the data directory with the last column taken as ``y``;
     anything else is assumed to already be an ``(X, y)`` tuple.
     """
     if in_data is None:
         # Synthetic fallback: 100 samples, 10 features, binary labels.
         features = np.random.random((100, 10))
         labels = np.random.randint(0, 2, 100)
         return (features, labels)
     if isinstance(in_data, str) and in_data.split('.')[-1] == 'csv':
         table = np_sa_to_nd(csv_read(path_of_data(in_data)))[0]
         return (table[:, :-1], table[:, -1])
     # assume in_data is a tuple (X, y)
     return (in_data[0], in_data[1])
Ejemplo n.º 4
0
 def __process_in_data(self, in_data):
     """Normalize *in_data* into an ``(X, y)`` pair.

     ``None`` yields random binary-classification data; a ``*.csv`` path is
     loaded from the data directory with the last column taken as ``y``;
     anything else is assumed to already be an ``(X, y)`` tuple.
     """
     if in_data is None:
         # Synthetic fallback: 100 samples, 10 features, binary labels.
         features = np.random.random((100, 10))
         labels = np.random.randint(0, 2, 100)
         return (features, labels)
     if isinstance(in_data, str) and in_data.split(".")[-1] == "csv":
         table = np_sa_to_nd(csv_read(path_of_data(in_data)))[0]
         return (table[:, :-1], table[:, -1])
     # assume in_data is a tuple (X, y)
     return (in_data[0], in_data[1])
Ejemplo n.º 5
0
    def test_lambda(self):
        """LambdaStage: output-key generation and an end-to-end pipeline run."""

        # Test output key generation

        l1 = LambdaStage(lambda x, y: 0)
        self.assertEqual(l1.input_keys, ['x', 'y'])
        self.assertEqual(l1.output_keys, ['output0'])

        l2 = LambdaStage(lambda: 0, n_outputs=3)
        self.assertEqual(l2.input_keys, [])
        self.assertEqual(l2.output_keys,
                         ['output{}'.format(i) for i in xrange(3)])

        # Test running in pipeline

        in_data = np_nd_to_sa(np.random.random((100, 10)))
        scale = np_nd_to_sa(np.array(3))
        out_keys = ['augmented', 'log_col', 'sqrt_col', 'scale_col']

        def log1_sqrt2_scale3(A, scale):
            # Derive three columns from the first three fields of A and
            # return the augmented array plus each column individually.
            names = A.dtype.names
            log_col = np.log(A[names[0]])
            sqrt_col = np.sqrt(A[names[1]])
            scale_col = A[names[2]] * scale[0][0]

            return (append_fields(A, ['log1', 'sqrt2', 'scale3'],
                                  (log_col, sqrt_col, scale_col)), log_col,
                    sqrt_col, scale_col)

        p = Pipeline()

        np_in = p.add(NumpyRead(in_data))
        scale_in = p.add(NumpyRead(scale))

        lambda_stage = p.add(LambdaStage(log1_sqrt2_scale3, out_keys))
        np_in['output'] > lambda_stage['A']
        scale_in['output'] > lambda_stage['scale']

        # One CSVWrite stage per lambda output. The pipeline owns the stage
        # objects, so no local list of them needs to be kept.
        for key in out_keys:
            stage = p.add(CSVWrite(self._tmp_files('out_{}.csv'.format(key))))
            lambda_stage[key] > stage['input']

        self.run_pipeline(p)

        # Compute the expected outputs by calling the function directly.
        controls = log1_sqrt2_scale3(in_data, scale)

        for i, key in enumerate(out_keys):
            control = controls[i]
            if is_sa(control):
                control = np_sa_to_nd(control)[0]
            result = self._tmp_files.csv_read('out_{}.csv'.format(key),
                                              as_nd=True)
            self.assertTrue(np.allclose(control, result))
Ejemplo n.º 6
0
 def test_numpy_write(self):
     """Round-trip an array through NumpyRead -> NumpyWrite."""
     original = np.random.rand(10, 10)
     pipe = Pipeline()
     source = pipe.add(NumpyRead(original))
     sink = pipe.add(NumpyWrite())
     source['output'] > sink['input']
     self.run_pipeline(pipe)
     # NumpyWrite stores a structured array; convert back before comparing.
     written = np_sa_to_nd(sink.get_stage().result)[0]
     self.assertTrue(np.allclose(original, written))
Ejemplo n.º 7
0
 def test_numpy_write(self):
     """Round-trip an array through NumpyRead -> NumpyWrite."""
     expected = np.random.rand(10, 10)
     p = Pipeline()
     reader = p.add(NumpyRead(expected))
     writer = p.add(NumpyWrite())
     reader['output'] > writer['input']
     self.run_pipeline(p)
     # NumpyWrite stores a structured array; convert back before comparing.
     actual = np_sa_to_nd(writer.get_stage().result)[0]
     self.assertTrue(np.allclose(expected, actual))
Ejemplo n.º 8
0
    def test_lambda(self):
        """LambdaStage: output-key generation and an end-to-end pipeline run."""

        # Test output key generation

        l1 = LambdaStage(lambda x, y: 0)
        self.assertEqual(l1.input_keys, ['x', 'y'])
        self.assertEqual(l1.output_keys, ['output0'])

        l2 = LambdaStage(lambda: 0, n_outputs=3)
        self.assertEqual(l2.input_keys, [])
        self.assertEqual(l2.output_keys,
                         ['output{}'.format(i) for i in xrange(3)])

        # Test running in pipeline

        in_data = np_nd_to_sa(np.random.random((100, 10)))
        scale = np_nd_to_sa(np.array(3))
        out_keys = ['augmented', 'log_col', 'sqrt_col', 'scale_col']

        def log1_sqrt2_scale3(A, scale):
            # Derive three columns from the first three fields of A and
            # return the augmented array plus each column individually.
            names = A.dtype.names
            log_col = np.log(A[names[0]])
            sqrt_col = np.sqrt(A[names[1]])
            scale_col = A[names[2]] * scale[0][0]

            return (append_fields(A, ['log1', 'sqrt2', 'scale3'],
                                  (log_col, sqrt_col, scale_col)), log_col,
                    sqrt_col, scale_col)

        p = Pipeline()

        np_in = p.add(NumpyRead(in_data))
        scale_in = p.add(NumpyRead(scale))

        lambda_stage = p.add(LambdaStage(log1_sqrt2_scale3, out_keys))
        np_in['output'] > lambda_stage['A']
        scale_in['output'] > lambda_stage['scale']

        # One CSVWrite stage per lambda output. The pipeline owns the stage
        # objects, so no local list of them needs to be kept.
        for key in out_keys:
            stage = p.add(CSVWrite(self._tmp_files('out_{}.csv'.format(key))))
            lambda_stage[key] > stage['input']

        self.run_pipeline(p)

        # Compute the expected outputs by calling the function directly.
        controls = log1_sqrt2_scale3(in_data, scale)

        for i, key in enumerate(out_keys):
            control = controls[i]
            if is_sa(control):
                control = np_sa_to_nd(control)[0]
            result = self._tmp_files.csv_read('out_{}.csv'.format(key),
                                              as_nd=True)
            self.assertTrue(np.allclose(control, result))
Ejemplo n.º 9
0
def csv_read(filename, as_nd=False, dtype=None):
    """Load a comma-delimited file into a NumPy array.

    The first row of the file supplies the field names. By default a
    structured array is returned; with ``as_nd=True`` it is converted to a
    plain ndarray via ``np_sa_to_nd``. ``dtype`` is forwarded to
    ``np.genfromtxt`` (``None`` lets NumPy infer each column's type).
    """
    structured = np.genfromtxt(filename, dtype=dtype, delimiter=",",
                               names=True)
    if not as_nd:
        return structured
    return np_sa_to_nd(structured)[0]