Ejemplo n.º 1
0
    def test_errors(self):
        """Check that invalid CatTransform usage is rejected.

        Constructing a transform with ``norm_mode='whatever'`` or with
        ``norm_mode='min_max'`` is expected to raise ``ValueError``.  Calling
        ``pour`` on a transform for which ``calc_global_values`` has not been
        called in this test is expected to raise ``AssertionError``.
        """
        self.assertRaises(
            ValueError, n.CatTransform, norm_mode='whatever', name='cat')
        self.assertRaises(
            ValueError, n.CatTransform, norm_mode='min_max', name='cat')

        unfitted = n.CatTransform(name='cat', norm_mode='mean_std')
        self.assertRaises(AssertionError, unfitted.pour, np.array([1]))
Ejemplo n.º 2
0
    def test_df_iter(self):
        """Fit a CatTransform from an iterator of DataFrames and round-trip it.

        The first two columns of ``self.array`` are cut into two two-row
        chunks, each wrapped in a DataFrame, and handed to
        ``calc_global_values`` through ``data_iter``.  The ``pour_pump``
        helper is then run against the expected one-hot, index and
        missing-value arrays twice: first with the fitted transform, then
        with the copy returned by ``write_read``.
        """
        trans = n.CatTransform(
            name='cat',
            index_to_cat_val=np.unique(self.array[:, 0:2].astype(str)),
            input_dtype=np.dtype('S'),
        )
        array_iter = [
            self.array[0:2, 0:2].astype(str),
            self.array[2:4, 0:2].astype(str),
        ]
        df_iter = [
            pd.DataFrame(data=chunk, index=[0, 1], columns=['a', 'b'])
            for chunk in array_iter
        ]

        trans.calc_global_values(data_iter=df_iter)
        target = np.array([
            [[0.0, 1.0, 0.0, 0.0], [0.0, 0.0, 1.0, 0.0]],
            [[0.0, 0.0, 1.0, 0.0], [1.0, 0.0, 0.0, 0.0]],
            [[0.0, 0.0, 0.0, 1.0], [0.0, 0.0, 1.0, 0.0]],
            [[0.0, 1.0, 0.0, 0.0], [0.0, 0.0, 0.0, 1.0]],
        ])
        indices = np.array([[1, 2], [2, 0], [3, 2], [1, 3]])
        for _ in range(2):
            self.pour_pump(
                trans,
                self.array[:, 0:2].astype(str),
                {
                    'cat/missing_vals': np.array(
                        [['', ''], ['', ''], ['', ''], ['', '']],
                        dtype='|S4'),
                    'cat/one_hots': target,
                    'cat/indices': indices,
                },
            )
            # The second pass exercises the saved-and-reloaded transform.
            trans = self.write_read(trans, self.temp_dir)
Ejemplo n.º 3
0
    def test_two_cols(self):
        """Fit a CatTransform on two string columns and round-trip it.

        The first two columns of ``self.array`` are fed to
        ``calc_global_values`` as two two-row chunks through ``data_iter``,
        with the category list taken from ``np.unique`` over both columns.
        The ``pour_pump`` helper is then run against the expected one-hot,
        index and missing-value arrays twice: first with the fitted
        transform, then with the copy returned by ``write_read``.
        """
        trans = n.CatTransform(
            name='cat',
            index_to_cat_val=np.unique(self.array[:, 0:2].astype(str)),
        )
        array_iter = [
            self.array[0:2, 0:2].astype(str),
            self.array[2:4, 0:2].astype(str),
        ]

        trans.calc_global_values(data_iter=array_iter)
        target = np.array([
            [[0.0, 1.0, 0.0, 0.0], [0.0, 0.0, 1.0, 0.0]],
            [[0.0, 0.0, 1.0, 0.0], [1.0, 0.0, 0.0, 0.0]],
            [[0.0, 0.0, 0.0, 1.0], [0.0, 0.0, 1.0, 0.0]],
            [[0.0, 1.0, 0.0, 0.0], [0.0, 0.0, 0.0, 1.0]],
        ])
        indices = np.array([[1, 2], [2, 0], [3, 2], [1, 3]])
        for _ in range(2):
            self.pour_pump(
                trans,
                self.array[:, 0:2].astype(str),
                {
                    'cat/missing_vals': np.array(
                        [['', ''], ['', ''], ['', ''], ['', '']],
                        dtype='|S4'),
                    'cat/one_hots': target,
                    'cat/indices': indices,
                },
            )
            # The second pass exercises the saved-and-reloaded transform.
            trans = self.write_read(trans, self.temp_dir)
Ejemplo n.º 4
0
    def test_mean_std(self):
        """Check mean/std-normalised one-hots for a single string column.

        Column 1 of ``self.array`` is fed to ``calc_global_values`` as two
        two-row chunks, with the category list given explicitly as the sorted
        unique values of that column.  The expected one-hots are the raw
        one-hot rows standardised with the transform's own ``mean`` and
        ``std``.  The ``pour_pump`` helper is run twice: first with the
        fitted transform, then with the copy returned by ``write_read``.
        """
        trans = n.CatTransform(
            name='cat',
            norm_mode='mean_std',
            index_to_cat_val=sorted(
                np.unique(self.array[:, 1:2].astype(str))),
        )

        array_iter = [
            self.array[0:2, 1:2].astype(str),
            self.array[2:4, 1:2].astype(str),
        ]

        trans.calc_global_values(data_iter=array_iter)
        target = np.array([
            [[0., 1, 0]],
            [[1, 0, 0]],
            [[0, 1, 0]],
            [[0, 0, 1]],
        ]).astype(float)
        target = (target - trans.mean) / trans.std
        indices = np.array([[1], [0], [1], [2]])
        for _ in range(2):
            self.pour_pump(
                trans,
                self.array[:, 1:2].astype(str),
                {
                    'cat/missing_vals': np.array(
                        [[''], [''], [''], ['']], dtype='|S4'),
                    'cat/one_hots': target,
                    'cat/indices': indices,
                },
            )
            # The second pass exercises the saved-and-reloaded transform.
            trans = self.write_read(trans, self.temp_dir)
Ejemplo n.º 5
0
    def test_read_write(self):
        """Run ``write_read_example`` on each of the first three columns.

        Columns 0 and 1 of ``self.array`` are cast to ``str`` and column 2 to
        ``float``.  For every column a fresh mean/std-normalised CatTransform
        is fitted on that single column and passed, together with the column
        and ``self.temp_dir``, to the ``write_read_example`` helper.
        """
        for col in range(3):
            col_type = str if col in (0, 1) else float
            array = self.array[:, col:col + 1].astype(col_type)
            trans = n.CatTransform(name='cat', norm_mode='mean_std')

            trans.calc_global_values(array)
            self.write_read_example(trans, array, self.temp_dir)
Ejemplo n.º 6
0
 def test_index_to_cat_val(self):
     """Check behaviour when the category list is restricted to 'a' and 'b'.

     Column 0 of ``self.array`` is fitted with an explicit two-entry
     ``index_to_cat_val``.  For the third row the expected output is an
     all-zero one-hot vector, index ``-1`` and the value ``'c'`` reported in
     ``missing_vals``.  The ``pour_pump`` helper is run twice: first with the
     fitted transform, then with the copy returned by ``write_read``.
     """
     trans = n.CatTransform(name='cat', index_to_cat_val=['a', 'b'])
     trans.calc_global_values(self.array[:, 0:1].astype(str))
     target = np.array([
         [[1.0, 0.0]],
         [[0.0, 1.0]],
         [[0.0, 0.0]],
         [[1.0, 0.0]],
     ]).astype(float)
     indices = np.array([[0], [1], [-1], [0]])
     for _ in range(2):
         self.pour_pump(
             trans,
             self.array[:, 0:1].astype(str),
             {
                 'cat/missing_vals': [[''], [''], ['c'], ['']],
                 'cat/one_hots': target,
                 'cat/indices': indices,
             },
         )
         # The second pass exercises the saved-and-reloaded transform.
         trans = self.write_read(trans, self.temp_dir)
Ejemplo n.º 7
0
 def test_no_norm(self):
     """Check plain one-hot output when no ``norm_mode`` is given.

     Column 0 of ``self.array`` is fitted with a CatTransform that only
     receives a name.  The expected output has three categories, no missing
     values, and unnormalised 0/1 one-hot rows.  The ``pour_pump`` helper is
     run twice: first with the fitted transform, then with the copy returned
     by ``write_read``.
     """
     trans = n.CatTransform(name='cat')
     trans.calc_global_values(self.array[:, 0:1].astype(str))
     target = np.array([
         [[1., 0., 0.]],
         [[0., 1., 0.]],
         [[0., 0., 1.]],
         [[1., 0., 0.]],
     ])
     indices = np.array([[0], [1], [2], [0]])
     for _ in range(2):
         self.pour_pump(
             trans,
             self.array[:, 0:1].astype(str),
             {
                 'cat/missing_vals': [[''], [''], [''], ['']],
                 'cat/one_hots': target,
                 'cat/indices': indices,
             },
         )
         # The second pass exercises the saved-and-reloaded transform.
         trans = self.write_read(trans, self.temp_dir)
Ejemplo n.º 8
0
 def test_null(self):
     """Check CatTransform on a float64 column.

     Column 2 of ``self.array`` is cast to ``float64`` and fitted with a
     CatTransform that only receives a name.  The expected output has three
     categories and an all-zero float ``missing_vals`` array.  The
     ``pour_pump`` helper is run twice: first with the fitted transform,
     then with the copy returned by ``write_read``.

     NOTE(review): judging by the test name this column presumably holds a
     null/NaN entry that is encoded as its own category -- confirm against
     the fixture that builds ``self.array``.
     """
     trans = n.CatTransform(name='cat')
     trans.calc_global_values(self.array[:, 2:3].astype(np.float64))
     target = np.array([
         [[0, 1, 0]],
         [[0, 0, 1]],
         [[1, 0, 0]],
         [[0, 1, 0]],
     ]).astype(float)
     indices = np.array([[1], [2], [0], [1]])
     for _ in range(2):
         self.pour_pump(
             trans,
             self.array[:, 2:3].astype(np.float64),
             {
                 'cat/missing_vals': np.array(
                     [[0.0], [0.0], [0.0], [0.0]], dtype=float),
                 'cat/one_hots': target,
                 'cat/indices': indices,
             },
         )
         # The second pass exercises the saved-and-reloaded transform.
         trans = self.write_read(trans, self.temp_dir)
Ejemplo n.º 9
0
 def test_mean_std(self):
     """Check mean/std-normalised one-hots for a single string column.

     Column 1 of ``self.array`` is fitted with ``norm_mode='mean_std'``.
     The expected one-hots are the raw one-hot rows standardised with the
     transform's own ``mean`` and ``std``.  The ``pour_pump`` helper is run
     twice: first with the fitted transform, then with the copy returned by
     ``write_read``.
     """
     trans = n.CatTransform(name='cat', norm_mode='mean_std')
     trans.calc_global_values(self.array[:, 1:2].astype(str))
     target = np.array([
         [[0., 1, 0]],
         [[1, 0, 0]],
         [[0, 1, 0]],
         [[0, 0, 1]],
     ]).astype(float)
     target = (target - trans.mean) / trans.std
     indices = np.array([[1], [0], [1], [2]])
     for _ in range(2):
         self.pour_pump(
             trans,
             self.array[:, 1:2].astype(str),
             {
                 'cat/missing_vals': np.array(
                     [[''], [''], [''], ['']], dtype='|S4'),
                 'cat/one_hots': target,
                 'cat/indices': indices,
             },
         )
         # The second pass exercises the saved-and-reloaded transform.
         trans = self.write_read(trans, self.temp_dir)
Ejemplo n.º 10
0
  def test_array(self):
    """Round-trip a DatasetTransform that combines four column transforms.

    Columns of the array returned by ``self._get_array()`` are routed to a
    CatTransform (column 0), a DateTimeTransform (columns 1-3), a
    NumTransform (columns 4-6) and a StringTransform (columns 7-8).  After
    ``calc_global_values`` the ``pour_pump`` helper is run against the
    expected outputs twice: first with the fitted transform, then with the
    copy returned by ``write_read``.
    """
    max_sent_len = 20

    def pad(tokens):
      # Right-pad a token-index row with -1 up to max_sent_len, the filler
      # used in the expected 'DT/STRING/indices' rows.
      return tokens + [-1] * (max_sent_len - len(tokens))

    array = self._get_array()

    dataset_transform = tr.DatasetTransform(name='DT')
    dataset_transform.add_transform(
      cols=[0],
      transform=ct.CatTransform(
        name='CAT',
        norm_mode='mean_std',
        index_to_cat_val=sorted(np.unique(array[:, 0: 1])),
        input_dtype=np.dtype('U')
      )
    )
    dataset_transform.add_transform(
      cols=[1, 2, 3],
      transform=dt.DateTimeTransform(
        name='DATE',
        norm_mode='min_max',
        fill_nat_func=lambda a: np.array(datetime.datetime(1950, 1, 1)),
        input_dtype=np.datetime64,
      )
    )
    dataset_transform.add_transform(
      cols=[4, 5, 6],
      transform=nt.NumTransform(
        name='NUM',
        norm_mode='min_max',
        input_dtype=np.float64,
        fill_nan_func=lambda a: np.array(0),
      )
    )
    dataset_transform.add_transform(
      cols=[7, 8],
      transform=st.StringTransform(
        name='STRING',
        word_tokenizer=en_tokenizer,
        index_to_word=(
          ['__UNK__'] + self._get_index_to_word(array[:, 7:9], en_tokenizer)
        ),
        input_dtype=np.dtype('U'),
        max_sent_len=max_sent_len
      )
    )
    dataset_transform.calc_global_values(data=array)

    for _ in range(2):
      self.pour_pump(
        dataset_transform,
        array,
        {
          'DT/CAT/indices': [[0], [1], [2], [0]],
          'DT/CAT/missing_vals': np.array([[u''], [u''], [u''], [u'']]),
          'DT/CAT/one_hots': [
            [[1.0, -0.5773502691896258, -0.5773502691896258]],
            [[-1.0, 1.7320508075688774, -0.5773502691896258]],
            [[-1.0, -0.5773502691896258, 1.7320508075688774]],
            [[1.0, -0.5773502691896258, -0.5773502691896258]],
          ],
          'DT/DATE/nats': [
            [False, False, True],
            [False, True, True],
            [False, False, True],
            [False, False, True],
          ],
          # Every expected time difference is zero.
          'DT/DATE/diff': np.zeros((4, 3), dtype='timedelta64[us]'),
          'DT/DATE/nums': np.array([
            [0.9976643838327857, 0.9976643838327857, 0.0],
            [0.9977039705474843, 0.0, 0.0],
            [0.9977435572621828, 0.9988915719884407, 0.0],
            [0.9976643838327857, 1.0, 0.0],
          ]),
          'DT/NUM/nans': [
            [False, False, True],
            [False, True, True],
            [False, False, True],
            [False, False, True],
          ],
          'DT/NUM/nums': [
            [0.09090909090909091, 0.18181818181818182, 0.0],
            [0.36363636363636365, 0.0, 0.0],
            [0.6363636363636364, 0.7272727272727273, 0.0],
            [0.9090909090909091, 1.0, 0.0],
          ],
          'DT/STRING/indices': [
            [
              pad([9, 29, 50, 30, 29, 3]),
              pad([6, 38, 2, 23, 49, 3]),
            ],
            [
              pad([7, 16, 43, 28, 49, 3]),
              pad([5, 26, 53, 31, 22, 50, 8, 46, 42, 15, 3]),
            ],
            [
              pad([12, 41, 29, 34, 54, 2, 30, 1, 18, 3, 10, 3]),
              pad([13, 21, 45, 39, 27, 14, 20, 3]),
            ],
            [
              pad([4, 32, 33, 14, 48, 44, 31, 51, 47, 43, 52, 17, 3]),
              pad([11, 1, 24, 19, 36, 43, 40, 35, 25, 37, 3]),
            ],
          ],
          # No token is expected to be reported as missing.
          'DT/STRING/missing_vals': np.full(
            (4, 2, max_sent_len), u'', dtype='|U10'),
          'DT/STRING/tokenize_diff': np.array([
            [
              '[["d", 16, 17, ""], ["d", 18, 32, ""]]',
              '[["d", 8, 9, ""], ["d", 19, 20, ""], ["d", 21, 35, ""]]',
            ],
            [
              '[["d", 24, 25, ""], ["d", 26, 40, ""]]',
              '[["d", 58, 59, ""], ["d", 60, 69, ""]]',
            ],
            [
              '[["d", 21, 22, ""], ["d", 26, 27, ""], ["d", 37, 38, ""], '
              '["d", 42, 43, ""], ["d", 44, 52, ""]]',
              '[["d", 35, 36, ""], ["d", 37, 49, ""]]',
            ],
            [
              '[["d", 2, 3, ""], ["d", 57, 58, ""], ["d", 59, 66, ""]]',
              '[["d", 3, 4, ""], ["d", 45, 46, ""], ["d", 47, 56, ""]]',
            ],
          ], dtype='|U95'),
        }
      )
      # The second pass exercises the saved-and-reloaded transform.
      dataset_transform = self.write_read(dataset_transform, self.temp_dir)