Esempio n. 1
0
    def test_errors(self):
        """Check the two failure paths exercised for NumTransform.

        Building a transform with the norm_mode 'whatever' is expected to
        raise ValueError, and calling get_waterwork() on a newly built
        'min_max' transform is expected to raise AssertionError.
        """
        # The constructor itself should reject the bogus norm_mode, so the
        # result is never bound to a name.
        with self.assertRaises(ValueError):
            n.NumTransform(name='num', norm_mode='whatever')

        transform = n.NumTransform(norm_mode='min_max', name='num')

        # calc_global_values() is never called on this instance; presumably
        # that is what trips the assertion -- confirm against NumTransform.
        with self.assertRaises(AssertionError):
            transform.get_waterwork()
Esempio n. 2
0
 def test_no_norm(self):
     """Pour/pump the first column through a NumTransform with no norm_mode.

     No norm_mode is passed to the constructor, and the expected
     'num/nums' output is the input column itself with no entry flagged in
     'num/nans'. The check runs twice; the second pass uses the transform
     returned by self.write_read (presumably a save/reload round trip).
     """
     column = self.array[:, 0:1]

     transform = n.NumTransform(name='num')
     transform.calc_global_values(column)

     for _ in xrange(2):
         expected = {
             'num/nums': column,
             'num/nans': [[False], [False], [False], [False]],
         }
         self.pour_pump(transform, column, expected)
         transform = self.write_read(transform, self.temp_dir)
Esempio n. 3
0
    def test_read_write(self):
        """Run write_read_example twice on a fitted 'min_max' NumTransform.

        The transform fills NaNs through a lambda returning np.array(0) and
        has its global values calculated from the whole of self.array before
        being handed to self.write_read_example (with test_type=False).
        What write_read_example verifies is defined outside this method.
        """
        trans = n.NumTransform(
            name='NUM',
            norm_mode='min_max',
            fill_nan_func=lambda a: np.array(0),
        )
        trans.calc_global_values(self.array)

        # Same call twice; the loop index is not needed.
        for _ in xrange(2):
            self.write_read_example(
                trans,
                self.array,
                self.temp_dir,
                test_type=False,
            )
Esempio n. 4
0
    def test_mean_std(self):
        """Pour/pump the first column through a 'mean_std' NumTransform.

        The expected 'num/nums' output is (column - trans.mean) / trans.std,
        using the statistics the transform holds after calc_global_values,
        with no entry flagged in 'num/nans'. The check runs twice; the
        second pass uses the transform returned by self.write_read.
        """
        def fill(array):
            # Constant replacement value handed to the transform for NaNs.
            return np.array(0.)

        column = self.array[:, 0:1]

        transform = n.NumTransform(
            name='num',
            norm_mode='mean_std',
            fill_nan_func=fill,
        )
        transform.calc_global_values(column)

        # Computed once, from the statistics of the originally fitted transform.
        expected_nums = (column - transform.mean) / (transform.std)

        for _ in xrange(2):
            expected = {
                'num/nums': expected_nums,
                'num/nans': [[False], [False], [False], [False]],
            }
            self.pour_pump(transform, column, expected)
            transform = self.write_read(transform, self.temp_dir)
Esempio n. 5
0
    def test_nan(self):
        """Pour/pump a column containing a NaN through a NumTransform.

        The transform is given a fill function that replaces each NaN with
        the NaN-ignoring minimum of its column. The expected 'num/nums'
        output is the second column of self.array with row 1 set to 2, and
        'num/nans' flags row 1 only. The check runs twice; the second pass
        uses the transform returned by self.write_read.
        """
        def fill(array):
            # Column-wise minimum ignoring NaNs, kept as a single row.
            mins = np.expand_dims(np.nanmin(array, axis=0), axis=0)
            # Repeat that row once per input row so it lines up with the
            # NaN mask whatever the number of rows (not just four).
            mins = np.tile(mins, reps=[np.shape(array)[0], 1])
            # One replacement value per NaN position.
            return mins[np.isnan(array)]

        trans = n.NumTransform(name='num', fill_nan_func=fill)
        trans.calc_global_values(self.array[:, 1:2])

        # Work on a copy so the fixture array is not modified.
        target = np.array(self.array[:, 1:2], copy=True)
        # Expected fill value for the NaN row; presumably the column minimum
        # of the fixture -- confirm against setUp.
        target[1, 0] = 2

        for _ in xrange(2):
            self.pour_pump(
                trans, self.array[:, 1:2], {
                    'num/nums': target,
                    'num/nans': [[False], [True], [False], [False]],
                })
            trans = self.write_read(trans, self.temp_dir)
Esempio n. 6
0
    def test_min_max(self):
        """Pour/pump the first two columns through a 'min_max' NumTransform.

        The expected 'num/nums' output is (columns - min) / (max - min),
        using trans.min and trans.max as held after calc_global_values with
        norm_axis=0. Entry [1, 1] is flagged in 'num/nans' and is expected
        to hold the normalized fill value of 0. The check runs twice; the
        second pass uses the transform returned by self.write_read.
        """
        def fill(array):
            # Constant replacement value handed to the transform for NaNs.
            return np.array(0.0)

        columns = self.array[:, 0:2]

        transform = n.NumTransform(
            name='num',
            norm_mode='min_max',
            fill_nan_func=fill,
            norm_axis=0,
        )
        transform.calc_global_values(columns)

        span = transform.max - transform.min
        # The division produces a new array, so the assignment below does not
        # write into self.array.
        expected_nums = (columns - transform.min) / span
        # Filled value 0 after normalization: (0 - min) / (max - min).
        expected_nums[1, 1] = -transform.min[1] / span[1]

        for _ in xrange(2):
            expected = {
                'num/nums': expected_nums,
                'num/nans': [
                    [False, False],
                    [False, True],
                    [False, False],
                    [False, False],
                ],
            }
            self.pour_pump(transform, columns, expected)
            transform = self.write_read(transform, self.temp_dir)
  def test_array(self):
    """Pour/pump a four-part DatasetTransform against fixed expected outputs.

    The dataset transform 'DT' routes column 0 to a CatTransform, columns
    1-3 to a DateTimeTransform, columns 4-6 to a NumTransform and columns
    7-8 to a StringTransform. After calc_global_values on the array from
    self._get_array(), the outputs are compared by self.pour_pump against
    the literal values below. The check runs twice; the second pass uses
    the transform returned by self.write_read.
    """
    data = self._get_array()

    pipeline = tr.DatasetTransform(name='DT')

    # Column 0: categorical values, with the sorted unique entries as categories.
    cat_values = sorted(np.unique(data[:, 0: 1]))
    pipeline.add_transform(
      cols=[0],
      transform=ct.CatTransform(
        name='CAT',
        norm_mode='mean_std',
        index_to_cat_val=cat_values,
        input_dtype=np.dtype('U')
      )
    )

    # Columns 1-3: datetimes; the fill function returns 1950-01-01.
    pipeline.add_transform(
      cols=[1, 2, 3],
      transform=dt.DateTimeTransform(
        name='DATE',
        norm_mode='min_max',
        fill_nat_func=lambda a: np.array(datetime.datetime(1950, 1, 1)),
        input_dtype=np.datetime64,
      )
    )

    # Columns 4-6: floats; the fill function returns 0.
    pipeline.add_transform(
      cols=[4, 5, 6],
      transform=nt.NumTransform(
        name='NUM',
        norm_mode='min_max',
        input_dtype=np.float64,
        fill_nan_func=lambda a: np.array(0),
      )
    )

    # Columns 7-8: strings; '__UNK__' is placed first in the vocabulary.
    vocabulary = ['__UNK__'] + self._get_index_to_word(data[:, 7:9], en_tokenizer)
    pipeline.add_transform(
      cols=[7, 8],
      transform=st.StringTransform(
        name='STRING',
        word_tokenizer=en_tokenizer,
        index_to_word=vocabulary,
        input_dtype=np.dtype('U'),
        max_sent_len=20
      )
    )

    pipeline.calc_global_values(data=data)

    for _ in xrange(2):
      # Built fresh on every pass, exactly as the values are compared.
      expected = {
        'DT/CAT/indices': [[0], [1], [2], [0]],
        'DT/CAT/missing_vals': np.array([[u''], [u''], [u''], [u'']]),
        'DT/CAT/one_hots': [
          [[1.0, -0.5773502691896258, -0.5773502691896258]],
          [[-1.0, 1.7320508075688774, -0.5773502691896258]],
          [[-1.0, -0.5773502691896258, 1.7320508075688774]],
          [[1.0, -0.5773502691896258, -0.5773502691896258]],
        ],
        'DT/DATE/nats': [
          [False, False, True],
          [False, True, True],
          [False, False, True],
          [False, False, True],
        ],
        'DT/DATE/diff': np.array([
          [datetime.timedelta(0), datetime.timedelta(0), datetime.timedelta(0)],
          [datetime.timedelta(0), datetime.timedelta(0), datetime.timedelta(0)],
          [datetime.timedelta(0), datetime.timedelta(0), datetime.timedelta(0)],
          [datetime.timedelta(0), datetime.timedelta(0), datetime.timedelta(0)],
        ], dtype='timedelta64[us]'),
        'DT/DATE/nums': np.array([
          [0.9976643838327857, 0.9976643838327857, 0.0],
          [0.9977039705474843, 0.0, 0.0],
          [0.9977435572621828, 0.9988915719884407, 0.0],
          [0.9976643838327857, 1.0, 0.0],
        ]),
        'DT/NUM/nans': [
          [False, False, True],
          [False, True, True],
          [False, False, True],
          [False, False, True],
        ],
        'DT/NUM/nums': [
          [0.09090909090909091, 0.18181818181818182, 0.0],
          [0.36363636363636365, 0.0, 0.0],
          [0.6363636363636364, 0.7272727272727273, 0.0],
          [0.9090909090909091, 1.0, 0.0],
        ],
        'DT/STRING/indices': [
          [
            [9, 29, 50, 30, 29, 3, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1],
            [6, 38, 2, 23, 49, 3, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1],
          ],
          [
            [7, 16, 43, 28, 49, 3, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1],
            [5, 26, 53, 31, 22, 50, 8, 46, 42, 15, 3, -1, -1, -1, -1, -1, -1, -1, -1, -1],
          ],
          [
            [12, 41, 29, 34, 54, 2, 30, 1, 18, 3, 10, 3, -1, -1, -1, -1, -1, -1, -1, -1],
            [13, 21, 45, 39, 27, 14, 20, 3, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1],
          ],
          [
            [4, 32, 33, 14, 48, 44, 31, 51, 47, 43, 52, 17, 3, -1, -1, -1, -1, -1, -1, -1],
            [11, 1, 24, 19, 36, 43, 40, 35, 25, 37, 3, -1, -1, -1, -1, -1, -1, -1, -1, -1],
          ],
        ],
        'DT/STRING/missing_vals': np.array([
          [
            [u'', u'', u'', u'', u'', u'', u'', u'', u'', u'', u'', u'', u'', u'', u'', u'', u'', u'', u'', u''],
            [u'', u'', u'', u'', u'', u'', u'', u'', u'', u'', u'', u'', u'', u'', u'', u'', u'', u'', u'', u''],
          ],
          [
            [u'', u'', u'', u'', u'', u'', u'', u'', u'', u'', u'', u'', u'', u'', u'', u'', u'', u'', u'', u''],
            [u'', u'', u'', u'', u'', u'', u'', u'', u'', u'', u'', u'', u'', u'', u'', u'', u'', u'', u'', u''],
          ],
          [
            [u'', u'', u'', u'', u'', u'', u'', u'', u'', u'', u'', u'', u'', u'', u'', u'', u'', u'', u'', u''],
            [u'', u'', u'', u'', u'', u'', u'', u'', u'', u'', u'', u'', u'', u'', u'', u'', u'', u'', u'', u''],
          ],
          [
            [u'', u'', u'', u'', u'', u'', u'', u'', u'', u'', u'', u'', u'', u'', u'', u'', u'', u'', u'', u''],
            [u'', u'', u'', u'', u'', u'', u'', u'', u'', u'', u'', u'', u'', u'', u'', u'', u'', u'', u'', u''],
          ],
        ], dtype='|U10'),
        'DT/STRING/tokenize_diff': np.array([
          ['[["d", 16, 17, ""], ["d", 18, 32, ""]]', '[["d", 8, 9, ""], ["d", 19, 20, ""], ["d", 21, 35, ""]]'],
          ['[["d", 24, 25, ""], ["d", 26, 40, ""]]', '[["d", 58, 59, ""], ["d", 60, 69, ""]]'],
          ['[["d", 21, 22, ""], ["d", 26, 27, ""], ["d", 37, 38, ""], ["d", 42, 43, ""], ["d", 44, 52, ""]]', '[["d", 35, 36, ""], ["d", 37, 49, ""]]'],
          ['[["d", 2, 3, ""], ["d", 57, 58, ""], ["d", 59, 66, ""]]', '[["d", 3, 4, ""], ["d", 45, 46, ""], ["d", 47, 56, ""]]'],
        ], dtype='|U95'),
        # Disabled expectations, kept for reference:
        # 'DT/Partition_0/tubes/missing_cols': np.array([1, 2]),
        # 'DT/Partition_0/tubes/missing_array': np.array([['b', 'None', 'b', 'c'], [1.0, 2.0, np.nan, 1.0]], dtype=np.object),
      }
      self.pour_pump(pipeline, data, expected)
      # Disabled check, kept for reference:
      # self.write_read_example(pipeline, data, self.temp_dir)
      pipeline = self.write_read(pipeline, self.temp_dir)