def test_errors(self):
    """Check that invalid NumTransform usage raises.

    Two failure modes are exercised:

    * constructing with an unrecognised ``norm_mode`` is expected to raise
      ``ValueError``;
    * calling ``get_waterwork`` on a freshly constructed ``min_max``
      transform is expected to raise ``AssertionError`` (presumably because
      ``calc_global_values`` has not been run yet -- confirm against
      NumTransform).
    """
    with self.assertRaises(ValueError):
        n.NumTransform(norm_mode='whatever', name='num')

    min_max_trans = n.NumTransform(name='num', norm_mode='min_max')
    with self.assertRaises(AssertionError):
        min_max_trans.get_waterwork()
def test_no_norm(self):
    """Pour/pump the first column through a NumTransform with no norm_mode.

    The expected 'num/nums' output is the input column itself and no row
    is flagged in 'num/nans'.  The check runs twice; after each pass the
    transform is replaced by the result of ``write_read`` so the second
    pass uses the round-tripped transform.
    """
    trans = n.NumTransform(name='num')
    column = self.array[:, :1]
    trans.calc_global_values(column)

    # With no normalization requested the values are expected unchanged.
    target = self.array[:, :1]

    for _ in xrange(2):
        expected = {
            'num/nums': target,
            'num/nans': [[False], [False], [False], [False]],
        }
        self.pour_pump(trans, column, expected)
        trans = self.write_read(trans, self.temp_dir)
def test_read_write(self):
    """Run the write/read example check on a min_max NumTransform.

    The transform is built with a lambda as ``fill_nan_func``, has its
    global values calculated from the full fixture array, and is then
    passed to ``write_read_example`` twice with ``test_type=False``.  The
    same transform object is used for both passes.
    """
    # NOTE(review): a lambda (rather than a named function) is used as the
    # fill function; presumably this is deliberate to exercise saving and
    # loading of lambdas -- confirm.  The previously defined but never
    # referenced local helper ``fill`` has been removed as dead code.
    trans = n.NumTransform(
        name='NUM',
        norm_mode='min_max',
        fill_nan_func=lambda a: np.array(0),
    )
    trans.calc_global_values(self.array)

    for _ in xrange(2):
        self.write_read_example(
            trans, self.array, self.temp_dir, test_type=False
        )
def test_mean_std(self):
    """Pour/pump the first column through a mean_std NumTransform.

    The expected 'num/nums' output is the column standardised with the
    transform's own ``mean`` and ``std`` attributes, and no row is flagged
    in 'num/nans'.  The check runs twice, swapping in the transform
    returned by ``write_read`` after each pass.
    """
    def fill(array):
        # Constant replacement value handed to the transform for NaNs.
        return np.array(0.)

    trans = n.NumTransform(
        name='num',
        norm_mode='mean_std',
        fill_nan_func=fill,
    )
    column = self.array[:, :1]
    trans.calc_global_values(column)

    # Standardise using the statistics the transform just computed.
    centered = self.array[:, :1] - trans.mean
    target = centered / (trans.std)

    for _ in xrange(2):
        expected = {
            'num/nums': target,
            'num/nans': [[False], [False], [False], [False]],
        }
        self.pour_pump(trans, column, expected)
        trans = self.write_read(trans, self.temp_dir)
def test_nan(self):
    """Pour/pump the second column using a custom NaN fill function.

    The fill function returns, for every NaN position, the NaN-ignoring
    minimum of that position's column.  The expected output is a copy of
    the column with entry [1, 0] set to 2, and row 1 flagged in
    'num/nans'.  The check runs twice, swapping in the transform returned
    by ``write_read`` after each pass.
    """
    def fill(array):
        # Per-column minimum (ignoring NaNs), as a single leading row ...
        column_mins = np.expand_dims(np.nanmin(array, axis=0), axis=0)
        # ... repeated for 4 rows so it can be indexed with the NaN mask.
        tiled_mins = np.tile(column_mins, reps=[4, 1])
        return tiled_mins[np.isnan(array)]

    trans = n.NumTransform(name='num', fill_nan_func=fill)
    column = self.array[:, 1:2]
    trans.calc_global_values(column)

    # Work on a copy so the fixture array itself is not modified.
    target = np.array(self.array[:, 1:2], copy=True)
    target[1, 0] = 2

    for _ in xrange(2):
        expected = {
            'num/nums': target,
            'num/nans': [[False], [True], [False], [False]],
        }
        self.pour_pump(trans, column, expected)
        trans = self.write_read(trans, self.temp_dir)
def test_min_max(self):
    """Pour/pump the first two columns through a min_max NumTransform.

    The transform is built with ``norm_axis=0`` and a fill function that
    returns 0.0.  The expected 'num/nums' output is the input scaled with
    the transform's ``min`` and ``max`` attributes; entry [1, 1], which is
    flagged in 'num/nans', is expected to equal the scaled value of the
    0.0 fill.  The check runs twice, swapping in the transform returned
    by ``write_read`` after each pass.
    """
    def fill(array):
        # Constant replacement value handed to the transform for NaNs.
        return np.array(0.0)

    trans = n.NumTransform(
        name='num',
        norm_mode='min_max',
        fill_nan_func=fill,
        norm_axis=0,
    )
    columns = self.array[:, :2]
    trans.calc_global_values(columns)

    value_range = trans.max - trans.min
    target = (self.array[:, :2] - trans.min) / value_range
    # (0.0 - min) / (max - min): the fill value after min-max scaling.
    target[1, 1] = -trans.min[1] / value_range[1]

    for _ in xrange(2):
        expected = {
            'num/nums': target,
            'num/nans': [
                [False, False],
                [False, True],
                [False, False],
                [False, False],
            ],
        }
        self.pour_pump(trans, columns, expected)
        trans = self.write_read(trans, self.temp_dir)
def test_array(self):
    """Pour/pump a DatasetTransform made of four sub-transforms.

    The dataset transform 'DT' is assembled from:

    * 'CAT'    -- CatTransform on column 0 (mean_std normalization),
    * 'DATE'   -- DateTimeTransform on columns 1-3 (min_max normalization),
    * 'NUM'    -- NumTransform on columns 4-6 (min_max normalization),
    * 'STRING' -- StringTransform on columns 7-8 (max_sent_len of 20).

    Global values are calculated from the fixture array, then the pour/pump
    check is run twice against hard-coded expected outputs, swapping in the
    transform returned by ``write_read`` after each pass.
    """
    array = self._get_array()
    dataset_transform = tr.DatasetTransform(name='DT')

    # Column 0: categorical values.
    cat_transform = ct.CatTransform(
        name='CAT',
        norm_mode='mean_std',
        index_to_cat_val=sorted(np.unique(array[:, 0: 1])),
        input_dtype=np.dtype('U')
    )
    dataset_transform.add_transform(cols=[0], transform=cat_transform)

    # Columns 1-3: datetimes.
    date_transform = dt.DateTimeTransform(
        name='DATE',
        norm_mode='min_max',
        fill_nat_func=lambda a: np.array(datetime.datetime(1950, 1, 1)),
        input_dtype=np.datetime64,
    )
    dataset_transform.add_transform(cols=[1, 2, 3], transform=date_transform)

    # Columns 4-6: floats.
    num_transform = nt.NumTransform(
        name='NUM',
        norm_mode='min_max',
        input_dtype=np.float64,
        fill_nan_func=lambda a: np.array(0),
    )
    dataset_transform.add_transform(cols=[4, 5, 6], transform=num_transform)

    # Columns 7-8: free text; vocabulary is built from the same columns.
    vocabulary = ['__UNK__'] + self._get_index_to_word(
        array[:, 7:9], en_tokenizer
    )
    string_transform = st.StringTransform(
        name='STRING',
        word_tokenizer=en_tokenizer,
        index_to_word=vocabulary,
        input_dtype=np.dtype('U'),
        max_sent_len=20
    )
    dataset_transform.add_transform(cols=[7, 8], transform=string_transform)

    dataset_transform.calc_global_values(data=array)

    for _ in xrange(2):
        no_delta = datetime.timedelta(0)
        expected = {
            'DT/CAT/indices': [[0], [1], [2], [0]],
            'DT/CAT/missing_vals': np.array([[u''], [u''], [u''], [u'']]),
            'DT/CAT/one_hots': [
                [[1.0, -0.5773502691896258, -0.5773502691896258]],
                [[-1.0, 1.7320508075688774, -0.5773502691896258]],
                [[-1.0, -0.5773502691896258, 1.7320508075688774]],
                [[1.0, -0.5773502691896258, -0.5773502691896258]],
            ],
            'DT/DATE/nats': [
                [False, False, True],
                [False, True, True],
                [False, False, True],
                [False, False, True],
            ],
            # 4 x 3 array of zero timedeltas.
            'DT/DATE/diff': np.array(
                [[no_delta] * 3] * 4, dtype='timedelta64[us]'
            ),
            'DT/DATE/nums': np.array([
                [0.9976643838327857, 0.9976643838327857, 0.0],
                [0.9977039705474843, 0.0, 0.0],
                [0.9977435572621828, 0.9988915719884407, 0.0],
                [0.9976643838327857, 1.0, 0.0],
            ]),
            'DT/NUM/nans': [
                [False, False, True],
                [False, True, True],
                [False, False, True],
                [False, False, True],
            ],
            'DT/NUM/nums': [
                [0.09090909090909091, 0.18181818181818182, 0.0],
                [0.36363636363636365, 0.0, 0.0],
                [0.6363636363636364, 0.7272727272727273, 0.0],
                [0.9090909090909091, 1.0, 0.0],
            ],
            # Each sentence is 20 entries long; unused slots hold -1.
            'DT/STRING/indices': [
                [
                    [9, 29, 50, 30, 29, 3, -1, -1, -1, -1,
                     -1, -1, -1, -1, -1, -1, -1, -1, -1, -1],
                    [6, 38, 2, 23, 49, 3, -1, -1, -1, -1,
                     -1, -1, -1, -1, -1, -1, -1, -1, -1, -1],
                ],
                [
                    [7, 16, 43, 28, 49, 3, -1, -1, -1, -1,
                     -1, -1, -1, -1, -1, -1, -1, -1, -1, -1],
                    [5, 26, 53, 31, 22, 50, 8, 46, 42, 15,
                     3, -1, -1, -1, -1, -1, -1, -1, -1, -1],
                ],
                [
                    [12, 41, 29, 34, 54, 2, 30, 1, 18, 3,
                     10, 3, -1, -1, -1, -1, -1, -1, -1, -1],
                    [13, 21, 45, 39, 27, 14, 20, 3, -1, -1,
                     -1, -1, -1, -1, -1, -1, -1, -1, -1, -1],
                ],
                [
                    [4, 32, 33, 14, 48, 44, 31, 51, 47, 43,
                     52, 17, 3, -1, -1, -1, -1, -1, -1, -1],
                    [11, 1, 24, 19, 36, 43, 40, 35, 25, 37,
                     3, -1, -1, -1, -1, -1, -1, -1, -1, -1],
                ],
            ],
            # 4 x 2 x 20 array of empty strings.
            'DT/STRING/missing_vals': np.array(
                [[[u''] * 20] * 2] * 4, dtype='|U10'
            ),
            'DT/STRING/tokenize_diff': np.array(
                [
                    [
                        '[["d", 16, 17, ""], ["d", 18, 32, ""]]',
                        '[["d", 8, 9, ""], ["d", 19, 20, ""], ["d", 21, 35, ""]]',
                    ],
                    [
                        '[["d", 24, 25, ""], ["d", 26, 40, ""]]',
                        '[["d", 58, 59, ""], ["d", 60, 69, ""]]',
                    ],
                    [
                        '[["d", 21, 22, ""], ["d", 26, 27, ""], ["d", 37, 38, ""], ["d", 42, 43, ""], ["d", 44, 52, ""]]',
                        '[["d", 35, 36, ""], ["d", 37, 49, ""]]',
                    ],
                    [
                        '[["d", 2, 3, ""], ["d", 57, 58, ""], ["d", 59, 66, ""]]',
                        '[["d", 3, 4, ""], ["d", 45, 46, ""], ["d", 47, 56, ""]]',
                    ],
                ],
                dtype='|U95'
            ),
            # Disabled expectations, kept for reference:
            # 'DT/Partition_0/tubes/missing_cols': np.array([1, 2]),
            # 'DT/Partition_0/tubes/missing_array': np.array([['b', 'None', 'b', 'c'], [1.0, 2.0, np.nan, 1.0]], dtype=np.object),
        }
        self.pour_pump(dataset_transform, array, expected)
        # Disabled check, kept for reference:
        # self.write_read_example(dataset_transform, array, self.temp_dir)
        dataset_transform = self.write_read(dataset_transform, self.temp_dir)