def test_output(self): """ Testing a simple scenario when a token matching function, cleaner, and a simple token splitter are used. """ field_name = "dummy" special_token = "<ANIMAL>" lower_case = True tok_mat_func = lambda x: token_matching_func(x, special_token) token_cleaning_func = lambda x: re.sub(r'[?!,.]', '', x) tokenization_func = lambda x: x.split() input_seqs = ["Hello, this is my dog!", "A dummy sentence for tokenization.", "What a lovely puppy!"] input_data_chunk = DataChunk(**{field_name: input_seqs}) expect_seqs = [["hello", "this", "is", "my", special_token], ["a", "dummy", "sentence", "for", "tokenization"], ["what", "a", "lovely", special_token]] expected_data_chunk = DataChunk(**{field_name: expect_seqs}) tokenizer = TokenProcessor(field_name, tokenization_func=tokenization_func, token_cleaning_func=token_cleaning_func, token_matching_func=tok_mat_func, lower_case=lower_case) actual_data_chunk = tokenizer(input_data_chunk) self.assertTrue(expected_data_chunk == actual_data_chunk)
def test_vocabulary_mapper_mixed_field_values(self):
    """Testing whether the mapper can map multi-dim mixed field values."""
    target_field_name = "dummy"
    symbols_attr = "id"
    data_chunk = DataChunk(**{target_field_name: np.array([
        [["one"], np.array(["two", "one"])],
        [["three"], np.array(["four", "five", "six"])]
    ], dtype="object")})
    expected_output_chunk = DataChunk(**{target_field_name: np.array([
        [[1], np.array([2, 1])],
        [[3], np.array([4, 5, 6])]
    ], dtype="object")})

    # creating and populating a vocab
    vocab = Vocabulary()
    vocab.add_symbol("zero")
    vocab.add_symbol("one")
    vocab.add_symbol("two")
    vocab.add_symbol("three")
    vocab.add_symbol("four")
    vocab.add_symbol("five")
    vocab.add_symbol("six")

    mapper = VocabMapper({target_field_name: vocab},
                         symbols_attr=symbols_attr)
    actual_output_chunk = mapper(data_chunk)
    self.assertTrue(actual_output_chunk == expected_output_chunk)
def test_vocabulary_mapper_multidim_lists(self):
    """Testing whether the mapper can map multi-dim lists."""
    target_field_name = "dummy"
    symbols_attr = "id"
    data_chunk = DataChunk(**{
        target_field_name: np.array(
            [[["one"], ["two"]], [["three"], ["four", "five", "six"]]],
            dtype="object")
    })
    exp_val = np.empty(2, dtype="object")
    exp_val[0] = np.array([[1], [2]])
    exp_val[1] = np.array([[3], [4, 5, 6]])
    expected_output_chunk = DataChunk(**{target_field_name: exp_val})

    # creating and populating a vocab
    vocab = Vocabulary()
    vocab.add_symbol("zero")
    vocab.add_symbol("one")
    vocab.add_symbol("two")
    vocab.add_symbol("three")
    vocab.add_symbol("four")
    vocab.add_symbol("five")
    vocab.add_symbol("six")

    mapper = VocabMapper({target_field_name: vocab},
                         symbols_attr=symbols_attr)
    actual_output_chunk = mapper(copy.deepcopy(data_chunk))
    self.assertTrue(actual_output_chunk == expected_output_chunk)
def compile_chunks(self):
    """Compiles data-chunks filled with group sequences."""
    if self.max_units:
        while len(self):
            if len(self) > self.max_units:
                sel_indxs = random.choice(range(len(self)), replace=False,
                                          size=self.max_units)
            else:
                sel_indxs = range(len(self))

            # create an output data-chunk based on the selected units
            dc = DataChunk()
            for k, val in self._coll.items():
                dc[k] = [val[indx] for indx in sel_indxs]
                if isinstance(val, np.ndarray):
                    dc[k] = np.array(dc[k], dtype=val.dtype)
            yield dc

            # removing the selected indxs from the collector
            for indx in sorted(sel_indxs, reverse=True):
                for fn in self._coll:
                    if isinstance(self._coll[fn], np.ndarray):
                        self._coll[fn] = np.delete(self._coll[fn], indx)
                    else:
                        del self._coll[fn][indx]

            # stop the cycle as one sample is already produced
            if not self.sample_all_revs:
                break
    else:
        dc = DataChunk()
        for k, val in self._coll.items():
            dc[k] = val
        yield dc
def test_appending_data_units_to_invalid_dc(self):
    act_dc = DataChunk(test=np.array([], dtype='int64'),
                       dummy=np.array([1], dtype='int64'))
    with self.assertRaises(DataChunkError):
        act_dc.append({"test": 1, "dummy": 2})
def test_setting_data_units(self):
    act_dc = DataChunk(test=np.array([1, 2, 3, 4, 5], dtype='int64'),
                       dummy=np.array([6, 7, 8, 9, 10], dtype='int64'))
    act_dc[0] = {"test": 0, "dummy": 0}
    act_dc[3] = {"test": 20, "dummy": 30}
    exp_dc = DataChunk(test=np.array([0, 2, 3, 20, 5]),
                       dummy=np.array([0, 7, 8, 30, 10]))
    self.assertTrue(act_dc == exp_dc)
def test_appending_data_units_to_valid_dc(self):
    act_dc = DataChunk(test=np.array([], dtype='int64'),
                       dummy=np.array([], dtype='int64'))
    act_dc.append({"test": 1, "dummy": 2})
    act_dc.append({"test": 3, "dummy": 4})
    exp_dc = DataChunk(test=np.array([1, 3]),
                       dummy=np.array([2, 4]))
    self.assertTrue(act_dc == exp_dc)
def test_output(self):
    fn = "dummy"
    new_fn = "dummy_len"
    data = [[1, 2, 3], [12], ["a", "b", "d", "e"]]
    actual_dc = DataChunk(**{fn: np.array(deepcopy(data))})
    expected_dc = DataChunk(**{fn: np.array(deepcopy(data)),
                               new_fn: np.array([3, 1, 4])})
    slc = SeqLenComputer(fname=fn, new_len_fname=new_fn)
    actual_dc = slc(actual_dc)
    self.assertTrue(actual_dc == expected_dc)
def _data_chunk_from_dicts_list(list_of_dicts):
    """Creates a data-chunk from a list of data-units (dicts)."""
    data_chunk = DataChunk()
    flag = False
    for du in list_of_dicts:
        if not flag:
            for k in du.keys():
                data_chunk[k] = []
            flag = True
        for k, v in du.items():
            data_chunk[k].append(v)
    for k, v in data_chunk.items():
        data_chunk[k] = np.array(v)
    return data_chunk
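# A minimal usage sketch of `_data_chunk_from_dicts_list` (illustrative only;
# the field names below are made up, and `np`/`DataChunk` are assumed to be
# imported at the top of this module).
def _example_data_chunk_from_dicts_list():
    dicts = [{"prod_id": 1, "text": "good"},
             {"prod_id": 2, "text": "bad"}]
    dc = _data_chunk_from_dicts_list(dicts)
    # dc["prod_id"] -> np.array([1, 2]); dc["text"] -> np.array(["good", "bad"])
    return dc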
def test_modification_of_data_units(self):
    """Selecting specific data-units and altering their values."""
    act_dc = DataChunk(test=np.array([1, 2, 3, 4]),
                       dummy=np.array([11., 12., 13., 14.]))
    act_du1 = act_dc[0]
    act_du1['test'] += 100
    act_du2 = act_dc[3]
    act_du2['dummy'] += 5
    exp_dc = DataChunk(test=np.array([101, 2, 3, 4]),
                       dummy=np.array([11., 12., 13., 19.]))
    self.assertTrue(act_dc == exp_dc)
def concat_data_chunks(*args):
    """Concatenates data-chunks together based on their keys."""
    data_chunk = DataChunk()
    for k in args[0]:
        data_chunk[k] = []
    for arg in args:
        for k, v in arg.items():
            data_chunk[k].append(v)
    for k, v in data_chunk.items():
        data_chunk[k] = np.concatenate(v)
    return data_chunk
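# A minimal usage sketch of `concat_data_chunks` (illustrative only; the field
# names are made up). Each field is collected across the chunks and then
# merged with np.concatenate, so the per-chunk values should be arrays (or
# sequences that numpy can concatenate).
def _example_concat_data_chunks():
    dc1 = DataChunk(ids=np.array([1, 2]), score=np.array([0.1, 0.2]))
    dc2 = DataChunk(ids=np.array([3]), score=np.array([0.3]))
    merged = concat_data_chunks(dc1, dc2)
    # merged["ids"] -> np.array([1, 2, 3])
    # merged["score"] -> np.array([0.1, 0.2, 0.3])
    return merged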
def test_data_units_inval_access(self):
    """When a data-chunk is invalid, accessing its data-units should raise an error."""
    dc = DataChunk(test=[1, 22, 3, 4, 5], dummy=[99, 2, 3])
    with self.assertRaises(DataChunkError):
        du = dc[0]
    dc["test"] = np.array(dc["test"])
    dc['dummy'] = np.array(dc['dummy'])
    with self.assertRaises(DataChunkError):
        du = dc[0]
    dc['dummy'] = np.append(dc['dummy'], 0)
    dc['dummy'] = np.append(dc['dummy'], 1)
    du = dc[0]
    self.assertTrue(du['test'] == 1)
    self.assertTrue(du['dummy'] == 99)
    du = dc[1]
    self.assertTrue(du['test'] == 22)
    self.assertTrue(du['dummy'] == 2)
def _get_dummy_dc():
    """Returns a small fixed data-chunk used as a test fixture."""
    dc = DataChunk()
    dc["country"] = np.array(["UK", "UK", "UK", "DK", "DK"])
    dc["shop_id"] = np.array(['1', '1', '1', '2', '3'])
    dc["product_id"] = np.array([11, 12, 13, 101, 101])
    dc["sales"] = np.array([0, 1, 2, 5, 6])
    return dc
def concat_chunks(*dcs):
    """Combines data-chunks horizontally and returns them as one chunk."""
    new_dc = DataChunk()
    key_to_type = {}
    for dc in dcs:
        for k, v in dc.items():
            if k not in new_dc:
                new_dc[k] = []
            if isinstance(v, np.ndarray):
                if k in key_to_type and key_to_type[k] != np.ndarray:
                    raise TypeError("All values must be either 'arrays' or "
                                    "'lists'.")
                key_to_type[k] = np.ndarray
                new_dc[k].append(v)
            elif isinstance(v, list):
                if k in key_to_type and key_to_type[k] != list:
                    raise TypeError("All values must be either 'arrays' or "
                                    "'lists'.")
                key_to_type[k] = list
                new_dc[k] += v
            else:
                raise TypeError("Can't concat values other than 'lists' or "
                                "'arrays'.")
    for k in new_dc:
        if key_to_type[k] == np.ndarray:
            new_dc[k] = np.concatenate(tuple(new_dc[k]))
    return new_dc
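# A minimal usage sketch of `concat_chunks` (illustrative only; the field
# names are made up). Array-valued fields are concatenated into one array,
# list-valued fields are extended, and mixing the two types for the same key
# raises a TypeError.
def _example_concat_chunks():
    dc1 = DataChunk(ids=np.array([1, 2]), text=["a", "b"])
    dc2 = DataChunk(ids=np.array([3]), text=["c"])
    merged = concat_chunks(dc1, dc2)
    # merged["ids"] -> np.array([1, 2, 3]); merged["text"] -> ["a", "b", "c"]
    return merged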
def _transform(self, data_chunk):
    fields_to_copy = [YelpEvalF.BUSINESS_ID]
    new_dc = DataChunk(**{fn: [] for fn in fields_to_copy})
    summ_cats = ["no_cat" for _ in range(len(data_chunk))]

    # wrapping each summary into a list as there is only one summary per
    # business
    new_dc[ModelF.SUMMS] = [[summ] for summ in data_chunk[YelpEvalF.SUMM]]
    new_dc[ModelF.SUMM_CAT] = summ_cats
    new_dc[ModelF.SUMM_GROUP_ID] = data_chunk[YelpEvalF.BUSINESS_ID]

    # splitting data-units by the reviews field, i.e. each unit will have
    # one review associated with it
    new_dc[ModelF.REV] = []
    for du in data_chunk.iter():
        for rev_fn in YelpEvalF.REVS:
            new_dc[ModelF.REV].append(du[rev_fn])
            # copying the rest
            for c_fn in fields_to_copy:
                new_dc[c_fn].append(du[c_fn])

    # adding a dummy category field
    cat_fvals = ["no_cat" for _ in range(len(new_dc))]
    new_dc[ModelF.CAT] = cat_fvals
    return new_dc
def _transform(self, data_chunk):
    """Renames data-chunk fields according to the `old_to_new_fnames` mapping."""
    new_dc = DataChunk()
    for k, v in data_chunk.items():
        if k in self.old_to_new_fnames:
            k = self.old_to_new_fnames[k]
        new_dc[k] = v
    return new_dc
def _transform(self, data_chunk):
    fields_to_copy = [AmazonEvalF.PROD_ID, AmazonEvalF.CAT]
    new_dc = DataChunk(**{fn: [] for fn in fields_to_copy})
    new_dc[ModelF.SUMMS] = [
        [summ1, summ2, summ3] for summ1, summ2, summ3
        in zip(data_chunk[AmazonEvalF.SUMM1],
               data_chunk[AmazonEvalF.SUMM2],
               data_chunk[AmazonEvalF.SUMM3])
    ]
    new_dc[ModelF.SUMM_CAT] = data_chunk[AmazonEvalF.CAT]
    new_dc[ModelF.SUMM_GROUP_ID] = data_chunk[AmazonEvalF.PROD_ID]

    # splitting data-units by the reviews field, i.e. each unit will have
    # one review associated with it
    new_dc[ModelF.REV] = []
    for du in data_chunk.iter():
        for rev_fn in AmazonEvalF.REVS:
            new_dc[ModelF.REV].append(du[rev_fn])
            # copying the rest
            for c_fn in fields_to_copy:
                new_dc[c_fn].append(du[c_fn])
    return new_dc
def test_3D_padding(self):
    """Lightweight test to check if the padder works for 3D data."""
    field_name = "dummy"
    mask_field_name = 'dummy_mask'
    pad_symbol = -99
    mask_fn_suffix = "mask"
    padding_mode = "both"
    axis = 2
    data_chunk = DataChunk(**{
        field_name: np.array([[[0, 1, 2], [3, 4, 5], [], [6]],
                              [[1], [1, 2], []]])
    })
    padder = Padder(field_name, pad_symbol=pad_symbol, axis=axis,
                    new_mask_fname=mask_field_name,
                    padding_mode=padding_mode)
    padded_data_chunk = padder(copy.deepcopy(data_chunk))

    original_fv = data_chunk[field_name]
    padded_fv = padded_data_chunk[field_name]
    mask = padded_data_chunk[mask_field_name]

    for ofv, pfv, m in zip(original_fv, padded_fv, mask):
        self._test_padded_values(original_field_values=ofv,
                                 padded_field_values=pfv, mask=m,
                                 pad_symbol=pad_symbol)
def test_sorting_by_ints_descending(self):
    expected_dc = DataChunk(**{
        self.ints_fn: np.array([123, 10, 0]),
        self.strings_fn: np.array(["d", "a", "c"]),
        self.floats_fn: np.array([15., -1, -10.])
    })
    actual_dc = self._run_sorter(fn=self.ints_fn, order='descending')
    self.assertTrue(expected_dc == actual_dc)
def generate_data_chunk(data_attrs_number, data_size):
    """Generates a data-chunk with random 1D values of data_size."""
    data = {
        str(i): np.random.rand(data_size)
        for i in range(data_attrs_number)
    }
    data_chunk = DataChunk(**data)
    return data_chunk
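# A minimal usage sketch of `generate_data_chunk` (illustrative only): the
# resulting chunk has `data_attrs_number` fields named "0", "1", ..., each
# holding a random 1D array of length `data_size`.
def _example_generate_data_chunk():
    dc = generate_data_chunk(data_attrs_number=3, data_size=5)
    # dc["0"], dc["1"] and dc["2"] are each np.ndarray of shape (5,)
    return dc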
def test_valid_data_units_deletion(self):
    dc = DataChunk(one=np.array([1, 2, 3, 4]),
                   two=np.array([0, 10, 11, 24]))
    del dc[2]
    self.assertTrue((np.array([1, 2, 4]) == dc['one']).all())
    self.assertTrue((np.array([0, 10, 24]) == dc['two']).all())
def test_sorting_by_string_ascending(self):
    expected_dc = DataChunk(**{
        self.ints_fn: np.array([10, 0, 123]),
        self.strings_fn: np.array(["a", "c", "d"]),
        self.floats_fn: np.array([-1., -10, 15.])
    })
    actual_dc = self._run_sorter(fn=self.strings_fn, order='ascending')
    self.assertTrue(expected_dc == actual_dc)
def test_absolute_corruption(self):
    """With dropout probability 1.0 every token should be dropped,
    leaving empty sequences.
    """
    data_dc = DataChunk(
        **{DUMMY_FNAME: np.array([range(10) for _ in range(5)])})
    exp_dc = DataChunk()
    exp_dc[DUMMY_FNAME] = deepcopy(data_dc[DUMMY_FNAME])
    exp_dc[NEW_DUMMY_FNAME] = np.zeros(len(data_dc), dtype='object')
    for indx in range(len(exp_dc)):
        exp_dc[indx, NEW_DUMMY_FNAME] = list()
    word_dropper = WordDropper(fname=DUMMY_FNAME,
                               new_fname=NEW_DUMMY_FNAME,
                               dropout_prob=1.)
    act_dc = word_dropper(data_dc)
    self.assertTrue(act_dc == exp_dc)
def _data_chunk_from_dicts_tree(dicts, tree_grouping_fnames):
    """Creates a data-chunk from a tree of data-units (dicts)."""

    def yield_paths_and_leaves(tree, path=None):

        def is_leaf(dct):
            for v in dct.values():
                if not isinstance(v, list):
                    return False
            return True

        if is_leaf(tree):
            yield path, tree
        else:
            for k in tree.keys():
                curr_path = [p for p in path] if path else []
                curr_path.append(k)
                for r in yield_paths_and_leaves(tree[k], curr_path):
                    yield r

    if not tree_grouping_fnames:
        raise ValueError("Please provide 'tree_grouping_fnames' to parse "
                         "input json files.")
    data_chunk = DataChunk()
    for fn in tree_grouping_fnames:
        data_chunk[fn] = []
    for path, leaf in yield_paths_and_leaves(dicts):
        leaf_size = _get_leaf_size(leaf)
        if len(path) != len(tree_grouping_fnames):
            raise ValueError("Please provide all grouping fields.")
        # storing path values
        for p_val, fn in zip(path, tree_grouping_fnames):
            data_chunk[fn] += [p_val] * leaf_size
        # storing leaf values
        for k, vals in leaf.items():
            assert isinstance(vals, list)
            if k not in data_chunk:
                data_chunk[k] = []
            data_chunk[k] += vals
    for k, v in data_chunk.items():
        data_chunk[k] = np.array(v)
    return data_chunk
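# A minimal usage sketch of `_data_chunk_from_dicts_tree` (illustrative only;
# the tree and field names mirror the dummy data used elsewhere in the tests,
# and the helper `_get_leaf_size` is assumed to return the common length of a
# leaf's lists). Inner dict keys become values of the grouping fields,
# repeated once per leaf entry.
def _example_data_chunk_from_dicts_tree():
    tree = {
        "UK": {"1": {"product_id": [11, 12], "sales": [0, 1]}},
        "DK": {"2": {"product_id": [101], "sales": [5]}},
    }
    dc = _data_chunk_from_dicts_tree(
        tree, tree_grouping_fnames=["country", "shop_id"])
    # dc["country"]    -> np.array(["UK", "UK", "DK"])
    # dc["shop_id"]    -> np.array(["1", "1", "2"])
    # dc["product_id"] -> np.array([11, 12, 101])
    # dc["sales"]      -> np.array([0, 1, 5])
    return dc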
def test_scenario2(self):
    window_size = 3
    step_size = 3
    only_full_windows = False
    input_seqs = np.array([list(range(7)), list(range(2))])
    input_chunk = DataChunk(**{self.field_name: input_seqs})
    expect_seqs = np.array([[[0, 1, 2], [3, 4, 5], [6]], [[0, 1]]])
    expected_output_chunk = DataChunk(**{
        self.field_name: input_seqs,
        self.new_field_name: expect_seqs
    })
    self._test_window_setup(input_chunk, expected_output_chunk,
                            field_name=self.field_name,
                            suffix=self.suffix,
                            window_size=window_size,
                            step_size=step_size,
                            only_full_windows=only_full_windows)
def test_field_values_access(self):
    arrays_size = 40
    names = ["one", "two", "three", "four"]
    for _ in range(10):
        data = {name: np.random.rand(arrays_size, 1) for name in names}
        data_chunk = DataChunk(**deepcopy(data))
        for name in names:
            self.assertTrue((data_chunk[name] == data[name]).all())
def test_condition_satisfaction(self):
    ks = [51.5, 2, 3, 4, 5]
    for k in ks:
        word_shuffler = WordShuffler(fname=TEXT_FNAME, end_symbol='DUMMY',
                                     k=k)
        dc = DataChunk(**{
            TEXT_FNAME: np.array(
                [list(range(100)), list(range(30))], dtype='object')
        })
        corr_dc = word_shuffler(deepcopy(dc))
        for corr_du, du in zip(corr_dc.iter(), dc.iter()):
            text = du[TEXT_FNAME]
            corr_text = corr_du[TEXT_FNAME]
            self.assertTrue(condition_sat(corr_text, k=k))
            self.assertTrue(len(text) == len(corr_text))
            self.assertTrue(text != corr_text)
def create_list_of_data_chunks(data_chunk, chunk_size):
    """Creates a list of data-chunks out of the passed data-chunk."""
    collector = []
    start_indx = 0
    while start_indx < len(data_chunk):
        slice_range = range(start_indx,
                            min(start_indx + chunk_size, len(data_chunk)))
        dc = DataChunk(**{k: v[slice_range] for k, v in data_chunk.items()})
        collector.append(dc)
        start_indx += chunk_size
    return collector
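# A minimal usage sketch of `create_list_of_data_chunks` (illustrative only;
# the field name is made up). The last chunk is simply shorter when the
# data-chunk length is not divisible by `chunk_size`.
def _example_create_list_of_data_chunks():
    dc = DataChunk(ids=np.array([1, 2, 3, 4, 5]))
    chunks = create_list_of_data_chunks(dc, chunk_size=2)
    # len(chunks) == 3; the "ids" fields are [1, 2], [3, 4] and [5]
    return chunks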
def test_specific_fvalues_access(self):
    arrays_size = 40
    names = ["one", "two", "three", "four"]
    for _ in range(10):
        data = {name: np.random.rand(arrays_size) for name in names}
        data_chunk = DataChunk(**deepcopy(data))
        for r_name in np.random.choice(names, size=10, replace=True):
            for r_indx in np.random.randint(0, 40, size=100):
                res = (data_chunk[r_indx, r_name] == data[r_name][r_indx])
                self.assertTrue(res)
def test_invalid_data_units_deletion(self):
    """Deletion by data-unit index from an invalid data-chunk should not work."""
    dc = DataChunk(one=[1, 2, 3, 4], two=[10, 20, 30, 40, 50, 60])
    self.assertFalse(dc.valid)
    with self.assertRaises(ValueError):
        del dc[2]