def test_lm_token_preprocessing(self):
  """Checks LM preprocessing on a 3-example dataset of 1s.

  Each example's 'inputs' and 'targets' both become
  inputs ++ [0] ++ targets, and 'mask' is 0 over the input-plus-separator
  prefix and 1 over the target positions.
  """
  ds = _test_dataset_ints([1, 2, 3], [3, 2, 1])
  processed = tf_inputs.lm_token_preprocessing(ds, True)
  # Build the expected examples from the (input_len, target_len) pairs
  # used to construct `ds` above.
  expected = []
  for inp_len, tgt_len in ((1, 3), (2, 2), (3, 1)):
    joined = [1] * inp_len + [0] + [1] * tgt_len
    mask = [0] * (inp_len + 1) + [1] * tgt_len
    expected.append({
        'inputs': np.array(joined, dtype=np.int64),
        'targets': np.array(joined, dtype=np.int64),
        'mask': np.array(mask, dtype=np.int64),
    })
  assert_dataset(processed, expected)
def test_pad_dataset_to_length(self):
  """Checks zero-padding of each feature up to the lengths in `len_map`."""
  ds = _test_dataset_ints([5, 6, 7], [6, 7, 8])
  padded = tf_inputs.pad_dataset_to_length(
      ds, True, len_map={
          'inputs': 7,
          'targets': 10
      })
  # Each row of 1s is right-padded with zeros to 7 ('inputs') and
  # 10 ('targets') positions respectively.
  expected = []
  for inp_len, tgt_len in ((5, 6), (6, 7), (7, 8)):
    expected.append({
        'inputs': np.array([1] * inp_len + [0] * (7 - inp_len),
                           dtype=np.int64),
        'targets': np.array([1] * tgt_len + [0] * (10 - tgt_len),
                            dtype=np.int64),
    })
  assert_dataset(padded, expected)
def test_get_t5_preprocessor_by_name(self):
  """Checks that a gin-configured 'rekey' preprocessor remaps feature keys."""
  gin.clear_config()
  gin.parse_config("""
    get_t5_preprocessor_by_name.name = 'rekey'
    get_t5_preprocessor_by_name.fn_kwargs = {'key_map': {'inputs': 'other', 'targets': 'text'}}
  """)
  rekey_fn = tf_inputs.get_t5_preprocessor_by_name()
  raw_dataset = tf.data.Dataset.from_tensors({
      'text': 'That is good.',
      'other': 'That is bad.'
  })
  # 'inputs' should be pulled from 'other' and 'targets' from 'text'.
  rekeyed = rekey_fn(raw_dataset, True)
  assert_dataset(rekeyed, {
      'inputs': 'That is bad.',
      'targets': 'That is good.'
  })
def test_truncate_dataset_on_len(self):
  """Checks truncation in training, eval, and forced-eval modes."""
  ds = _test_dataset_ints([5, 6, 7], [8, 9, 10])
  len_map = {'inputs': 6, 'targets': 4}
  # After truncation: inputs capped at 6, targets capped at 4.
  truncated = _test_dataset_ints([5, 6, 6], [4, 4, 4])

  # Training: over-length examples are truncated.
  ds_train = tf_inputs.truncate_dataset_on_len(ds, True, len_map=len_map)
  assert_dataset(ds_train, list(truncated.as_numpy_iterator()))

  # Eval: truncation is skipped by default.
  ds_eval = tf_inputs.truncate_dataset_on_len(ds, False, len_map=len_map)
  assert_dataset(ds_eval, list(ds.as_numpy_iterator()))

  # Eval with truncate_on_eval=True: truncation applies again.
  ds_eval_forced = tf_inputs.truncate_dataset_on_len(
      ds, False, len_map=len_map, truncate_on_eval=True)
  assert_dataset(ds_eval_forced, list(truncated.as_numpy_iterator()))