Example 1
  def test_lm_token_preprocessing(self):
    ds = _test_dataset_ints([1, 2, 3], [3, 2, 1])
    ds1 = tf_inputs.lm_token_preprocessing(ds, True)

    # pylint: disable=bad-whitespace
    expected_ds = [
        {
            'inputs': np.array([1, 0, 1, 1, 1], dtype=np.int64),
            'targets': np.array([1, 0, 1, 1, 1], dtype=np.int64),
            'mask': np.array([0, 0, 1, 1, 1], dtype=np.int64),
        },
        {
            'inputs': np.array([1, 1, 0, 1, 1], dtype=np.int64),
            'targets': np.array([1, 1, 0, 1, 1], dtype=np.int64),
            'mask': np.array([0, 0, 0, 1, 1], dtype=np.int64),
        },
        {
            'inputs': np.array([1, 1, 1, 0, 1], dtype=np.int64),
            'targets': np.array([1, 1, 1, 0, 1], dtype=np.int64),
            'mask': np.array([0, 0, 0, 0, 1], dtype=np.int64),
        },
    ]
    # pylint: enable=bad-whitespace

    assert_dataset(ds1, expected_ds)
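
As the expected values show, lm_token_preprocessing concatenates inputs, a 0 separator, and targets into one sequence (used for both 'inputs' and 'targets'), with a 'mask' that is 1 only over the target positions. All four snippets come from the same test module and rely on its imports (numpy as np, tensorflow as tf, gin, and tf_inputs from trax) plus two helpers that are not shown: _test_dataset_ints, which builds a tf.data.Dataset of all-ones int64 vectors with the given lengths, and assert_dataset, which compares a dataset against expected examples. A minimal sketch of what those helpers might look like (the bodies below are assumptions, not the module's actual code):

import numpy as np
import tensorflow as tf

def _test_dataset_ints(inp_lengths, tgt_lengths):
  """Sketch: yields examples of all-ones int64 vectors of the given lengths."""
  def generate_examples():
    for inp_len, tgt_len in zip(inp_lengths, tgt_lengths):
      yield {
          'inputs': np.ones(inp_len, dtype=np.int64),
          'targets': np.ones(tgt_len, dtype=np.int64),
      }
  return tf.data.Dataset.from_generator(
      generate_examples,
      output_signature={
          'inputs': tf.TensorSpec(shape=(None,), dtype=tf.int64),
          'targets': tf.TensorSpec(shape=(None,), dtype=tf.int64),
      })

def assert_dataset(ds, expected):
  """Sketch: checks each dataset element against the expected dicts."""
  if isinstance(expected, dict):
    expected = [expected]
  for actual, want in zip(ds.as_numpy_iterator(), expected):
    for key, value in want.items():
      got = actual[key]
      if isinstance(got, bytes):
        got = got.decode('utf-8')  # String features come back as bytes.
      np.testing.assert_array_equal(got, value)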
Example 2
  def test_pad_dataset_to_length(self):
    ds = _test_dataset_ints([5, 6, 7], [6, 7, 8])
    ds1 = tf_inputs.pad_dataset_to_length(
        ds, True, len_map={
            'inputs': 7,
            'targets': 10
        })

    expected_ds = [
        {
            'inputs': np.array([1, 1, 1, 1, 1, 0, 0], dtype=np.int64),
            'targets': np.array([1, 1, 1, 1, 1, 1, 0, 0, 0, 0], dtype=np.int64),
        },
        {
            'inputs': np.array([1, 1, 1, 1, 1, 1, 0], dtype=np.int64),
            'targets': np.array([1, 1, 1, 1, 1, 1, 1, 0, 0, 0], dtype=np.int64),
        },
        {
            'inputs': np.array([1, 1, 1, 1, 1, 1, 1], dtype=np.int64),
            'targets': np.array([1, 1, 1, 1, 1, 1, 1, 1, 0, 0], dtype=np.int64),
        },
    ]

    assert_dataset(ds1, expected_ds)
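
pad_dataset_to_length right-pads each feature named in len_map with zeros up to the requested length, as the expected values above show (inputs to 7, targets to 10). A minimal sketch of the idea, not trax's actual implementation:

import tensorflow as tf

def pad_to_length_sketch(dataset, len_map):
  # Right-pad each named 1-D feature with zeros to its target length.
  def pad_example(example):
    out = dict(example)
    for key, target_len in len_map.items():
      pad_amount = target_len - tf.shape(example[key])[0]
      out[key] = tf.pad(example[key], [[0, pad_amount]])
    return out
  return dataset.map(pad_example)

# E.g. pad_to_length_sketch(ds, {'inputs': 7, 'targets': 10}) mirrors the
# call in the test above.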
Example 3
  def test_get_t5_preprocessor_by_name(self):
    gin.clear_config()

    gin.parse_config("""
      get_t5_preprocessor_by_name.name = 'rekey'
      get_t5_preprocessor_by_name.fn_kwargs = {'key_map': {'inputs': 'other', 'targets': 'text'}}
    """)
    prep_rekey = tf_inputs.get_t5_preprocessor_by_name()
    og_dataset = tf.data.Dataset.from_tensors({
        'text': 'That is good.',
        'other': 'That is bad.'
    })
    training = True
    dataset = prep_rekey(og_dataset, training)
    assert_dataset(dataset, {
        'inputs': 'That is bad.',
        'targets': 'That is good.'
    })
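
Here gin supplies both the preprocessor name ('rekey', a T5 preprocessor that renames features) and its keyword arguments, so get_t5_preprocessor_by_name() needs no explicit arguments. As the test shows, key_map maps each new key to the old key it is copied from. A rekey is just a map over key names; a minimal sketch (hypothetical stand-in, not the T5 implementation):

import tensorflow as tf

def rekey_sketch(dataset, key_map):
  # Build each output feature from the input feature named in key_map.
  return dataset.map(
      lambda x: {new: x[old] for new, old in key_map.items()})

# rekey_sketch(og_dataset, {'inputs': 'other', 'targets': 'text'}) yields
# {'inputs': 'That is bad.', 'targets': 'That is good.'}.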
Example 4
  def test_truncate_dataset_on_len(self):
    ds = _test_dataset_ints([5, 6, 7], [8, 9, 10])
    ds1 = tf_inputs.truncate_dataset_on_len(
        ds, True, len_map={
            'inputs': 6,
            'targets': 4
        })
    expected_ds = _test_dataset_ints([5, 6, 6], [4, 4, 4])

    # Training: should truncate.
    assert_dataset(ds1, list(expected_ds.as_numpy_iterator()))

    # Not training: shouldn't truncate.
    ds2 = tf_inputs.truncate_dataset_on_len(
        ds, False, len_map={
            'inputs': 6,
            'targets': 4
        })
    assert_dataset(ds2, list(ds.as_numpy_iterator()))

    # Not training, but truncate_on_eval=True: should truncate.
    ds3 = tf_inputs.truncate_dataset_on_len(
        ds, False, len_map={
            'inputs': 6,
            'targets': 4
        }, truncate_on_eval=True)
    assert_dataset(ds3, list(expected_ds.as_numpy_iterator()))
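
truncate_dataset_on_len slices each named feature down to the maximum length in len_map; by default it only does so during training, unless truncate_on_eval=True. A minimal sketch of that behavior (signature assumed from the calls above):

import tensorflow as tf

def truncate_on_len_sketch(dataset, training, len_map, truncate_on_eval=False):
  # No-op outside training unless explicitly asked to truncate on eval.
  if not training and not truncate_on_eval:
    return dataset
  def truncate_example(example):
    out = dict(example)
    for key, max_len in len_map.items():
      out[key] = example[key][:max_len]  # Slicing never extends a sequence.
    return out
  return dataset.map(truncate_example)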