# Ejemplo n.º 1
    def test_lm_token_preprocessing(self):
        dataset = _test_dataset_ints([1, 2, 3], [3, 2, 1])
        processed = tf_inputs.lm_token_preprocessing(dataset, True)

        # Each output appears to concatenate inputs, a single 0 separator,
        # and targets into one stream, with the mask set only on the target
        # positions. Generate the three expected 5-token examples.
        expected = []
        for zero_pos in (1, 2, 3):
            stream = np.ones(5, dtype=np.int64)
            stream[zero_pos] = 0
            mask = np.zeros(5, dtype=np.int64)
            mask[zero_pos + 1:] = 1
            expected.append({
                'inputs': stream.copy(),
                'targets': stream.copy(),
                'mask': mask,
            })

        t5_test_utils.assert_dataset(processed, expected)
  def test_sample_answer(self):
    """sample_answer picks a target from the answers, seed-dependent."""
    example = {
        'inputs': ['What are the names of the Olsen Twins?'],
        'targets': ['Mary-Kate'],
        'answers': ['Mary-Kate', 'Ashley']
    }
    dataset = tf.data.Dataset.from_tensors(example)

    # Seed 42: the sampled answer is 'Ashley' and the answer list is rotated.
    tf.set_random_seed(42)
    test_utils.assert_dataset(
        preprocessors.sample_answer(dataset),
        {
            'inputs': 'What are the names of the Olsen Twins?',
            'targets': 'Ashley',
            'answers': ['Ashley', 'Mary-Kate'],
        })

    # Seed 420: the original ordering and target are preserved.
    tf.set_random_seed(420)
    test_utils.assert_dataset(
        preprocessors.sample_answer(dataset),
        {
            'inputs': ['What are the names of the Olsen Twins?'],
            'targets': ['Mary-Kate'],
            'answers': ['Mary-Kate', 'Ashley']
        })
# Ejemplo n.º 3
    def test_eraser_multi_rc_drop_examples(self):
        """eraser_multi_rc with drop_explanations=True omits the evidence."""
        example = {
            'passage': ('This is a multi line passage. \nIt is about multiple '
                        'things. \nThere is more than one thing in it.'),
            'query_and_answer': 'Is it about one thing? || Nope.',
            'label': 1,
            'evidences': ['It is about multiple things.',
                          'There is more than one thing in it.'],
        }

        processed = preprocessors.eraser_multi_rc(
            tf.data.Dataset.from_tensors(example), drop_explanations=True)

        test_utils.assert_dataset(
            processed, {
                'inputs': ('explain multirc passage: This is a multi line '
                           'passage. \nIt is about multiple things. \nThere is '
                           'more than one thing in it. '
                           'query: Is it about one thing? answer: Nope.'),
                'targets': 'True',
            })
# Ejemplo n.º 4
    def test_amazon_reviews_neutral(self):
        """A 3-star review yields nothing in binary mode, '3' otherwise."""
        example = {
            'data': {
                'review_headline': 'okay headphones',
                'review_body':
                'the sound quality of these headphones is not bad',
                'star_rating': 3,
            }
        }
        dataset = tf.data.Dataset.from_tensors(example)

        # Default (binary) output: the neutral rating produces no examples.
        test_utils.assert_dataset(preprocessors.amazon_reviews(dataset), [])

        # Non-binary output: the raw star rating becomes the target.
        test_utils.assert_dataset(
            preprocessors.amazon_reviews(dataset, binary_output=False), {
                'inputs': ('sentiment review: okay headphones the sound '
                           'quality of these headphones is not bad'),
                'targets': '3',
            })
# Ejemplo n.º 5
    def test_process_xquad(self):
        """xquad builds question/context inputs; fields come out tokenized."""
        dataset = tf.data.Dataset.from_tensors({
            'id': '123',
            'context': 'Some context.',
            'question': ('Whose portrait by François Clouet was included'
                         ' in the Jones bequest of 1882?'),
            'answers': {
                'text': ['The answer.', 'Another answer.'],
            },
        })

        processed = preprocessors.xquad(dataset)

        # Note the detached punctuation (' ? ', ' . ') in every output field.
        test_utils.assert_dataset(
            processed, {
                'id': '123',
                'inputs': ('question: Whose portrait by François Clouet was'
                           ' included in the Jones bequest of 1882 ? '
                           'context: Some context . '),
                'targets': 'The answer . ',
                'context': 'Some context . ',
                'question': ('Whose portrait by François Clouet was included'
                             ' in the Jones bequest of 1882 ? '),
                'answers': ['The answer . ', 'Another answer . '],
            })
# Ejemplo n.º 6
    def test_process_xnl_multiple_langs(self):
        """process_xnli emits one example per requested target language."""
        langs = ['lang1', 'lang2', 'lang3']
        dataset = tf.data.Dataset.from_tensors({
            'hypothesis': {
                'language': langs,
                'translation':
                ['translation1', 'translation2', 'translation3'],
            },
            'label': 1,
            'premise': {
                'lang1': 'premise1',
                'lang2': 'premise2',
                'lang3': 'premise3'
            }
        })

        processed = preprocessors.process_xnli(dataset,
                                               target_languages=langs)

        # One output per language, pairing that language's premise with its
        # hypothesis translation; the label is the same for all of them.
        expected = [
            {'inputs': 'xnli: premise: premise1 hypothesis: translation1',
             'targets': '1'},
            {'inputs': 'xnli: premise: premise2 hypothesis: translation2',
             'targets': '1'},
            {'inputs': 'xnli: premise: premise3 hypothesis: translation3',
             'targets': '1'},
        ]
        test_utils.assert_dataset(processed, expected)
# Ejemplo n.º 7
    def test_pad_dataset_to_length(self):
        """pad_dataset_to_length right-pads each feature with zeros."""
        ds = _test_dataset_ints([5, 6, 7], [6, 7, 8])
        padded = tf_inputs.pad_dataset_to_length(
            ds, True, len_map={'inputs': 7, 'targets': 10})

        def _ones_then_zeros(n_ones, total):
            # n_ones ones followed by zero padding up to `total` entries.
            return np.pad(np.ones(n_ones, dtype=np.int64),
                          (0, total - n_ones))

        expected = [{
            'inputs': _ones_then_zeros(n_in, 7),
            'targets': _ones_then_zeros(n_tgt, 10),
        } for n_in, n_tgt in zip([5, 6, 7], [6, 7, 8])]

        t5_test_utils.assert_dataset(padded, expected)
# Ejemplo n.º 8
    def test_amazon_reviews(self):
        """A 5-star review maps to 'positive' (binary) or '5' (raw)."""
        example = {
            'data': {
                'review_headline': 'Great headphones',
                'review_body': 'Loved the sound quality of these headphones',
                'star_rating': 5,
            }
        }
        dataset = tf.data.Dataset.from_tensors(example)

        # Both modes produce the same inputs (headline + body).
        expected_inputs = ('sentiment review: Great headphones Loved the '
                           'sound quality of these headphones')

        # Default (binary) output.
        test_utils.assert_dataset(
            preprocessors.amazon_reviews(dataset),
            {'inputs': expected_inputs, 'targets': 'positive'})

        # Raw star-rating output.
        test_utils.assert_dataset(
            preprocessors.amazon_reviews(dataset, binary_output=False),
            {'inputs': expected_inputs, 'targets': '5'})
  def test_trim_tokens_at_front(self):
    """trim_tokens_at_front keeps only the trailing tokens."""
    ds = tf.data.Dataset.from_tensors(
        {'inputs': tf.constant([10, 11, 12, 13, 14, 15])})
    trimmed = prep.trim_tokens_at_front(ds, sequence_length={'inputs': 4})

    # Six tokens with sequence_length 4: only the last three survive
    # (presumably one slot is reserved for EOS — confirm in prep docs).
    test_utils.assert_dataset(trimmed,
                              [{'inputs': tf.constant([13, 14, 15])}])
# Ejemplo n.º 10
  def test_mask_salient_spans(self):
    """mask_salient_spans yields one masked example per annotated span.

    An example with no spans contributes nothing; each (start, limit)
    character span is replaced by '_X_' in the inputs and the covered
    text becomes the target.
    """
    input_examples = [
        {
            # No salient spans: this example produces no output.
            'text': 'He was confident that it would be well received.',
            'spans': {
                'start': [],
                'limit': [],
            }
        },
        {
            # Two spans: [53, 60) covers 'October', [78, 91) covers
            # 'November 2002'.
            'text':
                'The episode was filmed over three days at the end of October '
                'and beginning of November 2002.',
            'spans': {
                'start': [53, 78],
                'limit': [60, 91],
            }
        }
    ]

    # Build the dataset from a generator so 'start'/'limit' can have a
    # different length per example (shape [None]).
    og_dataset = tf.data.Dataset.from_generator(
        lambda: (x for x in input_examples),
        output_types={
            'text': tf.string,
            'spans': {
                'start': tf.int64,
                'limit': tf.int64,
            },
        },
        output_shapes={
            'text': [],
            'spans': {
                'start': [None],
                'limit': [None],
            },
        })

    dataset = preprocessors.mask_salient_spans(og_dataset)

    # One output per span of the second example; the first example is gone.
    test_utils.assert_dataset(
        dataset,
        [
            {
                'inputs':
                    'nem: The episode was filmed over three days at the end of '
                    '_X_ and beginning of November 2002.',
                'targets': 'October'
            },
            {
                'inputs':
                    'nem: The episode was filmed over three days at the end of '
                    'October and beginning of _X_.',
                'targets': 'November 2002'
            }
        ]
    )
# Ejemplo n.º 11
    def test_process_mnli(self):
        """process_mnli builds an xnli-prefixed premise/hypothesis input."""
        ds = tf.data.Dataset.from_tensors({
            'hypothesis': 'hypothesis1',
            'label': 1,
            'premise': 'premise1'
        })

        test_utils.assert_dataset(
            preprocessors.process_mnli(ds), {
                'inputs': 'xnli: premise: premise1 hypothesis: hypothesis1',
                'targets': '1'
            })
# Ejemplo n.º 12
 def test_natural_questions_open(self):
     """natural_questions_open prefixes the question, keeps all answers."""
     example = {
         'question': ['What are the names of the Olsen Twins?'],
         'answer': ['Mary-Kate', 'Ashley']
     }
     processed = preprocessors.natural_questions_open(
         tf.data.Dataset.from_tensors(example))
     # The first answer becomes the target; the full list is preserved.
     test_utils.assert_dataset(
         processed, {
             'inputs':
             'nq question: What are the names of the Olsen Twins?',
             'targets': 'Mary-Kate',
             'answers': ['Mary-Kate', 'Ashley'],
         })
# Ejemplo n.º 13
  def test_get_t5_preprocessor_by_name(self):
    """A gin-configured 'rekey' preprocessor swaps the configured keys."""
    gin.clear_config()
    gin.parse_config("""
      get_t5_preprocessor_by_name.name = 'rekey'
      get_t5_preprocessor_by_name.fn_kwargs = {'key_map': {'inputs': 'other', 'targets': 'text'}}
    """)
    rekey = tf_inputs.get_t5_preprocessor_by_name()

    ds = tf.data.Dataset.from_tensors(
        {'text': 'That is good.', 'other': 'That is bad.'})
    # 'inputs' is taken from 'other' and 'targets' from 'text', per key_map.
    t5_test_utils.assert_dataset(
        rekey(ds, True),
        {'inputs': 'That is bad.', 'targets': 'That is good.'})
# Ejemplo n.º 14
    def test_imdb_movie_reviews(self):
        """imdb_reviews maps label 1 to 'positive' and -1 to '<unk>'."""
        ds = tf.data.Dataset.from_tensor_slices({
            'text': ['great movie', 'terrible movie'],
            'label': [1, -1],
        })

        test_utils.assert_dataset(
            preprocessors.imdb_reviews(ds),
            [{'inputs': 'sentiment: great movie', 'targets': 'positive'},
             {'inputs': 'sentiment: terrible movie', 'targets': '<unk>'}])
# Ejemplo n.º 15
    def test_esnli_drop_explanations(self):
        """esnli with drop_explanations=True targets only the label text."""
        example = {
            'premise': 'It is hot.',
            'hypothesis': 'It is sunny.',
            'label': 0,
            'explanation_1': 'hot implies that it is sunny.',
        }

        processed = preprocessors.esnli(
            tf.data.Dataset.from_tensors(example),
            prefix='nli',
            drop_explanations=True)

        test_utils.assert_dataset(
            processed, {
                'inputs': 'nli hypothesis: It is sunny. premise: It is hot.',
                'targets': 'entailment'
            })
# Ejemplo n.º 16
 def test_cos_e(self):
     """cos_e lists the choices and appends the abstractive explanation."""
     example = {
         'question': 'Question?',
         'choices': ['First', 'Second', 'Third'],
         'abstractive_explanation': 'Abstractive explanation.',
         'extractive_explanation': 'Not currently used.',
         'answer': 'First',
     }
     processed = preprocessors.cos_e(tf.data.Dataset.from_tensors(example))
     test_utils.assert_dataset(
         processed, {
             'inputs': ('explain cos_e question: Question? choice: First '
                        'choice: Second choice: Third'),
             'targets': 'First explanation: Abstractive explanation.'
         })
# Ejemplo n.º 17
    def test_rationales_preprocessor_no_explanations(self):
        """extractive_explanations with drop_explanations=True: label only."""
        example = {
            'review': 'This was a terrible movie. Complete waste of time.',
            'label': 0,
            'evidences': ['terrible movie', 'waste of time']
        }

        processed = preprocessors.extractive_explanations(
            tf.data.Dataset.from_tensors(example), drop_explanations=True)

        test_utils.assert_dataset(
            processed, {
                'inputs': ('explain sentiment review: This was a terrible '
                           'movie. Complete waste of time.'),
                'targets': 'negative'
            })
# Ejemplo n.º 18
 def test_cos_e_zero_shot_like_esnli_functools(self):
     """cos_e bound via functools.partial with nli-style prefixes."""
     example = {
         'question': 'Question?',
         'choices': ['First', 'Second', 'Third'],
         'abstractive_explanation': 'Abstractive explanation.',
         'extractive_explanation': 'Not currently used.',
         'answer': 'First',
     }
     make_ds = functools.partial(preprocessors.cos_e,
                                 prefix='explain nli',
                                 question_prefix='premise:')
     test_utils.assert_dataset(
         make_ds(tf.data.Dataset.from_tensors(example)), {
             'inputs': ('explain nli premise: Question? choice: First '
                        'choice: Second choice: Third'),
             'targets': 'First explanation: Abstractive explanation.'
         })
# Ejemplo n.º 19
    def test_esnli_with_choices_like_cos_e(self):
        """esnli with add_choices=True appends the three NLI label choices."""
        input_data = {
            'premise': 'It is hot.',
            'hypothesis': 'It is sunny.',
            'label': 0,
            'explanation_1': 'hot implies that it is sunny.'
        }

        og_dataset = tf.data.Dataset.from_tensors(input_data)
        # Fixed a duplicated assignment (`dataset = dataset = ...`).
        dataset = functools.partial(preprocessors.esnli,
                                    add_choices=True)(og_dataset)

        test_utils.assert_dataset(
            dataset, {
                'inputs':
                ('explain nli hypothesis: It is sunny. premise: It is hot. '
                 'choice: entailment choice: neutral choice: contradiction'),
                'targets':
                'entailment explanation: hot implies that it is sunny.'
            })
# Ejemplo n.º 20
    def test_trivia_qa_open(self):
        """trivia_qa_open targets the answer value and keeps the aliases."""
        example = {
            'question': ['What are the names of the Olsen Twins?'],
            'answer': {
                'value': 'Mary-Kate and Ashley',
                'aliases': ['Mary-Kate and Ashley', 'Ashley and Mary-Kate']
            }
        }

        processed = preprocessors.trivia_qa_open(
            tf.data.Dataset.from_tensors(example))

        test_utils.assert_dataset(
            processed, {
                'inputs':
                'trivia_qa question: What are the names of the Olsen Twins?',
                'targets': 'Mary-Kate and Ashley',
                'answers': ['Mary-Kate and Ashley', 'Ashley and Mary-Kate'],
            })
# Ejemplo n.º 21
    def test_truncate_dataset_on_len(self):
        """truncate_dataset_on_len clips features per len_map.

        Truncation applies during training, is skipped during eval by
        default, and can be forced during eval via truncate_on_eval=True.
        """
        ds = _test_dataset_ints([5, 6, 7], [8, 9, 10])
        truncated = list(
            _test_dataset_ints([5, 6, 6], [4, 4, 4]).as_numpy_iterator())

        # Training: should truncate.
        t5_test_utils.assert_dataset(
            tf_inputs.truncate_dataset_on_len(ds,
                                              True,
                                              len_map={
                                                  'inputs': 6,
                                                  'targets': 4
                                              }),
            truncated)

        # Eval: leaves the data unchanged.
        t5_test_utils.assert_dataset(
            tf_inputs.truncate_dataset_on_len(ds,
                                              False,
                                              len_map={
                                                  'inputs': 6,
                                                  'targets': 4
                                              }),
            list(ds.as_numpy_iterator()))

        # Eval, but explicitly asked to truncate: should truncate.
        t5_test_utils.assert_dataset(
            tf_inputs.truncate_dataset_on_len(ds,
                                              False,
                                              len_map={
                                                  'inputs': 6,
                                                  'targets': 4
                                              },
                                              truncate_on_eval=True),
            truncated)
# Ejemplo n.º 22
    def test_esnli_multiple_explanations(self):
        """esnli concatenates every provided explanation into the target."""
        example = {
            'premise': 'It is hot.',
            'hypothesis': 'It is sunny.',
            'label': 0,
            'explanation_1': 'hot implies that it is sunny.',
            'explanation_2': 'sunny equals hot.',
            'explanation_3': 'hot means sunny.',
        }

        processed = preprocessors.esnli(tf.data.Dataset.from_tensors(example))

        test_utils.assert_dataset(
            processed, {
                'inputs':
                'explain nli hypothesis: It is sunny. premise: It is hot.',
                'targets': ('entailment explanation: hot implies that it is '
                            'sunny. explanation: sunny equals hot. '
                            'explanation: hot means sunny.')
            })
  def test_assert_dataset(self):
    """assert_dataset accepts matching data and rejects mismatches."""
    dataset = tf.data.Dataset.from_tensor_slices(
        {'key1': ['val1'], 'key2': ['val2']})

    # Matching keys and values pass.
    assert_dataset(dataset, {'key1': [b'val1'], 'key2': [b'val2']})

    # A differing value fails.
    with self.assertRaises(AssertionError):
      assert_dataset(dataset, {'key1': [b'val1'], 'key2': [b'val2x']})

    # An extra expected key fails.
    with self.assertRaises(AssertionError):
      assert_dataset(
          dataset,
          {'key1': [b'val1'], 'key2': [b'val2'], 'key3': [b'val3']})
# Ejemplo n.º 24
    def test_natural_questions_nocontext(self):
        """natural_questions_nocontext under its filtering options.

        Covers the default behavior plus drop_yes_no, max_tokens, and
        max_answers, alone and combined. Short-answer annotations are stored
        as (values, row_lengths) pairs and converted to RaggedTensors before
        preprocessing.
        """
        # Three raw examples:
        #   1. no yes/no or short answers -> never appears in any output;
        #   2. a mix of yes/no answers and two short answers;
        #   3. two short answers ('Mary-Kate', 'Ashley'), no yes/no answer.
        input_examples = [{
            'question': {
                'text': 'is the answer to this question no',
            },
            'annotations': {
                'short_answers': {
                    'start_token': ([], [0, 0]),
                    'end_token': ([], [0, 0]),
                    'text': ([], [0, 0])
                },
                'yes_no_answer': [-1, -1]
            }
        }, {
            'question': {
                'text': 'is the answer to this question yes',
            },
            'annotations': {
                'short_answers': {
                    'start_token': ([3, 3], [1, 0, 1, 0]),
                    'end_token': ([7, 5], [1, 0, 1, 0]),
                    'text': (['not sure sir', 'not sure'], [1, 0, 1, 0]),
                },
                'yes_no_answer': [-1, 0, -1, 1]
            }
        }, {
            'question': {
                'text': 'what are the names of the olsen twins',
            },
            'annotations': {
                'short_answers': {
                    'start_token': ([0, 3], [2, 0]),
                    'end_token': ([3, 4], [2, 0]),
                    'text': (['Mary-Kate', 'Ashley'], [2, 0])
                },
                'yes_no_answer': [-1, -1]
            }
        }]

        def _short_ans_to_ragged(ex):
            # Convert each (values, row_lengths) pair into a tf.RaggedTensor.
            for field in ['start_token', 'end_token', 'text']:
                values, row_lengths = ex['annotations']['short_answers'][field]
                ex['annotations']['short_answers'][field] = (
                    tf.RaggedTensor.from_row_lengths(values, row_lengths))
            return ex

        # Generator dataset so the short-answer fields can be ragged
        # (variable length per annotation).
        og_dataset = tf.data.Dataset.from_generator(
            lambda: (x for x in input_examples),
            output_types={
                'question': {
                    'text': tf.string
                },
                'annotations': {
                    'short_answers': {
                        'start_token': (tf.int64, tf.int64),
                        'end_token': (tf.int64, tf.int64),
                        'text': (tf.string, tf.int64)
                    },
                    'yes_no_answer': tf.int64
                }
            },
            output_shapes={
                'question': {
                    'text': []
                },
                'annotations': {
                    'short_answers': {
                        'start_token': ([None], [None]),
                        'end_token': ([None], [None]),
                        'text': ([None], [None]),
                    },
                    'yes_no_answer': [None]
                }
            }).map(_short_ans_to_ragged)

        # Default: yes/no answers and all short answers appear in targets;
        # the answerless first example is dropped.
        dataset = preprocessors.natural_questions_nocontext(og_dataset)
        test_utils.assert_dataset(
            dataset,
            [{
                'inputs': 'nq question: is the answer to this question yes',
                'targets': 'answer: no answer: yes answer: not sure sir '
                'answer: not sure',
                'short_answers/values': ['not sure sir', 'not sure'],
                'short_answers/row_starts': [0, 1, 1, 2],
                'yes_no_answers': [-1, 0, -1, 1],
            }, {
                'inputs': 'nq question: what are the names of the olsen twins',
                'targets': 'answer: Mary-Kate answer: Ashley',
                'short_answers/values': ['Mary-Kate', 'Ashley'],
                'short_answers/row_starts': [0, 2],
                'yes_no_answers': [-1, -1],
            }])

        # drop_yes_no=True: yes/no annotations become -1 and are excluded
        # from targets.
        dataset = preprocessors.natural_questions_nocontext(og_dataset,
                                                            drop_yes_no=True)
        test_utils.assert_dataset(
            dataset, [{
                'inputs': 'nq question: is the answer to this question yes',
                'targets': 'answer: not sure sir answer: not sure',
                'short_answers/values': ['not sure sir', 'not sure'],
                'short_answers/row_starts': [0, 1, 1, 2],
                'yes_no_answers': [-1, -1, -1, -1],
            }, {
                'inputs': 'nq question: what are the names of the olsen twins',
                'targets': 'answer: Mary-Kate answer: Ashley',
                'short_answers/values': ['Mary-Kate', 'Ashley'],
                'short_answers/row_starts': [0, 2],
                'yes_no_answers': [-1, -1],
            }])

        # max_tokens=2: short answers longer than two tokens ('not sure sir',
        # 'Mary-Kate' at tokens [0, 3)) are dropped.
        dataset = preprocessors.natural_questions_nocontext(og_dataset,
                                                            max_tokens=2)
        test_utils.assert_dataset(
            dataset, [{
                'inputs': 'nq question: is the answer to this question yes',
                'targets': 'answer: no answer: yes answer: not sure',
                'short_answers/values': ['not sure'],
                'short_answers/row_starts': [0, 0, 0, 1],
                'yes_no_answers': [-1, 0, -1, 1],
            }, {
                'inputs': 'nq question: what are the names of the olsen twins',
                'targets': 'answer: Ashley',
                'short_answers/values': ['Ashley'],
                'short_answers/row_starts': [0, 1],
                'yes_no_answers': [-1, -1],
            }])

        # max_answers=1: only the first answer goes into targets; the
        # short_answers features themselves are unchanged.
        dataset = preprocessors.natural_questions_nocontext(og_dataset,
                                                            max_answers=1)
        test_utils.assert_dataset(
            dataset, [{
                'inputs': 'nq question: is the answer to this question yes',
                'targets': 'answer: no',
                'short_answers/values': ['not sure sir', 'not sure'],
                'short_answers/row_starts': [0, 1, 1, 2],
                'yes_no_answers': [-1, 0, -1, 1],
            }, {
                'inputs': 'nq question: what are the names of the olsen twins',
                'targets': 'answer: Mary-Kate',
                'short_answers/values': ['Mary-Kate', 'Ashley'],
                'short_answers/row_starts': [0, 2],
                'yes_no_answers': [-1, -1],
            }])

        # All three filters combined.
        dataset = preprocessors.natural_questions_nocontext(og_dataset,
                                                            drop_yes_no=True,
                                                            max_tokens=2,
                                                            max_answers=1)
        test_utils.assert_dataset(
            dataset, [{
                'inputs': 'nq question: is the answer to this question yes',
                'targets': 'answer: not sure',
                'short_answers/values': ['not sure'],
                'short_answers/row_starts': [0, 0, 0, 1],
                'yes_no_answers': [-1, -1, -1, -1],
            }, {
                'inputs': 'nq question: what are the names of the olsen twins',
                'targets': 'answer: Ashley',
                'short_answers/values': ['Ashley'],
                'short_answers/row_starts': [0, 1],
                'yes_no_answers': [-1, -1],
            }])

        # With yes/no dropped and max_tokens=1, the first question has no
        # remaining answers and is filtered out entirely.
        dataset = preprocessors.natural_questions_nocontext(og_dataset,
                                                            drop_yes_no=True,
                                                            max_tokens=1)
        test_utils.assert_dataset(dataset, [{
            'inputs': 'nq question: what are the names of the olsen twins',
            'targets': 'answer: Ashley',
            'short_answers/values': ['Ashley'],
            'short_answers/row_starts': [0, 1],
            'yes_no_answers': [-1, -1],
        }])
# Ejemplo n.º 25
    def test_rank_classification(self):
        """rank_classification in 'eval', 'train', and 'fewshot_train' modes.

        'eval' emits one example per (input, target option) pair; 'train'
        emits only the labeled option; 'fewshot_train' emits the labeled
        option once per option (i.e. repeated).
        """
        input_examples = [
            {
                'premise': 'The farmland needed irrigation.',
                'question': 'effect',
                'choice1': 'a canal was constructed',
                'choice2': 'the crops grew tall',
                'label': 0,
            },
            {
                'premise': 'I decided to stay home last night.',
                'question': 'cause',
                'choice1': 'I wanted to see people',
                'choice2': 'I was too tired',
                'label': 1,
            },
        ]

        input_ds = tf.data.Dataset.from_generator(lambda:
                                                  (x for x in input_examples),
                                                  output_types={
                                                      'premise': tf.string,
                                                      'question': tf.string,
                                                      'choice1': tf.string,
                                                      'choice2': tf.string,
                                                      'label': tf.int32,
                                                  },
                                                  output_shapes={
                                                      'premise': [],
                                                      'question': [],
                                                      'choice1': [],
                                                      'choice2': [],
                                                      'label': [],
                                                  })

        # 'eval' mode: every target option for every example, with the
        # source example's index in 'idx' and its label repeated.
        dataset = prep.rank_classification(
            input_ds,
            inputs_format='{premise} What is the {question}? X',
            targets_formats=['I think {choice1}.', 'I think {choice2}.'],
            mode='eval')

        test_utils.assert_dataset(dataset, [
            {
                'idx': 0,
                'inputs':
                'The farmland needed irrigation. What is the effect? X',
                'targets': 'I think a canal was constructed.',
                'label': 0
            },
            {
                'idx': 0,
                'inputs':
                'The farmland needed irrigation. What is the effect? X',
                'targets': 'I think the crops grew tall.',
                'label': 0
            },
            {
                'idx': 1,
                'inputs':
                'I decided to stay home last night. What is the cause? X',
                'targets': 'I think I wanted to see people.',
                'label': 1
            },
            {
                'idx': 1,
                'inputs':
                'I decided to stay home last night. What is the cause? X',
                'targets': 'I think I was too tired.',
                'label': 1
            },
        ])

        # 'train' mode: only the option selected by 'label' is emitted.
        dataset = prep.rank_classification(
            input_ds,
            inputs_format='{premise} What is the {question}? X',
            targets_formats=['I think {choice1}.', 'I think {choice2}.'],
            mode='train')

        test_utils.assert_dataset(dataset, [
            {
                'idx': 0,
                'inputs':
                'The farmland needed irrigation. What is the effect? X',
                'targets': 'I think a canal was constructed.',
                'label': 0
            },
            {
                'idx': 1,
                'inputs':
                'I decided to stay home last night. What is the cause? X',
                'targets': 'I think I was too tired.',
                'label': 1
            },
        ])

        # 'fewshot_train' mode: the labeled option only, but repeated once
        # per option so each example appears twice.
        dataset = prep.rank_classification(
            input_ds,
            inputs_format='{premise} What is the {question}? X',
            targets_formats=['I think {choice1}.', 'I think {choice2}.'],
            mode='fewshot_train')

        test_utils.assert_dataset(dataset, [
            {
                'idx': 0,
                'inputs':
                'The farmland needed irrigation. What is the effect? X',
                'targets': 'I think a canal was constructed.',
                'label': 0
            },
            {
                'idx': 0,
                'inputs':
                'The farmland needed irrigation. What is the effect? X',
                'targets': 'I think a canal was constructed.',
                'label': 0
            },
            {
                'idx': 1,
                'inputs':
                'I decided to stay home last night. What is the cause? X',
                'targets': 'I think I was too tired.',
                'label': 1
            },
            {
                'idx': 1,
                'inputs':
                'I decided to stay home last night. What is the cause? X',
                'targets': 'I think I was too tired.',
                'label': 1
            },
        ])
# Ejemplo n.º 26
    def test_assert_dataset(self):
        """assert_dataset passes on matches and raises on any mismatch."""
        first_dataset = tf.data.Dataset.from_tensor_slices({
            'key1': ['val1'],
            'key2': ['val2']
        })

        # Equal values pass; expected_dtypes may cover a subset of the keys.
        assert_dataset(first_dataset, {'key1': [b'val1'], 'key2': [b'val2']})
        assert_dataset(first_dataset, {
            'key1': [b'val1'],
            'key2': [b'val2']
        },
                       expected_dtypes={'key1': tf.string})

        # Unequal value
        with self.assertRaises(AssertionError):
            assert_dataset(first_dataset, {
                'key1': [b'val1'],
                'key2': [b'val2x']
            })

        # Wrong dtype
        with self.assertRaises(AssertionError):
            assert_dataset(first_dataset, {
                'key1': [b'val1'],
                'key2': [b'val2']
            },
                           expected_dtypes={'key1': tf.int32})

        # Additional key, value
        with self.assertRaises(AssertionError):
            assert_dataset(first_dataset, {
                'key1': [b'val1'],
                'key2': [b'val2'],
                'key3': [b'val3']
            })

        # NOTE(review): this block is a verbatim duplicate of the previous
        # "Additional key, value" assertion — likely a copy/paste; consider
        # removing one copy.
        with self.assertRaises(AssertionError):
            assert_dataset(first_dataset, {
                'key1': [b'val1'],
                'key2': [b'val2'],
                'key3': [b'val3']
            })