Example #1
0
    def test_numerical_features_no_vocab(self):
        """Numeric fields with use_vocab=False convert raw strings straight
        to tensors; a postprocessing hook, when given, transforms the batch
        first. Checked once without and once with postprocessing."""
        self.write_test_numerical_features_dataset()
        int_samples = ["1", "0", "1", "3", "19"]
        float_samples = ["1.1", "0.1", "3.91", "0.2", "10.2"]

        # (int Field kwargs, float Field kwargs, expected ints, expected floats)
        cases = [
            ({}, {},
             [1, 0, 1, 3, 19],
             [1.1, 0.1, 3.91, 0.2, 10.2]),
            ({"postprocessing": lambda arr, _: [x + 1 for x in arr]},
             {"postprocessing": lambda arr, _: [x * 0.5 for x in arr]},
             [2, 1, 2, 4, 20],
             [0.55, 0.05, 1.955, 0.1, 5.1]),
        ]

        for int_kwargs, float_kwargs, expected_int, expected_float in cases:
            int_field = data.Field(sequential=False, use_vocab=False,
                                   **int_kwargs)
            float_field = data.Field(sequential=False, use_vocab=False,
                                     dtype=torch.float, **float_kwargs)
            field_spec = [("int", int_field), ("float", float_field),
                          ("string", None)]
            dataset = data.TabularDataset(
                path=self.test_numerical_features_dataset_path,
                format="tsv",
                fields=field_spec)
            # build_vocab is a no-op for use_vocab=False fields but is part
            # of the exercised API surface.
            int_field.build_vocab(dataset)
            float_field.build_vocab(dataset)

            assert_allclose(int_field.numericalize(int_samples).data.numpy(),
                            expected_int)
            assert_allclose(
                float_field.numericalize(float_samples).data.numpy(),
                expected_float)
Example #2
0
    def test_numericalize_postprocessing(self):
        """A postprocessing hook runs on the padded batch before tensor
        conversion; here it reverses every sentence, so the numericalized
        output must match the reversed token lists."""
        self.write_test_ppid_dataset(data_format="tsv")

        def flip_sentences(batch, vocab):
            # vocab is unused; only the token order within each sentence
            # is changed.
            return [sentence[::-1] for sentence in batch]

        question_field = data.Field(sequential=True,
                                    postprocessing=flip_sentences)
        field_spec = [("id", None), ("q1", question_field),
                      ("q2", question_field), ("label", None)]

        dataset = data.TabularDataset(path=self.test_ppid_dataset_path,
                                      format="tsv",
                                      fields=field_spec)
        question_field.build_vocab(dataset)

        padded_batch = [
            ["When", "do", "you", "use", "シ", "instead", "of", "し?"],
            ["What", "is", "2+2", "<pad>", "<pad>", "<pad>", "<pad>", "<pad>"],
            ["Here", "is", "a", "sentence", "with", "some", "oovs", "<pad>"]
        ]
        expected = [sentence[::-1] for sentence in padded_batch]

        result = question_field.numericalize(padded_batch)
        verify_numericalized_example(question_field, expected, result)
Example #3
0
    def test_batch_with_missing_field(self):
        # Smoke test: a batch built from examples that lack some attribute
        # should still stringify without raising.
        with open(self.test_missing_field_dataset_path, "wt") as f:
            f.write("text,label\n1,0")

        text_field = data.Field(use_vocab=False, sequential=False)
        dataset = data.TabularDataset(
            path=self.test_missing_field_dataset_path,
            format="csv",
            skip_header=True,
            fields=[("text", text_field), ("label", None)])
        iterator = data.Iterator(dataset, batch_size=64)
        str(next(iter(iterator)))
Example #4
0
    def test_numericalize_basic(self):
        """With default Field settings, numericalize maps pre-padded token
        lists through the built vocab into a tensor."""
        self.write_test_ppid_dataset(data_format="tsv")
        question_field = data.Field(sequential=True)
        field_spec = [("id", None), ("q1", question_field),
                      ("q2", question_field), ("label", None)]
        dataset = data.TabularDataset(path=self.test_ppid_dataset_path,
                                      format="tsv",
                                      fields=field_spec)
        question_field.build_vocab(dataset)

        padded_batch = [
            ["When", "do", "you", "use", "シ", "instead", "of", "し?"],
            ["What", "is", "2+2", "<pad>", "<pad>", "<pad>", "<pad>", "<pad>"],
            ["Here", "is", "a", "sentence", "with", "some", "oovs", "<pad>"]
        ]

        # Default numericalization: no postprocessing, no lengths.
        numericalized = question_field.numericalize(padded_batch)
        verify_numericalized_example(question_field, padded_batch,
                                     numericalized)
Example #5
0
    def test_vocab_size(self):
        """LabelField builds a vocab without <unk>/<pad> specials, so label
        indices start at 0 and the vocab holds only observed labels."""
        question_field = data.Field(sequential=True)
        label_field = data.LabelField()

        # Adapted from test_build_vocab: TSV dataset only; the JSON variant
        # is already covered by the original build-vocab test.
        self.write_test_ppid_dataset(data_format="tsv")
        field_spec = [("id", None), ("q1", question_field),
                      ("q2", question_field), ("label", label_field)]
        dataset = data.TabularDataset(path=self.test_ppid_dataset_path,
                                      format="tsv",
                                      fields=field_spec)

        label_field.build_vocab(dataset)
        assert label_field.vocab.freqs == Counter({'1': 2, '0': 1})
        expected_stoi = {'1': 0, '0': 1}  # No <unk>
        assert dict(label_field.vocab.stoi) == expected_stoi
        # itos is simply stoi inverted, ordered by index.
        expected_itos = sorted(expected_stoi, key=expected_stoi.get)
        assert label_field.vocab.itos == expected_itos
Example #6
0
 def test_errors(self):
     """numericalize must reject plain data when include_lengths=True,
     because it then expects a (data, lengths) tuple."""
     with self.assertRaises(ValueError):
         self.write_test_ppid_dataset(data_format="tsv")
         question_field = data.Field(sequential=True, include_lengths=True)
         field_spec = [("id", None), ("q1", question_field),
                       ("q2", question_field), ("label", None)]
         dataset = data.TabularDataset(path=self.test_ppid_dataset_path,
                                       format="tsv",
                                       fields=field_spec)
         question_field.build_vocab(dataset)
         padded_batch = [
             ["When", "do", "you", "use", "シ", "instead", "of", "し?"],
             ["What", "is", "2+2", "<pad>", "<pad>", "<pad>", "<pad>",
              "<pad>"],
             ["Here", "is", "a", "sentence", "with", "some", "oovs",
              "<pad>"],
         ]
         # Bare list, no lengths tuple: this call should raise ValueError.
         question_field.numericalize(padded_batch)
Example #7
0
    def test_build_vocab(self):
        """build_vocab aggregates token counts across multiple datasets and
        forwards extra kwargs (max_size, min_freq) to Vocab. The default
        build is run twice to confirm rebuilding yields the same vocab."""
        question_field = data.Field(sequential=True)
        label_field = data.Field(sequential=False)

        # One TSV-backed and one JSON-backed dataset over the same examples.
        self.write_test_ppid_dataset(data_format="tsv")
        tsv_fields = [("id", None), ("q1", question_field),
                      ("q2", question_field), ("label", label_field)]
        tsv_dataset = data.TabularDataset(path=self.test_ppid_dataset_path,
                                          format="tsv",
                                          fields=tsv_fields)

        self.write_test_ppid_dataset(data_format="json")
        json_fields = {
            "question1": ("q1", question_field),
            "question2": ("q2", question_field),
            "label": ("label", label_field)
        }
        json_dataset = data.TabularDataset(path=self.test_ppid_dataset_path,
                                           format="json",
                                           fields=json_fields)

        # Counts are doubled because every token appears in both datasets.
        expected_question_freqs = Counter({
            'When': 4,
            'do': 4,
            'you': 4,
            'use': 4,
            'instead': 4,
            'of': 4,
            'was': 4,
            'Lincoln': 4,
            'born?': 4,
            'シ': 2,
            'し?': 2,
            'Where': 2,
            'What': 2,
            'is': 2,
            '2+2': 2,
            '"&"': 2,
            '"and"?': 2,
            'Which': 2,
            'location': 2,
            'Abraham': 2,
            '2+2=?': 2
        })
        # Specials first, then tokens by descending frequency, ties broken
        # lexicographically.
        expected_question_stoi = {
            '<unk>': 0,
            '<pad>': 1,
            'Lincoln': 2,
            'When': 3,
            'born?': 4,
            'do': 5,
            'instead': 6,
            'of': 7,
            'use': 8,
            'was': 9,
            'you': 10,
            '"&"': 11,
            '"and"?': 12,
            '2+2': 13,
            '2+2=?': 14,
            'Abraham': 15,
            'What': 16,
            'Where': 17,
            'Which': 18,
            'is': 19,
            'location': 20,
            'し?': 21,
            'シ': 22
        }
        expected_label_freqs = Counter({'1': 4, '0': 2})
        expected_label_stoi = {'1': 1, '0': 2, '<unk>': 0}

        def check_vocab(field, expected_freqs, expected_stoi):
            # itos is stoi inverted, ordered by index.
            expected_itos = [
                word for word, _ in sorted(expected_stoi.items(),
                                           key=lambda item: item[1])
            ]
            assert field.vocab.freqs == expected_freqs
            assert dict(field.vocab.stoi) == expected_stoi
            assert field.vocab.itos == expected_itos

        # Default build_vocab, performed twice: a rebuild over the same data
        # must reproduce the identical vocab.
        for _ in range(2):
            question_field.build_vocab(tsv_dataset, json_dataset)
            check_vocab(question_field, expected_question_freqs,
                        expected_question_stoi)

            label_field.build_vocab(tsv_dataset, json_dataset)
            check_vocab(label_field, expected_label_freqs,
                        expected_label_stoi)

        # Extra kwargs are forwarded to Vocab: cap the vocab at 8 non-special
        # tokens and drop everything seen fewer than 3 times. freqs still
        # records every token; only stoi/itos shrink.
        question_field.build_vocab(tsv_dataset,
                                   json_dataset,
                                   max_size=8,
                                   min_freq=3)
        check_vocab(question_field, expected_question_freqs, {
            '<unk>': 0,
            '<pad>': 1,
            'Lincoln': 2,
            'When': 3,
            'born?': 4,
            'do': 5,
            'instead': 6,
            'of': 7,
            'use': 8,
            'was': 9
        })