Example 1
    def test_json_valid_and_invalid_nested_key(self):
        self.write_test_nested_key_json_dataset()
        valid_fields = {
            'foods.vegetables.name': ('vegs', data.Field()),
            'foods.fruits': ('fruits', data.Field())
        }
        invalid_fields = {'foods.vegetables.color': ('vegs', data.Field())}

        expected_examples = [{
            "fruits": ["Apple", "Banana"],
            "vegs": ["Broccoli", "Cabbage"]
        }, {
            "fruits": ["Cherry", "Grape", "Lemon"],
            "vegs": ["Cucumber", "Lettuce"]
        }, {
            "fruits": ["Orange", "Pear", "Strawberry"],
            "vegs": ["Marrow", "Spinach"]
        }]
        dataset = data.TabularDataset(
            path=self.test_nested_key_json_dataset_path,
            format="json",
            fields=valid_fields)
        # check results
        for example, expect in zip(dataset.examples, expected_examples):
            self.assertEqual(example.vegs, expect['vegs'])
            self.assertEqual(example.fruits, expect['fruits'])

        with self.assertRaises(ValueError):
            data.TabularDataset(path=self.test_nested_key_json_dataset_path,
                                format="json",
                                fields=invalid_fields)
Example 2
    def test_serialization_built_vocab(self):
        self.write_test_ppid_dataset(data_format="tsv")
        question_field = data.Field(sequential=True)
        tsv_fields = [("id", None), ("q1", question_field),
                      ("q2", question_field), ("label", None)]
        tsv_dataset = data.TabularDataset(
            path=self.test_ppid_dataset_path, format="tsv",
            fields=tsv_fields)

        question_field.build_vocab(tsv_dataset)

        question_pickle_filename = "question.pl"
        question_pickle_path = os.path.join(self.test_dir, question_pickle_filename)
        torch.save(question_field, question_pickle_path)

        loaded_question_field = torch.load(question_pickle_path)

        assert loaded_question_field == question_field

        test_example_data = [["When", "do", "you", "use", "シ",
                              "instead", "of", "し?"],
                             ["What", "is", "2+2", "<pad>", "<pad>",
                              "<pad>", "<pad>", "<pad>"],
                             ["Here", "is", "a", "sentence", "with",
                              "some", "oovs", "<pad>"]]

        # Test results of numericalization
        original_numericalization = question_field.numericalize(test_example_data)
        pickled_numericalization = loaded_question_field.numericalize(test_example_data)

        assert torch.all(torch.eq(original_numericalization, pickled_numericalization))
Example 3
    def get_loaders(self, config):
        train, valid = data.TabularDataset(
            path=config.file_path,
            format='tsv',
            fields=[
                ('label', self.label),
                ('text', self.text),
            ],
        ).split(split_ratio=config.train_ratio)

        self.train_loader, self.valid_loader = data.BucketIterator.splits(
            (train, valid),
            batch_size=config.batch_size,
            device='cuda:{}'.format(config.gpu_id)
            if config.gpu_id >= 0 else 'cpu',
            shuffle=True,
            sort_key=lambda x: len(x.text),
            sort_within_batch=True)

        self.label.build_vocab(train)
        self.text.build_vocab(train,
                              max_size=self.max_vocab,
                              min_freq=self.min_freq)

        return self.train_loader, self.valid_loader
Example 4
    def test_batch_iter(self):
        self.write_test_numerical_features_dataset()
        FLOAT = data.Field(use_vocab=False,
                           sequential=False,
                           dtype=torch.float)
        INT = data.Field(use_vocab=False, sequential=False, is_target=True)
        TEXT = data.Field(sequential=False)

        dst = data.TabularDataset(
            path=self.test_numerical_features_dataset_path,
            format="tsv",
            skip_header=False,
            fields=[("float", FLOAT), ("int", INT), ("text", TEXT)])
        TEXT.build_vocab(dst)
        itr = data.Iterator(dst, batch_size=2, device=-1, shuffle=False)
        fld_order = [
            k for k, v in dst.fields.items()
            if v is not None and not v.is_target
        ]
        batch = next(iter(itr))
        (x1, x2), y = batch
        x = (x1, x2)[fld_order.index("float")]
        self.assertEqual(y.data[0], 1)
        self.assertEqual(y.data[1], 12)
        self.assertAlmostEqual(x.data[0], 0.1, places=4)
        self.assertAlmostEqual(x.data[1], 0.5, places=4)
Example 5
    def test_numericalize_stop_words(self):
        # Based on request from #354
        self.write_test_ppid_dataset(data_format="tsv")
        question_field = data.Field(sequential=True, batch_first=True,
                                    stop_words=set(["do", "you"]))
        tsv_fields = [("id", None), ("q1", question_field),
                      ("q2", question_field), ("label", None)]
        tsv_dataset = data.TabularDataset(
            path=self.test_ppid_dataset_path, format="tsv",
            fields=tsv_fields)
        question_field.build_vocab(tsv_dataset)

        test_example_data = question_field.pad(
            [question_field.preprocess(x) for x in
             [["When", "do", "you", "use", "シ",
               "instead", "of", "し?"],
              ["What", "is", "2+2", "<pad>", "<pad>",
               "<pad>", "<pad>", "<pad>"],
              ["Here", "is", "a", "sentence", "with",
               "some", "oovs", "<pad>"]]]
        )

        # Test with batch_first
        stopwords_removed_numericalized = question_field.numericalize(test_example_data)
        verify_numericalized_example(question_field,
                                     test_example_data,
                                     stopwords_removed_numericalized,
                                     batch_first=True)
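
verify_numericalized_example is a shared test helper that does not appear in these snippets. Assuming it simply checks each token against its vocabulary index (signature inferred from the calls in these examples), a minimal sketch might be:

def verify_numericalized_example(field, test_example_data, numericalized,
                                 test_example_lengths=None,
                                 batch_first=False):
    # When include_lengths=True, numericalize returns (tensor, lengths).
    if field.include_lengths:
        numericalized, lengths = numericalized
        assert lengths.tolist() == test_example_lengths
    # Default layout is (seq_len, batch); transpose to iterate per example.
    if not batch_first:
        numericalized = numericalized.t()
    for tokens, numericalized_tokens in zip(test_example_data, numericalized):
        for token, index in zip(tokens, numericalized_tokens):
            assert field.vocab.stoi[token] == index.item()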
Example 6
    def test_csv_dataset_quotechar(self):
        # Based on issue #349
        example_data = [("text", "label"), ('" hello world', "0"),
                        ('goodbye " world', "1"), ('this is a pen " ', "0")]

        with tempfile.NamedTemporaryFile(dir=self.test_dir) as f:
            for example in example_data:
                f.write("{}\n".format(",".join(example)).encode("latin-1"))

            TEXT = data.Field(lower=True, tokenize=lambda x: x.split())
            fields = {
                "label": ("label", data.Field(use_vocab=False,
                                              sequential=False)),
                "text": ("text", TEXT)
            }

            f.seek(0)

            dataset = data.TabularDataset(
                path=f.name,
                format="csv",
                skip_header=False,
                fields=fields,
                csv_reader_params={"quotechar": None})

            TEXT.build_vocab(dataset)

            self.assertEqual(len(dataset), len(example_data) - 1)

            for i, example in enumerate(dataset):
                self.assertEqual(example.text,
                                 example_data[i + 1][0].lower().split())
                self.assertEqual(example.label, example_data[i + 1][1])
Example 7
    def test_numericalize_include_lengths(self):
        self.write_test_ppid_dataset(data_format="tsv")
        question_field = data.Field(sequential=True, include_lengths=True)
        tsv_fields = [("id", None), ("q1", question_field),
                      ("q2", question_field), ("label", None)]
        tsv_dataset = data.TabularDataset(
            path=self.test_ppid_dataset_path, format="tsv",
            fields=tsv_fields)
        question_field.build_vocab(tsv_dataset)

        test_example_data = [["When", "do", "you", "use", "シ",
                              "instead", "of", "し?"],
                             ["What", "is", "2+2", "<pad>", "<pad>",
                              "<pad>", "<pad>", "<pad>"],
                             ["Here", "is", "a", "sentence", "with",
                              "some", "oovs", "<pad>"]]
        test_example_lengths = [8, 3, 7]

        # Test with include_lengths
        include_lengths_numericalized = question_field.numericalize(
            (test_example_data, test_example_lengths))
        verify_numericalized_example(question_field,
                                     test_example_data,
                                     include_lengths_numericalized,
                                     test_example_lengths)
Example 8
    def test_numericalize_postprocessing(self):
        self.write_test_ppid_dataset(data_format="tsv")

        def reverse_postprocess(arr, vocab):
            return [list(reversed(sentence)) for sentence in arr]

        question_field = data.Field(sequential=True,
                                    postprocessing=reverse_postprocess)
        tsv_fields = [("id", None), ("q1", question_field),
                      ("q2", question_field), ("label", None)]

        tsv_dataset = data.TabularDataset(
            path=self.test_ppid_dataset_path, format="tsv",
            fields=tsv_fields)
        question_field.build_vocab(tsv_dataset)

        test_example_data = [["When", "do", "you", "use", "シ",
                              "instead", "of", "し?"],
                             ["What", "is", "2+2", "<pad>", "<pad>",
                              "<pad>", "<pad>", "<pad>"],
                             ["Here", "is", "a", "sentence", "with",
                              "some", "oovs", "<pad>"]]
        reversed_test_example_data = [list(reversed(sentence)) for sentence in
                                      test_example_data]

        postprocessed_numericalized = question_field.numericalize(
            (test_example_data))
        verify_numericalized_example(question_field,
                                     reversed_test_example_data,
                                     postprocessed_numericalized)
Example 9
    def test_csv_file_with_header(self):
        example_with_header = [("text", "label"), ("HELLO WORLD", "0"),
                               ("goodbye world", "1")]

        TEXT = data.Field(lower=True, tokenize=lambda x: x.split())
        fields = {
            "label": ("label", data.Field(use_vocab=False, sequential=False)),
            "text": ("text", TEXT)
        }

        for format_, delim in zip(["csv", "tsv"], [",", "\t"]):
            with open(self.test_has_header_dataset_path, "wt") as f:
                for line in example_with_header:
                    f.write("{}\n".format(delim.join(line)))

            # check that an error is raised here if a non-existent field is specified
            with self.assertRaises(ValueError):
                data.TabularDataset(
                    path=self.test_has_header_dataset_path,
                    format=format_,
                    fields={"non_existent": ("label", data.Field())})

            dataset = data.TabularDataset(
                path=self.test_has_header_dataset_path,
                format=format_,
                skip_header=False,
                fields=fields)

            TEXT.build_vocab(dataset)

            for i, example in enumerate(dataset):
                self.assertEqual(example.text,
                                 example_with_header[i + 1][0].lower().split())
                self.assertEqual(example.label, example_with_header[i + 1][1])

            # check that the vocabulary is built correctly (#225)
            expected_freqs = {"hello": 1, "world": 2, "goodbye": 1, "text": 0}
            for k, v in expected_freqs.items():
                self.assertEqual(TEXT.vocab.freqs[k], v)

            data_iter = data.Iterator(dataset,
                                      batch_size=1,
                                      sort_within_batch=False,
                                      repeat=False)
            next(data_iter.__iter__())
Example 10
def load_dataset(config,
                 train_pos='train.hh',
                 train_neg='train.fb',
                 dev_pos='dev.hh',
                 dev_neg='dev.fb',
                 test_pos='test.hh',
                 test_neg='test.fb'):
    logger = logging.getLogger(__name__)
    root = config.data_path
    TEXT = data.Field(batch_first=True, eos_token='<eos>')

    dataset_fn = lambda name: data.TabularDataset(
        path=root + name, format='tsv', fields=[('text', TEXT)])

    train_pos_set, train_neg_set = map(dataset_fn, [train_pos, train_neg])
    dev_pos_set, dev_neg_set = map(dataset_fn, [dev_pos, dev_neg])
    test_pos_set, test_neg_set = map(dataset_fn, [test_pos, test_neg])

    TEXT.build_vocab(train_pos_set, train_neg_set, min_freq=config.min_freq)

    if config.load_pretrained_embed:
        start = time.time()

        vectors = torchtext.vocab.GloVe('6B',
                                        dim=config.embed_size,
                                        cache=config.pretrained_embed_path)
        TEXT.vocab.set_vectors(vectors.stoi, vectors.vectors, vectors.dim)
        logger.info('vectors %s', TEXT.vocab.vectors.size())

        logger.info('load embedding took {:.2f} s.'.format(time.time() -
                                                           start))

    vocab = TEXT.vocab

    dataiter_fn = lambda dataset, train: data.BucketIterator(
        dataset=dataset,
        batch_size=config.batch_size,
        shuffle=train,
        repeat=train,
        sort_key=lambda x: len(x.text),
        sort_within_batch=False,
        device=config.device)

    train_pos_iter, train_neg_iter = map(lambda x: dataiter_fn(x, True),
                                         [train_pos_set, train_neg_set])
    dev_pos_iter, dev_neg_iter = map(lambda x: dataiter_fn(x, False),
                                     [dev_pos_set, dev_neg_set])
    test_pos_iter, test_neg_iter = map(lambda x: dataiter_fn(x, False),
                                       [test_pos_set, test_neg_set])

    train_iters = DatasetIterator(train_pos_iter, train_neg_iter)
    dev_iters = DatasetIterator(dev_pos_iter, dev_neg_iter)
    test_iters = DatasetIterator(test_pos_iter, test_neg_iter)

    return train_iters, dev_iters, test_iters, vocab
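
DatasetIterator comes from the same project and is not shown. Given how it is constructed (one positive and one negative BucketIterator per split), a plausible minimal version simply pairs batches from the two iterators; the attribute names below are assumptions.

class DatasetIterator(object):
    """Hypothetical wrapper that yields (positive_batch, negative_batch) pairs."""

    def __init__(self, pos_iter, neg_iter):
        self.pos_iter = pos_iter
        self.neg_iter = neg_iter

    def __iter__(self):
        # Pair up one positive and one negative batch per step.
        for batch_pos, batch_neg in zip(iter(self.pos_iter), iter(self.neg_iter)):
            yield batch_pos.text, batch_neg.text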
Example 11
    def test_tabular_simple_data(self):
        for data_format in ["csv", "tsv", "json"]:
            self.write_test_ppid_dataset(data_format=data_format)

            if data_format == "json":
                question_field = data.Field(sequential=True)
                label_field = data.Field(sequential=False)
                fields = {
                    "question1": ("q1", question_field),
                    "question2": ("q2", question_field),
                    "label": ("label", label_field)
                }
            else:
                question_field = data.Field(sequential=True)
                label_field = data.Field(sequential=False)
                fields = [("id", None), ("q1", question_field),
                          ("q2", question_field), ("label", label_field)]

            dataset = data.TabularDataset(path=self.test_ppid_dataset_path,
                                          format=data_format,
                                          fields=fields)

            assert len(dataset) == 3

            expected_examples = [
                (["When", "do", "you", "use", "シ", "instead", "of", "し?"], [
                    "When", "do", "you", "use", "\"&\"", "instead", "of",
                    "\"and\"?"
                ], "0"),
                (["Where", "was", "Lincoln", "born?"],
                 ["Which", "location", "was", "Abraham", "Lincoln",
                  "born?"], "1"), (["What", "is", "2+2"], ["2+2=?"], "1")
            ]

            # Ensure examples have correct contents / test __getitem__
            for i in range(len(dataset)):
                self.assertEqual(dataset[i].q1, expected_examples[i][0])
                self.assertEqual(dataset[i].q2, expected_examples[i][1])
                self.assertEqual(dataset[i].label, expected_examples[i][2])

            # Test __getattr__
            for i, (q1, q2, label) in enumerate(
                    zip(dataset.q1, dataset.q2, dataset.label)):
                self.assertEqual(q1, expected_examples[i][0])
                self.assertEqual(q2, expected_examples[i][1])
                self.assertEqual(label, expected_examples[i][2])

            # Test __iter__
            for i, example in enumerate(dataset):
                self.assertEqual(example.q1, expected_examples[i][0])
                self.assertEqual(example.q2, expected_examples[i][1])
                self.assertEqual(example.label, expected_examples[i][2])
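
Many of these tests call write_test_ppid_dataset, which is not reproduced in this listing. Working backwards from expected_examples above, the TSV variant presumably writes three rows of (id, question1, question2, label); the sketch below is a hypothetical reconstruction (the id values in particular are made up).

    def write_test_ppid_dataset(self, data_format="csv"):
        # Hypothetical reconstruction of the paraphrase-pair fixture.
        rows = [
            ("0", "When do you use シ instead of し?",
             'When do you use "&" instead of "and"?', "0"),
            ("1", "Where was Lincoln born?",
             "Which location was Abraham Lincoln born?", "1"),
            ("2", "What is 2+2", "2+2=?", "1"),
        ]
        if data_format == "tsv":
            with open(self.test_ppid_dataset_path, "w") as f:
                for row in rows:
                    f.write("\t".join(row) + "\n")
        # (the csv and json variants would write the same rows in those formats)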
Example 12
def create_dataset(opt, SRC, TRG):

    print("creating dataset and iterator... ")

    raw_data = {
        'src': [line for line in opt.src_data],
        'trg': [line for line in opt.trg_data]
    }
    df = pd.DataFrame(raw_data, columns=["src", "trg"])

    mask = (df['src'].str.count(' ') <
            opt.max_strlen) & (df['trg'].str.count(' ') < opt.max_strlen)
    df = df.loc[mask]

    df.to_csv("translate_transformer_temp.csv", index=False)

    data_fields = [('src', SRC), ('trg', TRG)]
    train = data.TabularDataset('./translate_transformer_temp.csv',
                                format='csv',
                                fields=data_fields)

    train_iter = MyIterator(train,
                            batch_size=opt.batchsize,
                            device=opt.device,
                            repeat=False,
                            sort_key=lambda x: (len(x.src), len(x.trg)),
                            batch_size_fn=batch_size_fn,
                            train=True,
                            shuffle=True)

    os.remove('translate_transformer_temp.csv')

    if opt.load_weights is None:
        SRC.build_vocab(train)
        TRG.build_vocab(train)
        if opt.checkpoint > 0:
            try:
                os.mkdir("weights")
            except FileExistsError:
                print(
                    "weights folder already exists, run program with -load_weights weights to load them"
                )
                quit()
            pickle.dump(SRC, open('weights/SRC.pkl', 'wb'))
            pickle.dump(TRG, open('weights/TRG.pkl', 'wb'))

    opt.src_pad = SRC.vocab.stoi['<pad>']
    opt.trg_pad = TRG.vocab.stoi['<pad>']

    opt.train_len = get_len(train_iter)

    return train_iter
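
MyIterator and batch_size_fn are defined elsewhere in that project. A common implementation of batch_size_fn, adapted from the Annotated Transformer, measures batch size in tokens rather than examples; treat the following as a sketch of the likely behaviour, not the project's exact code.

# Token-count based dynamic batching (sketch).
max_src_in_batch, max_tgt_in_batch = 0, 0

def batch_size_fn(new, count, sofar):
    # Track the longest source/target seen so far in the current batch and
    # report the batch "size" as the padded token count.
    global max_src_in_batch, max_tgt_in_batch
    if count == 1:
        max_src_in_batch, max_tgt_in_batch = 0, 0
    max_src_in_batch = max(max_src_in_batch, len(new.src))
    max_tgt_in_batch = max(max_tgt_in_batch, len(new.trg) + 2)  # +2 for <sos>/<eos>
    return max(count * max_src_in_batch, count * max_tgt_in_batch)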
Example 13
    def test_numerical_features_no_vocab(self):
        self.write_test_numerical_features_dataset()
        # Test basic usage
        int_field = data.Field(sequential=False, use_vocab=False)
        float_field = data.Field(sequential=False, use_vocab=False,
                                 dtype=torch.float)
        tsv_fields = [("int", int_field), ("float", float_field), ("string", None)]
        tsv_dataset = data.TabularDataset(
            path=self.test_numerical_features_dataset_path, format="tsv",
            fields=tsv_fields)
        int_field.build_vocab(tsv_dataset)
        float_field.build_vocab(tsv_dataset)
        test_int_data = ["1", "0", "1", "3", "19"]
        test_float_data = ["1.1", "0.1", "3.91", "0.2", "10.2"]

        numericalized_int = int_field.numericalize(test_int_data)
        self.assertEqual(numericalized_int.data, [1, 0, 1, 3, 19])
        numericalized_float = float_field.numericalize(test_float_data)
        self.assertEqual(numericalized_float.data, [1.1, 0.1, 3.91, 0.2, 10.2])

        # Test with postprocessing applied
        int_field = data.Field(sequential=False, use_vocab=False,
                               postprocessing=lambda arr, _: [x + 1 for x in arr])
        float_field = data.Field(sequential=False, use_vocab=False,
                                 dtype=torch.float,
                                 postprocessing=lambda arr, _: [x * 0.5 for x in arr])
        tsv_fields = [("int", int_field), ("float", float_field), ("string", None)]
        tsv_dataset = data.TabularDataset(
            path=self.test_numerical_features_dataset_path, format="tsv",
            fields=tsv_fields)
        int_field.build_vocab(tsv_dataset)
        float_field.build_vocab(tsv_dataset)
        test_int_data = ["1", "0", "1", "3", "19"]
        test_float_data = ["1.1", "0.1", "3.91", "0.2", "10.2"]

        numericalized_int = int_field.numericalize(test_int_data)
        self.assertEqual(numericalized_int.data, [2, 1, 2, 4, 20])
        numericalized_float = float_field.numericalize(test_float_data)
        self.assertEqual(numericalized_float.data, [0.55, 0.05, 1.955, 0.1, 5.1])
Example 14
    def test_batch_with_missing_field(self):
        # smoke test to see if batches with missing attributes are shown properly
        with open(self.test_missing_field_dataset_path, "wt") as f:
            f.write("text,label\n1,0")

        dst = data.TabularDataset(path=self.test_missing_field_dataset_path,
                                  format="csv",
                                  skip_header=True,
                                  fields=[("text",
                                           data.Field(use_vocab=False,
                                                      sequential=False)),
                                          ("label", None)])
        itr = data.Iterator(dst, batch_size=64)
        str(next(itr.__iter__()))
Example 15
    def test_errors(self):
        # Ensure that trying to retrieve a key not in JSON data errors
        self.write_test_ppid_dataset(data_format="json")

        question_field = data.Field(sequential=True)
        label_field = data.Field(sequential=False)
        fields = {
            "qeustion1": ("q1", question_field),
            "question2": ("q2", question_field),
            "label": ("label", label_field)
        }

        with self.assertRaises(ValueError):
            data.TabularDataset(path=self.test_ppid_dataset_path,
                                format="json",
                                fields=fields)
Example 16
def vocabulary_generator():
    tweet = data.Field(sequential=True,
                       tokenize='spacy',
                       tokenizer_language='en_core_web_sm',
                       include_lengths=True)
    target = data.Field(sequential=False, use_vocab=False)
    fields = {'Tweets': ('t', tweet), 'Target': ('s', target)}
    train_data = data.TabularDataset(path="./clean_train_csv.csv",
                                     format="csv",
                                     fields=fields)
    tweet.build_vocab(train_data,
                      max_size=10000,
                      vectors="glove.6B.100d",
                      unk_init=torch.Tensor.normal_,
                      min_freq=1)
    with open("./TEXT.Field", "wb") as f:
        dill.dump(tweet, f)
    with open("./TEST.Field", "wb") as f:
        dill.dump(target, f)
Example 17
def test_intent():
    config = tm.Config()

    text_field = data.Field(lower=True, tokenize=tokenize)
    label_field = data.Field(sequential=False)
    fields = [('text', text_field), ('label', label_field)]

    train_dataset, val_dataset = data.TabularDataset.splits(
        path='./',
        format='csv',
        skip_header=True,
        train=train_data_path,
        test=val_data_path,
        fields=fields)
    vectors = Vectors(name="./model/word2vec")
    text_field.build_vocab(train_dataset,
                           val_dataset,
                           min_freq=1,
                           vectors=vectors)

    label_field.build_vocab(train_dataset, val_dataset)

    test_dataset = data.TabularDataset(path=test_data_path,
                                       format='csv',
                                       fields=fields,
                                       skip_header=True)
    test_iter = data.Iterator(test_dataset,
                              batch_size=config.batch_size,
                              sort_key=lambda x: len(x.text))

    print('Loading model from {}...'.format(config.snapshot))
    embed_num = len(text_field.vocab)
    class_num = len(label_field.vocab) - 1
    kernel_sizes = [int(k) for k in config.kernel_sizes.split(',')]

    config.snapshot = './model/snapshot/best_steps_200.pt'

    cnn = tm.TextCnn(embed_num, config.embed_dim, class_num, config.kernel_num,
                     kernel_sizes, config.dropout)
    cnn.load_state_dict(tm.torch.load(config.snapshot))

    summary_predict(cnn, text_field, label_field)
Example 18
    def test_errors(self):
        # Test that passing a non-tuple (of data and length) to numericalize
        # with Field.include_lengths = True raises an error.
        with self.assertRaises(ValueError):
            self.write_test_ppid_dataset(data_format="tsv")
            question_field = data.Field(sequential=True, include_lengths=True)
            tsv_fields = [("id", None), ("q1", question_field),
                          ("q2", question_field), ("label", None)]
            tsv_dataset = data.TabularDataset(
                path=self.test_ppid_dataset_path, format="tsv",
                fields=tsv_fields)
            question_field.build_vocab(tsv_dataset)
            test_example_data = [["When", "do", "you", "use", "シ",
                                  "instead", "of", "し?"],
                                 ["What", "is", "2+2", "<pad>", "<pad>",
                                  "<pad>", "<pad>", "<pad>"],
                                 ["Here", "is", "a", "sentence", "with",
                                  "some", "oovs", "<pad>"]]
            question_field.numericalize(
                test_example_data)
Example 19
    def test_input_with_newlines_in_text(self):
        # Smoke test for ensuring that TabularDataset works with files with newlines
        example_with_newlines = [("\"hello \n world\"", "1"),
                                 ("\"there is a \n newline\"", "0"),
                                 ("\"there is no newline\"", "1")]
        fields = [("text", data.Field(lower=True)),
                  ("label", data.Field(sequential=False))]

        for delim in [",", "\t"]:
            with open(self.test_newline_dataset_path, "wt") as f:
                for line in example_with_newlines:
                    f.write("{}\n".format(delim.join(line)))

            format_ = "csv" if delim == "," else "tsv"
            dataset = data.TabularDataset(path=self.test_newline_dataset_path,
                                          format=format_,
                                          fields=fields)
            # if the newline is not parsed correctly, this should raise an error
            for example in dataset:
                self.assertTrue(hasattr(example, "text"))
                self.assertTrue(hasattr(example, "label"))
Example 20
    def test_vocab_size(self):
        # Set up fields
        question_field = data.Field(sequential=True)
        label_field = data.LabelField()

        # Copied from test_build_vocab with minor changes
        # Write TSV dataset and construct a Dataset
        self.write_test_ppid_dataset(data_format="tsv")
        tsv_fields = [("id", None), ("q1", question_field),
                      ("q2", question_field), ("label", label_field)]
        tsv_dataset = data.TabularDataset(
            path=self.test_ppid_dataset_path, format="tsv",
            fields=tsv_fields)

        # Skipping json dataset as we can rely on the original build vocab test
        label_field.build_vocab(tsv_dataset)
        assert label_field.vocab.freqs == Counter({'1': 2, '0': 1})
        expected_stoi = {'1': 0, '0': 1}  # No <unk>
        assert dict(label_field.vocab.stoi) == expected_stoi
        # Turn the stoi dictionary into an itos list
        expected_itos = [x[0] for x in sorted(expected_stoi.items(),
                                              key=lambda tup: tup[1])]
        assert label_field.vocab.itos == expected_itos
Example 21
def generate_best_model(df_train, df_valid):
    tsv_train = pd.DataFrame()
    tsv_train['text'] = df_train['text']
    tsv_train['label'] = df_train['class']
    tsv_train['metadata'] = df_train['metadata']
    tsv_train.to_csv('train.tsv', sep='\t', index=False)

    tsv_valid = pd.DataFrame()
    tsv_valid['text'] = df_valid['text']
    tsv_valid['label'] = df_valid['class']
    tsv_valid['metadata'] = df_valid['metadata']
    tsv_valid.to_csv('valid.tsv', sep='\t', index=False)

    SEED = 1234
    torch.manual_seed(SEED)
    torch.backends.cudnn.deterministic = True

    train_data = data.TabularDataset(path='train.tsv',
                                     format='tsv',
                                     fields=fields,
                                     skip_header=True)
    valid_data = data.TabularDataset(path='valid.tsv',
                                     format='tsv',
                                     fields=fields,
                                     skip_header=True)

    MAX_VOCAB_SIZE = 25_000
    TEXT.build_vocab(train_data,
                     max_size=MAX_VOCAB_SIZE,
                     vectors=vectors,
                     unk_init=torch.Tensor.normal_)
    LABEL.build_vocab(train_data)
    BATCH_SIZE = 32
    train_iterator, valid_iterator = data.BucketIterator.splits(
        (train_data, valid_data),
        batch_size=BATCH_SIZE,
        device=device,
        sort=False)

    INPUT_DIM = len(TEXT.vocab)
    EMBEDDING_DIM = 100
    N_FILTERS = 100
    FILTER_SIZES = [2, 3, 4]
    OUTPUT_DIM = len(LABEL.vocab)
    DROPOUT = 0.5
    PAD_IDX = TEXT.vocab.stoi[TEXT.pad_token]
    model = CNN(INPUT_DIM, EMBEDDING_DIM, N_FILTERS, FILTER_SIZES, OUTPUT_DIM,
                DROPOUT)
    UNK_IDX = TEXT.vocab.stoi[TEXT.unk_token]
    model.embedding.weight.data[UNK_IDX] = torch.zeros(EMBEDDING_DIM)
    model.embedding.weight.data[PAD_IDX] = torch.zeros(EMBEDDING_DIM)
    optimizer = optim.Adam(model.parameters())
    criterion = nn.CrossEntropyLoss()
    model = model.to(device)
    criterion = criterion.to(device)

    def categorical_accuracy(preds, y):
        top_pred = preds.argmax(1, keepdim=True)
        correct = top_pred.eq(y.view_as(top_pred)).sum()
        acc = correct.float() / y.shape[0]
        return acc

    def train(model, iterator, optimizer, criterion):
        epoch_loss = 0
        epoch_acc = 0
        model.train()

        for batch in tqdm(iterator):
            optimizer.zero_grad()
            predictions = model(batch.text, batch.metadata)
            loss = criterion(predictions, batch.label)
            acc = categorical_accuracy(predictions, batch.label)
            loss.backward()
            optimizer.step()
            epoch_loss += loss.item()
            epoch_acc += acc.item()
        return epoch_loss / len(iterator), epoch_acc / len(iterator)

    def evaluate(model, iterator, criterion):
        epoch_loss = 0
        epoch_acc = 0
        model.eval()

        with torch.no_grad():
            for batch in iterator:
                predictions = model(batch.text, batch.metadata)
                loss = criterion(predictions, batch.label)
                acc = categorical_accuracy(predictions, batch.label)
                epoch_loss += loss.item()
                epoch_acc += acc.item()

        return epoch_loss / len(iterator), epoch_acc / len(iterator)

    def epoch_time(start_time, end_time):
        elapsed_time = end_time - start_time
        elapsed_mins = int(elapsed_time / 60)
        elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
        return elapsed_mins, elapsed_secs

    N_EPOCHS = 10
    best_valid_loss = float('inf')
    for epoch in range(N_EPOCHS):
        start_time = time.time()
        train_loss, train_acc = train(model, train_iterator, optimizer,
                                      criterion)
        valid_loss, valid_acc = evaluate(model, valid_iterator, criterion)
        end_time = time.time()
        epoch_mins, epoch_secs = epoch_time(start_time, end_time)

        if valid_loss < best_valid_loss:
            best_valid_loss = valid_loss
            torch.save(model.state_dict(), 'model.pt')

        print(
            f'Epoch: {epoch + 1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
        print(
            f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc * 100:.2f}%'
        )
        print(
            f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc * 100:.2f}%'
        )
    model.load_state_dict(torch.load('model.pt'))
    return model, TEXT.vocab, LABEL.vocab.itos
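
The function above relies on module-level names that are not shown: TEXT, LABEL, fields, vectors, device, and the CNN model class. A minimal sketch of that setup, consistent with the (text, label, metadata) column order written to the TSV files, might look like this; apart from the column names, every detail is an assumption.

# Hypothetical module-level setup assumed by generate_best_model().
TEXT = data.Field(tokenize='spacy', batch_first=True)
METADATA = data.Field(batch_first=True)
LABEL = data.LabelField()
fields = [('text', TEXT), ('label', LABEL), ('metadata', METADATA)]
vectors = "glove.6B.100d"
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')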
Example 22
    def __init__(
        self, root, 
        train_path, test_path, predict_path,
        batch_size=64,
        valid_ratio=.2,
        max_vocab=999999,
        min_freq=1,
        use_eos=False,
        shuffle=True,
        rm = re.compile('[:;\'\"\[\]\(\)\.,@]')  # special characters to strip
    ):
        super().__init__()
        # Preprocessing is handled here.
        # Define the data Fields.
        self.id = data.Field(  # column not used for training
            sequential=False, 
            use_vocab=False,
            unk_token=None
        )
        self.text = data.Field( 
            use_vocab=True,
            tokenize=word_tokenize,
            batch_first=True,
            include_lengths=False,
            eos_token='<EOS>' if use_eos else None
        )
        self.label = data.Field(
            sequential=False, # 0 or 1
            use_vocab=False,
            unk_token=None,
            is_target=True
        )
        
        # Load the data
        # ratings_train.txt : train+valid
        train, valid = data.TabularDataset(
            path = root + train_path,
            format ='tsv',
            fields = [
                ('id', self.id),
                ('text', self.text),
                ('label', self.label)],
            skip_header=True
        ).split(split_ratio=(1 - valid_ratio))

        # ratings_test.txt : test
        test = data.TabularDataset(
            path = root + test_path,
            format='tsv',
            fields=[
                ('id', self.id),
                ('text', self.text),
                ('label', self.label)],
            skip_header=True
        )

        # ko_data.csv : Kaggle commit
        predict = data.TabularDataset(
            path = root + predict_path,
            format='csv',
            fields=[
                ('id', self.id),
                ('text', self.text)],
            skip_header=True
        )

        # Batchify (wrap in DataLoaders)
        # train+valid loader
        self.train_loader, self.valid_loader = data.BucketIterator.splits(
            (train, valid),
            batch_size=batch_size,
            device=torch.device('cuda' if torch.cuda.is_available() else 'cpu'),
            shuffle=shuffle,
            sort_key=lambda x: len(x.text),  # sort by length before batching
            sort_within_batch=True,  # sort within each mini-batch
        )

        # test_loader
        self.test_loader = data.BucketIterator(
            test,
            batch_size=batch_size,
            device=torch.device('cuda' if torch.cuda.is_available() else 'cpu'),
            shuffle=False,
            sort_key=lambda x: len(x.text),
            sort_within_batch=False,
        )

        # predict_loader
        self.predict_loader = data.BucketIterator(
            predict,
            batch_size=batch_size,
            device=torch.device('cuda' if torch.cuda.is_available() else 'cpu'),
            shuffle=False
        )

        self.label.build_vocab(train)
        self.text.build_vocab(train, max_size=max_vocab, min_freq=min_freq) # vocabulary set build
Example 23
fpLabelP1Valid = fopOutputML + 'testP.label.p1.txt'
fpLabelP2Valid = fopOutputML + 'testP.label.p2.txt'
fpLabelP3Valid = fopOutputML + 'testP.label.p3.txt'

# fpTest = fopRoot + 'test.csv'
# fpTextTest = fopRoot + 'testW.text.txt'
fpLabelP1Test = fopOutputML + 'testW.label.p1.txt'
fpLabelP2Test = fopOutputML + 'testW.label.p2.txt'
fpLabelP3Test = fopOutputML + 'testW.label.p3.txt'
sys.stdout = open(fpResultDetails, 'w')
TEXT = data.Field(tokenize='spacy', batch_first=True, include_lengths=True)
LABEL = data.LabelField(dtype=torch.long, batch_first=True, use_vocab=True)
fields = [('label', LABEL), ('text', TEXT)]
# loading custom dataset p1
train_data = data.TabularDataset(path=fpLabelP1Train,
                                 format='csv',
                                 fields=fields,
                                 skip_header=True)
valid_data = data.TabularDataset(path=fpLabelP1Valid,
                                 format='csv',
                                 fields=fields,
                                 skip_header=True)
test_data = data.TabularDataset(path=fpLabelP1Test,
                                format='csv',
                                fields=fields,
                                skip_header=True)
acc_p1 = trainAndEval(train_data, valid_data, test_data)

TEXT = data.Field(tokenize='spacy', batch_first=True, include_lengths=True)
LABEL = data.LabelField(dtype=torch.long, batch_first=True, use_vocab=True)
fields = [('label', LABEL), ('text', TEXT)]
# loading custom dataset p2
Example 24
def main():
    print("Using device: {}" "\n".format(str(device)))

    # Load the training dataset, and create a dataloader to generate a batch.
    textField = data.Field(lower=True,
                           include_lengths=True,
                           batch_first=True,
                           tokenize=student.tokenise,
                           preprocessing=student.preprocessing,
                           postprocessing=student.postprocessing,
                           stop_words=student.stopWords)
    labelField = data.Field(sequential=False, use_vocab=False, is_target=True)

    dataset = data.TabularDataset(
        'train.json', 'json', {
            'reviewText': ('reviewText', textField),
            'rating': ('rating', labelField),
            'businessCategory': ('businessCategory', labelField)
        })

    textField.build_vocab(dataset, vectors=student.wordVectors)

    # Allow training on the entire dataset, or split it for training and validation.
    if student.trainValSplit == 1:
        trainLoader = data.BucketIterator(dataset,
                                          shuffle=True,
                                          batch_size=student.batchSize,
                                          sort_key=lambda x: len(x.reviewText),
                                          sort_within_batch=True)
    else:
        train, validate = dataset.split(split_ratio=student.trainValSplit)

        trainLoader, valLoader = data.BucketIterator.splits(
            (train, validate),
            shuffle=True,
            batch_size=student.batchSize,
            sort_key=lambda x: len(x.reviewText),
            sort_within_batch=True)

    # Get model and optimiser from student.
    net = student.net.to(device)
    lossFunc = student.lossFunc
    optimiser = student.optimiser

    # Train.
    for epoch in range(student.epochs):
        runningLoss = 0

        for i, batch in enumerate(trainLoader):
            # Get a batch and potentially send it to GPU memory.
            inputs = textField.vocab.vectors[batch.reviewText[0]].to(device)
            length = batch.reviewText[1].to(device)
            rating = batch.rating.to(device)
            businessCategory = batch.businessCategory.to(device)

            # PyTorch calculates gradients by accumulating contributions to them
            # (useful for RNNs).  Hence we must manually set them to zero before
            # calculating them.
            optimiser.zero_grad()

            # Forward pass through the network.
            ratingOutput, categoryOutput = net(inputs, length)
            loss = lossFunc(ratingOutput, categoryOutput, rating,
                            businessCategory)

            # Calculate gradients.
            loss.backward()

            # Minimise the loss according to the gradient.
            optimiser.step()

            runningLoss += loss.item()

            if i % 32 == 31:
                print("Epoch: %2d, Batch: %4d, Loss: %.3f" %
                      (epoch + 1, i + 1, runningLoss / 32))
                runningLoss = 0

    # Save model.
    torch.save(net.state_dict(), 'savedModel.pth')
    print("\n" "Model saved to savedModel.pth")

    # Test on validation data if it exists.
    if student.trainValSplit != 1:
        net.eval()

        correctRatingOnlySum = 0
        correctCategoryOnlySum = 0
        bothCorrectSum = 0
        with torch.no_grad():
            for batch in valLoader:
                # Get a batch and potentially send it to GPU memory.
                inputs = textField.vocab.vectors[batch.reviewText[0]].to(
                    device)
                length = batch.reviewText[1].to(device)
                rating = batch.rating.to(device)
                businessCategory = batch.businessCategory.to(device)

                # Convert network output to integer values.
                ratingOutputs, categoryOutputs = student.convertNetOutput(
                    *net(inputs, length))

                # Calculate performance
                correctRating = rating == ratingOutputs.flatten()
                correctCategory = businessCategory == categoryOutputs.flatten()

                correctRatingOnlySum += torch.sum(correctRating
                                                  & ~correctCategory).item()
                correctCategoryOnlySum += torch.sum(correctCategory
                                                    & ~correctRating).item()
                bothCorrectSum += torch.sum(correctRating
                                            & correctCategory).item()

        correctRatingOnlyPercent = correctRatingOnlySum / len(validate)
        correctCategoryOnlyPercent = correctCategoryOnlySum / len(validate)
        bothCorrectPercent = bothCorrectSum / len(validate)
        neitherCorrectPer = 1 - correctRatingOnlyPercent \
                              - correctCategoryOnlyPercent \
                              - bothCorrectPercent

        score = 100 * (bothCorrectPercent + 0.5 * correctCategoryOnlyPercent +
                       0.1 * correctRatingOnlyPercent)

        print("\n"
              "Rating incorrect, business category incorrect: {:.2%}\n"
              "Rating correct, business category incorrect: {:.2%}\n"
              "Rating incorrect, business category correct: {:.2%}\n"
              "Rating correct, business category correct: {:.2%}\n"
              "\n"
              "Weighted score: {:.2f}".format(neitherCorrectPer,
                                              correctRatingOnlyPercent,
                                              correctCategoryOnlyPercent,
                                              bothCorrectPercent, score))
Example 25
def caption_iterator(cfg, batch_size, phase):
    print(f'Constructing caption_iterator for "{phase}" phase')
    spacy_en = spacy.load('en')

    def tokenize_en(txt):
        return [token.text for token in spacy_en.tokenizer(txt)]

    CAPTION = data.ReversibleField(tokenize='spacy',
                                   init_token=cfg.start_token,
                                   eos_token=cfg.end_token,
                                   pad_token=cfg.pad_token,
                                   lower=True,
                                   batch_first=True,
                                   is_target=True)
    INDEX = data.Field(sequential=False, use_vocab=False, batch_first=True)

    # the order has to be the same as in the table
    fields = [
        ('video_id', None),
        ('caption', CAPTION),
        ('start', None),
        ('end', None),
        ('duration', None),
        ('phase', None),
        ('idx', INDEX),
    ]

    dataset = data.TabularDataset(
        path=cfg.train_meta_path,
        format='tsv',
        skip_header=True,
        fields=fields,
    )
    CAPTION.build_vocab(dataset.caption,
                        min_freq=cfg.min_freq_caps,
                        vectors=cfg.word_emb_caps)
    train_vocab = CAPTION.vocab

    if phase == 'val_1':
        dataset = data.TabularDataset(path=cfg.val_1_meta_path,
                                      format='tsv',
                                      skip_header=True,
                                      fields=fields)
    elif phase == 'val_2':
        dataset = data.TabularDataset(path=cfg.val_2_meta_path,
                                      format='tsv',
                                      skip_header=True,
                                      fields=fields)
    elif phase == 'learned_props':
        dataset = data.TabularDataset(path=cfg.val_prop_meta_path,
                                      format='tsv',
                                      skip_header=True,
                                      fields=fields)

    # sort_key = lambda x: data.interleave_keys(len(x.caption), len(y.caption))
    datasetloader = data.BucketIterator(dataset,
                                        batch_size,
                                        sort_key=lambda x: 0,
                                        device=torch.device(cfg.device),
                                        repeat=False,
                                        shuffle=True)
    return train_vocab, datasetloader
Example 26
    def test_build_vocab(self):
        # Set up fields
        question_field = data.Field(sequential=True)
        label_field = data.Field(sequential=False)

        # Write TSV dataset and construct a Dataset
        self.write_test_ppid_dataset(data_format="tsv")
        tsv_fields = [("id", None), ("q1", question_field),
                      ("q2", question_field), ("label", label_field)]
        tsv_dataset = data.TabularDataset(
            path=self.test_ppid_dataset_path, format="tsv",
            fields=tsv_fields)

        # Write JSON dataset and construct a Dataset
        self.write_test_ppid_dataset(data_format="json")
        json_fields = {"question1": ("q1", question_field),
                       "question2": ("q2", question_field),
                       "label": ("label", label_field)}
        json_dataset = data.TabularDataset(
            path=self.test_ppid_dataset_path, format="json",
            fields=json_fields)

        # Test build_vocab default
        question_field.build_vocab(tsv_dataset, json_dataset, specials=['<space>'])
        assert question_field.vocab.freqs == Counter(
            {'When': 4, 'do': 4, 'you': 4, 'use': 4, 'instead': 4,
             'of': 4, 'was': 4, 'Lincoln': 4, 'born?': 4, 'シ': 2,
             'し?': 2, 'Where': 2, 'What': 2, 'is': 2, '2+2': 2,
             '"&"': 2, '"and"?': 2, 'Which': 2, 'location': 2,
             'Abraham': 2, '2+2=?': 2})
        expected_stoi = {'<unk>': 0, '<pad>': 1, '<space>': 2,
                         'Lincoln': 3, 'When': 4,
                         'born?': 5, 'do': 6, 'instead': 7, 'of': 8,
                         'use': 9, 'was': 10, 'you': 11, '"&"': 12,
                         '"and"?': 13, '2+2': 14, '2+2=?': 15, 'Abraham': 16,
                         'What': 17, 'Where': 18, 'Which': 19, 'is': 20,
                         'location': 21, 'し?': 22, 'シ': 23}
        assert dict(question_field.vocab.stoi) == expected_stoi
        # Turn the stoi dictionary into an itos list
        expected_itos = [x[0] for x in sorted(expected_stoi.items(),
                                              key=lambda tup: tup[1])]
        assert question_field.vocab.itos == expected_itos

        label_field.build_vocab(tsv_dataset, json_dataset)
        assert label_field.vocab.freqs == Counter({'1': 4, '0': 2})
        expected_stoi = {'1': 1, '0': 2, '<unk>': 0}
        assert dict(label_field.vocab.stoi) == expected_stoi
        # Turn the stoi dictionary into an itos list
        expected_itos = [x[0] for x in sorted(expected_stoi.items(),
                                              key=lambda tup: tup[1])]
        assert label_field.vocab.itos == expected_itos

        # Test build_vocab default
        question_field.build_vocab(tsv_dataset, json_dataset)
        assert question_field.vocab.freqs == Counter(
            {'When': 4, 'do': 4, 'you': 4, 'use': 4, 'instead': 4,
             'of': 4, 'was': 4, 'Lincoln': 4, 'born?': 4, 'シ': 2,
             'し?': 2, 'Where': 2, 'What': 2, 'is': 2, '2+2': 2,
             '"&"': 2, '"and"?': 2, 'Which': 2, 'location': 2,
             'Abraham': 2, '2+2=?': 2})
        expected_stoi = {'<unk>': 0, '<pad>': 1, 'Lincoln': 2, 'When': 3,
                         'born?': 4, 'do': 5, 'instead': 6, 'of': 7,
                         'use': 8, 'was': 9, 'you': 10, '"&"': 11,
                         '"and"?': 12, '2+2': 13, '2+2=?': 14, 'Abraham': 15,
                         'What': 16, 'Where': 17, 'Which': 18, 'is': 19,
                         'location': 20, 'し?': 21, 'シ': 22}
        assert dict(question_field.vocab.stoi) == expected_stoi
        # Turn the stoi dictionary into an itos list
        expected_itos = [x[0] for x in sorted(expected_stoi.items(),
                                              key=lambda tup: tup[1])]
        assert question_field.vocab.itos == expected_itos

        label_field.build_vocab(tsv_dataset, json_dataset)
        assert label_field.vocab.freqs == Counter({'1': 4, '0': 2})
        expected_stoi = {'1': 1, '0': 2, '<unk>': 0}
        assert dict(label_field.vocab.stoi) == expected_stoi
        # Turn the stoi dictionary into an itos list
        expected_itos = [x[0] for x in sorted(expected_stoi.items(),
                                              key=lambda tup: tup[1])]
        assert label_field.vocab.itos == expected_itos

        # Test build_vocab with extra kwargs passed to Vocab
        question_field.build_vocab(tsv_dataset, json_dataset, max_size=8,
                                   min_freq=3)
        assert question_field.vocab.freqs == Counter(
            {'When': 4, 'do': 4, 'you': 4, 'use': 4, 'instead': 4,
             'of': 4, 'was': 4, 'Lincoln': 4, 'born?': 4, 'シ': 2,
             'し?': 2, 'Where': 2, 'What': 2, 'is': 2, '2+2': 2,
             '"&"': 2, '"and"?': 2, 'Which': 2, 'location': 2,
             'Abraham': 2, '2+2=?': 2})
        expected_stoi = {'<unk>': 0, '<pad>': 1, 'Lincoln': 2, 'When': 3,
                         'born?': 4, 'do': 5, 'instead': 6, 'of': 7,
                         'use': 8, 'was': 9}
        assert dict(question_field.vocab.stoi) == expected_stoi
        # Turn the stoi dictionary into an itos list
        expected_itos = [x[0] for x in sorted(expected_stoi.items(),
                                              key=lambda tup: tup[1])]
        assert question_field.vocab.itos == expected_itos
Example 27
    def test_stratified_dataset_split(self):
        num_examples, num_labels = 30, 3
        self.write_test_splitting_dataset(num_examples=num_examples,
                                          num_labels=num_labels)
        text_field = data.Field()
        label_field = data.LabelField()
        fields = [('text', text_field), ('label', label_field)]

        dataset = data.TabularDataset(path=self.test_dataset_splitting_path,
                                      format="csv",
                                      fields=fields)

        # Default split ratio
        expected_train_size = 21
        expected_test_size = 9

        train, test = dataset.split(stratified=True)
        assert len(train) == expected_train_size
        assert len(test) == expected_test_size

        # Test array arguments with same ratio
        split_ratio = [0.7, 0.3]
        train, test = dataset.split(split_ratio=split_ratio, stratified=True)
        assert len(train) == expected_train_size
        assert len(test) == expected_test_size

        # Test strata_field argument
        train, test = dataset.split(split_ratio=split_ratio,
                                    stratified=True,
                                    strata_field='label')
        assert len(train) == expected_train_size
        assert len(test) == expected_test_size

        # Test invalid field name
        strata_field = 'dummy'
        with pytest.raises(ValueError):
            dataset.split(split_ratio=split_ratio,
                          stratified=True,
                          strata_field=strata_field)

        # Test uneven stratify sizes
        num_examples, num_labels = 28, 3
        self.write_test_splitting_dataset(num_examples=num_examples,
                                          num_labels=num_labels)
        # 10 examples for class 1 and 9 examples for classes 2,3
        dataset = data.TabularDataset(path=self.test_dataset_splitting_path,
                                      format="csv",
                                      fields=fields)

        expected_train_size = 7 + 6 + 6
        expected_test_size = 3 + 3 + 3
        train, test = dataset.split(split_ratio=split_ratio, stratified=True)
        assert len(train) == expected_train_size
        assert len(test) == expected_test_size

        split_ratio = [0.7, 0.3]
        train, test = dataset.split(split_ratio=split_ratio, stratified=True)
        assert len(train) == expected_train_size
        assert len(test) == expected_test_size

        # Add validation set
        split_ratio = [0.6, 0.3, 0.1]
        expected_train_size = 6 + 5 + 5
        expected_valid_size = 1 + 1 + 1
        expected_test_size = 3 + 3 + 3
        train, valid, test = dataset.split(split_ratio=split_ratio,
                                           stratified=True)
        assert len(train) == expected_train_size
        assert len(valid) == expected_valid_size
        assert len(test) == expected_test_size
Example 28
    def test_dataset_split_arguments(self):
        num_examples, num_labels = 30, 3
        self.write_test_splitting_dataset(num_examples=num_examples,
                                          num_labels=num_labels)
        text_field = data.Field()
        label_field = data.LabelField()
        fields = [('text', text_field), ('label', label_field)]

        dataset = data.TabularDataset(path=self.test_dataset_splitting_path,
                                      format="csv",
                                      fields=fields)

        # Test default split ratio (0.7)
        expected_train_size = 21
        expected_test_size = 9

        train, test = dataset.split()
        assert len(train) == expected_train_size
        assert len(test) == expected_test_size

        # Test array arguments with same ratio
        split_ratio = [0.7, 0.3]
        train, test = dataset.split(split_ratio=split_ratio)
        assert len(train) == expected_train_size
        assert len(test) == expected_test_size

        # Add validation set
        split_ratio = [0.6, 0.3, 0.1]
        expected_train_size = 18
        expected_valid_size = 3
        expected_test_size = 9

        train, valid, test = dataset.split(split_ratio=split_ratio)
        assert len(train) == expected_train_size
        assert len(valid) == expected_valid_size
        assert len(test) == expected_test_size

        # Test ratio normalization
        split_ratio = [6, 3, 1]
        train, valid, test = dataset.split(split_ratio=split_ratio)
        assert len(train) == expected_train_size
        assert len(valid) == expected_valid_size
        assert len(test) == expected_test_size

        # Test only two splits returned for too small valid split size
        split_ratio = [0.66, 0.33, 0.01]
        expected_length = 2
        splits = dataset.split(split_ratio=split_ratio)
        assert len(splits) == expected_length

        # Test invalid arguments
        split_ratio = 1.1
        with pytest.raises(AssertionError):
            dataset.split(split_ratio=split_ratio)

        split_ratio = -1.
        with pytest.raises(AssertionError):
            dataset.split(split_ratio=split_ratio)

        split_ratio = [0.7]
        with pytest.raises(AssertionError):
            dataset.split(split_ratio=split_ratio)

        split_ratio = [1, 2, 3, 4]
        with pytest.raises(AssertionError):
            dataset.split(split_ratio=split_ratio)

        split_ratio = "string"
        with pytest.raises(ValueError):
            dataset.split(split_ratio=split_ratio)
Example 29
# use_vocab: whether to build a Vocab; if False, the Field's values are treated as numbers
# pad_token: token used to pad text
# unk_token: token used for out-of-vocabulary words
TEXT = data.Field(sequential=True,
                  tokenize=customize_tokensize,
                  include_lengths=True,
                  use_vocab=True,
                  batch_first=True,
                  fix_length=200)
LABEL = data.Field(sequential=False,
                   use_vocab=False,
                   pad_token=None,
                   unk_token=None)
train_fileds = [("text", TEXT), ("label", LABEL)]
train_data = data.TabularDataset(path=r"./imdb_data.csv",
                                 format='csv',
                                 skip_header=True,
                                 fields=train_fileds)
train_data_real, val_data_real = train_data.split(split_ratio=0.7)
vec = Vectors("glove.6B.100d.txt", "./Emotion")
# Build the training-set vocabulary (attaching the word vectors)
TEXT.build_vocab(train_data_real, max_size=20000, vectors=vec)
LABEL.build_vocab(train_data_real)
# print(TEXT.vocab.freqs.most_common(n=10))
# print("类别标签情况: ", LABEL.vocab.freqs)
# print("词典个数: ", len(TEXT.vocab.itos))

# Define the iterators
train_iter = data.BucketIterator(train_data_real, batch_size=BATCH_SIZE)
val_iter = data.BucketIterator(val_data_real, batch_size=BATCH_SIZE)

INPUT_DIM = len(TEXT.vocab)  # vocabulary size
Example 30
from torchtext.legacy import data
from tqdm import tqdm
import torch
import torch.nn as nn
from sklearn.metrics import accuracy_score, confusion_matrix

device = "cpu"

# dataset
LABEL = data.LabelField()
POST = data.Field(tokenize="spacy",
                  lower=True,
                  tokenizer_language="en_core_web_sm")
fields = [("body", POST), ("label", LABEL)]
dataset = data.TabularDataset(path="pytorch_data.csv",
                              format="CSV",
                              fields=fields)
train, test = dataset.split(split_ratio=[0.8, 0.2])

# vocabulary
POST.build_vocab(train, max_size=10000)  # , vectors = 'glove.6B.200d')
LABEL.build_vocab(train)  # fixes `"LabelField" has no attribute "vocab"`

# data loaders
train_iterator, test_iterator = data.BucketIterator.splits(
    (train, test),
    batch_size=32,
    device=device,
    sort_key=lambda x: x.body,  # BucketIterator needs a sort_key to bucket/sort
    sort_within_batch=True,  # sort examples inside each batch by that key
)