Example #1
    def __init__(self, config, tokenize_src, tokenize_trg, device):
        self.config = config
        self.device = device
        self.SRC = data.Field(tokenize=tokenize_src,
                              init_token='<sos>',
                              eos_token='<eos>',
                              pad_token='<pad>',
                              lower=True,
                              batch_first=True)
        self.TRG = data.Field(
            tokenize=tokenize_trg,
            init_token='<sos>',
            eos_token='<eos>',
            pad_token='<pad>',
            lower=True,
            batch_first=True,
        )
        self.train_data, self.valid_data, self.test_data = Multi30k.splits(
            exts=(config['src_ext'], config['trg_ext']),
            fields=(self.SRC, self.TRG))
        self.build_vocab()

        print('number of training data : {}'.format(len(self.train_data)))
        print('number of valid data : {}'.format(len(self.valid_data)))
        print('number of test data : {}'.format(len(self.test_data)))

        self.train_iterator, self.valid_iterator, self.test_iterator = data.BucketIterator.splits(
            (self.train_data, self.valid_data, self.test_data),
            batch_size=self.config['batch_size'],
            device=self.device)
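The constructor above calls a build_vocab method that the snippet does not show. A minimal sketch of what such a helper could look like, assuming a hypothetical 'min_freq' entry in config:

    def build_vocab(self):
        # Hypothetical helper (not shown in the original example): build the
        # source/target vocabularies from the training split only, so tokens
        # seen only in valid/test data map to <unk>.
        min_freq = self.config.get('min_freq', 2)  # assumed config key
        self.SRC.build_vocab(self.train_data, min_freq=min_freq)
        self.TRG.build_vocab(self.train_data, min_freq=min_freq)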
Example #2
    def load_line_as_data(self, line, level, lowercase, src_vocab, trg_vocab):
        tok_fun = lambda s: list(s) if level == "char" else s.split()

        src_field = data.Field(
            init_token=None,
            eos_token=EOS_TOKEN,  # FIXME
            pad_token=PAD_TOKEN,
            tokenize=tok_fun,
            batch_first=True,
            lower=lowercase,
            unk_token=UNK_TOKEN,
            include_lengths=True)
        trg_field = data.Field(init_token=BOS_TOKEN,
                               eos_token=EOS_TOKEN,
                               pad_token=PAD_TOKEN,
                               tokenize=tok_fun,
                               unk_token=UNK_TOKEN,
                               batch_first=True,
                               lower=lowercase,
                               include_lengths=True)

        test_data = MonoLineDataset(line=line, field=src_field)
        src_field.vocab = src_vocab
        trg_field.vocab = trg_vocab

        return test_data, src_vocab, trg_vocab
Example #3
def get_data():
    TEXT = data.Field(lower=True)
    UD_TAGS = data.Field(unk_token=None)
    PTB_TAGS = data.Field(unk_token=None)

    fields = (("text", TEXT), ("udtags", UD_TAGS), ("ptbtags", PTB_TAGS))
    train_data, valid_data, test_data = datasets.UDPOS.splits(fields)

    print(f"Number of training examples: {len(train_data)}")
    print(f"Number of validation examples: {len(valid_data)}")
    print(f"Number of testing examples: {len(test_data)}")

    MIN_FREQ = 2

    TEXT.build_vocab(train_data,
                     min_freq=MIN_FREQ,
                     vectors="glove.6B.100d",
                     unk_init=torch.Tensor.normal_)

    UD_TAGS.build_vocab(train_data)
    PTB_TAGS.build_vocab(train_data)

    print(f"Unique tokens in TEXT vocabulary: {len(TEXT.vocab)}")
    print(f"Unique tokens in UD_TAG vocabulary: {len(UD_TAGS.vocab)}")
    print(f"Unique tokens in PTB_TAG vocabulary: {len(PTB_TAGS.vocab)}")

    return TEXT, PTB_TAGS, train_data, test_data, valid_data
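A short usage sketch for the objects returned by get_data; the batch size, device, and the pad-index lookup below are illustrative assumptions rather than part of the original function:

import torch
from torchtext import data

TEXT, PTB_TAGS, train_data, test_data, valid_data = get_data()
train_iter, valid_iter, test_iter = data.BucketIterator.splits(
    (train_data, valid_data, test_data),
    batch_size=64,
    device=torch.device('cpu'))
# Index of the padding tag, typically ignored by the tagging loss.
TAG_PAD_IDX = PTB_TAGS.vocab.stoi[PTB_TAGS.pad_token]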
Example #4
    def test_csv_dataset_quotechar(self):
        # Based on issue #349
        example_data = [("text", "label"), ('" hello world', "0"),
                        ('goodbye " world', "1"), ('this is a pen " ', "0")]

        with tempfile.NamedTemporaryFile(dir=self.test_dir) as f:
            for example in example_data:
                f.write("{}\n".format(",".join(example)).encode("latin-1"))

            TEXT = data.Field(lower=True, tokenize=lambda x: x.split())
            fields = {
                "label": ("label", data.Field(use_vocab=False,
                                              sequential=False)),
                "text": ("text", TEXT)
            }

            f.seek(0)

            dataset = data.TabularDataset(
                path=f.name,
                format="csv",
                skip_header=False,
                fields=fields,
                csv_reader_params={"quotechar": None})

            TEXT.build_vocab(dataset)

            self.assertEqual(len(dataset), len(example_data) - 1)

            for i, example in enumerate(dataset):
                self.assertEqual(example.text,
                                 example_data[i + 1][0].lower().split())
                self.assertEqual(example.label, example_data[i + 1][1])
Example #5
    def __init__(self, config, filepath, tokenize, device):
        self.config = config
        self.device = device
        self.SRC = data.Field(tokenize=tokenize,
                              init_token='<sos>',
                              eos_token='<eos>',
                              pad_token='<pad>',
                              lower=True,
                              batch_first=True)
        self.TRG = data.Field(tokenize=tokenize,
                              init_token='<sos>',
                              eos_token='<eos>',
                              pad_token='<pad>',
                              lower=True,
                              batch_first=True)
        self.train_data, self.valid_data, self.test_data = \
            datasets.TranslationDataset.splits(path=filepath, exts=('.src', '.trg'),
                                               fields=(self.SRC, self.TRG))

        self.train_iterator, self.valid_iterator, self.test_iterator = data.BucketIterator.splits(
            (self.train_data, self.valid_data, self.test_data),
            batch_size=self.config['batch_size'],
            device=self.device)

        self.build_vocab()
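Each batch produced by the iterators above exposes one attribute per field name. A usage sketch (the loader variable is assumed to be an instance of this class; 'src'/'trg' are the default field names TranslationDataset assigns):

# Sketch: consuming the iterators built in the constructor above.
for batch in loader.train_iterator:
    src = batch.src   # (batch, src_len) LongTensor, since batch_first=True
    trg = batch.trg   # (batch, trg_len) LongTensor
    print(src.shape, trg.shape)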
Example #6
    def test_pad_when_fix_length_is_not_none(self):
        nesting_field = data.Field(tokenize=list, unk_token="<cunk>", pad_token="<cpad>",
                                   init_token="<w>", eos_token="</w>")
        CHARS = data.NestedField(
            nesting_field, init_token="<s>", eos_token="</s>", fix_length=3)
        minibatch = [
            ["john", "loves", "mary"],
            ["mary", "cries"]
        ]
        expected = [
            [
                ["<w>", "<s>", "</w>"] + ["<cpad>"] * 4,
                ["<w>"] + list("john") + ["</w>", "<cpad>"],
                ["<w>", "</s>", "</w>"] + ["<cpad>"] * 4,
            ],
            [
                ["<w>", "<s>", "</w>"] + ["<cpad>"] * 4,
                ["<w>"] + list("mary") + ["</w>", "<cpad>"],
                ["<w>", "</s>", "</w>"] + ["<cpad>"] * 4,
            ]
        ]

        assert CHARS.pad(minibatch) == expected

        # test include length
        nesting_field = data.Field(tokenize=list, unk_token="<cunk>", pad_token="<cpad>",
                                   init_token="<w>", eos_token="</w>")
        CHARS = data.NestedField(nesting_field, init_token="<s>",
                                 eos_token="</s>", include_lengths=True, fix_length=3)
        arr, seq_len, words_len = CHARS.pad(minibatch)
        assert arr == expected
        assert seq_len == [3, 3]
        assert words_len == [[3, 6, 3], [3, 6, 3]]
Example #7
    def test_json_valid_and_invalid_nested_key(self):
        self.write_test_nested_key_json_dataset()
        valid_fields = {
            'foods.vegetables.name': ('vegs', data.Field()),
            'foods.fruits': ('fruits', data.Field())
        }
        invalid_fields = {'foods.vegetables.color': ('vegs', data.Field())}

        expected_examples = [{
            "fruits": ["Apple", "Banana"],
            "vegs": ["Broccoli", "Cabbage"]
        }, {
            "fruits": ["Cherry", "Grape", "Lemon"],
            "vegs": ["Cucumber", "Lettuce"]
        }, {
            "fruits": ["Orange", "Pear", "Strawberry"],
            "vegs": ["Marrow", "Spinach"]
        }]
        dataset = data.TabularDataset(
            path=self.test_nested_key_json_dataset_path,
            format="json",
            fields=valid_fields)
        # check results
        for example, expect in zip(dataset.examples, expected_examples):
            self.assertEqual(example.vegs, expect['vegs'])
            self.assertEqual(example.fruits, expect['fruits'])

        with self.assertRaises(ValueError):
            data.TabularDataset(path=self.test_nested_key_json_dataset_path,
                                format="json",
                                fields=invalid_fields)
Example #8
def create_fields(opt):
    
    spacy_langs = ['en', 'fr', 'de', 'es', 'pt', 'it', 'nl']
    if opt.src_lang not in spacy_langs:
        print('invalid src language: ' + opt.src_lang + '; supported languages: ' + str(spacy_langs))
    if opt.trg_lang not in spacy_langs:
        print('invalid trg language: ' + opt.trg_lang + '; supported languages: ' + str(spacy_langs))
    
    print("loading spacy tokenizers...")
    
    t_src = tokenize(opt.src_lang)
    t_trg = tokenize(opt.trg_lang)

    TRG = data.Field(lower=True, tokenize=t_trg.tokenizer, init_token='<sos>', eos_token='<eos>')
    SRC = data.Field(lower=True, tokenize=t_src.tokenizer)

    if opt.load_weights is not None:
        try:
            print("loading presaved fields...")
            SRC = pickle.load(open(f'{opt.load_weights}/SRC.pkl', 'rb'))
            TRG = pickle.load(open(f'{opt.load_weights}/TRG.pkl', 'rb'))
        except Exception:
            print("error opening SRC.pkl and TRG.pkl field files, please ensure they are in " + opt.load_weights + "/")
            quit()
        
    return SRC, TRG
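create_fields expects a tokenize wrapper exposing a .tokenizer method. A minimal spaCy-based sketch of such a wrapper (this class is an assumption, not necessarily the repository's implementation, and it requires the corresponding spaCy models to be installed):

import spacy

class tokenize:
    def __init__(self, lang):
        # Load the spaCy pipeline for the requested language code.
        self.nlp = spacy.load(lang)

    def tokenizer(self, sentence):
        # Return plain token strings, as data.Field expects.
        return [tok.text for tok in self.nlp.tokenizer(sentence)]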
Example #9
    def __init__(self, max_size=999999, min_freq=1):
        super().__init__()
        self.label = data.Field(sequential=False)
        self.text = data.Field(batch_first=True,
                               unk_token='<unk>')
        self.max_size = max_size
        self.min_freq = min_freq
Example #10
def get_files(path, train_size, max_doc_len, seed, tokenizer):
    # including lengths makes the text var a tuple containing the tweet and its length
    Text = data.Field(preprocessing=tweet_cleanup,
                      tokenize=tokenizer,
                      batch_first=True,
                      include_lengths=True,
                      fix_length=max_doc_len,
                      lower=True)
    Label = data.Field(sequential=False,
                       use_vocab=False,
                       pad_token=None,
                       unk_token=None)

    fields = [('text', Text), ('labels', Label)]

    # builds a pytorch dataset from the given training and testing files
    train_data, test_data = data.TabularDataset.splits(
        path=path,
        train='../data/train_bin_labels.csv',
        test='../data/test_bin_labels.csv',
        format='csv',
        fields=fields,
        skip_header=True)

    train_data, val_data = train_data.split(split_ratio=train_size,
                                            random_state=random.seed(seed))
    print(f'Number of training examples: {len(train_data)}')
    print(f'Number of validation examples: {len(val_data)}')
    print(f'Number of testing examples: {len(test_data)}')
    return train_data, val_data, test_data, Text, Label
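A possible follow-up to get_files, given the objects it returns; the vocabulary size, vector name, and batch size below are illustrative assumptions:

# Sketch: build the text vocabulary from the training split and wrap the
# splits in BucketIterators, sorting by tweet length so the lengths from
# include_lengths stay useful for packing.
Text.build_vocab(train_data, max_size=25000, vectors="glove.twitter.27B.100d")
train_iter, val_iter, test_iter = data.BucketIterator.splits(
    (train_data, val_data, test_data),
    batch_size=64,
    sort_key=lambda x: len(x.text),
    sort_within_batch=True)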
Example #11
    def test_batch_iter(self):
        self.write_test_numerical_features_dataset()
        FLOAT = data.Field(use_vocab=False,
                           sequential=False,
                           dtype=torch.float)
        INT = data.Field(use_vocab=False, sequential=False, is_target=True)
        TEXT = data.Field(sequential=False)

        dst = data.TabularDataset(
            path=self.test_numerical_features_dataset_path,
            format="tsv",
            skip_header=False,
            fields=[("float", FLOAT), ("int", INT), ("text", TEXT)])
        TEXT.build_vocab(dst)
        itr = data.Iterator(dst, batch_size=2, device=-1, shuffle=False)
        fld_order = [
            k for k, v in dst.fields.items()
            if v is not None and not v.is_target
        ]
        batch = next(iter(itr))
        (x1, x2), y = batch
        x = (x1, x2)[fld_order.index("float")]
        self.assertEqual(y.data[0], 1)
        self.assertEqual(y.data[1], 12)
        self.assertAlmostEqual(x.data[0], 0.1, places=4)
        self.assertAlmostEqual(x.data[1], 0.5, places=4)
Example #12
    def __init__(self, config, device):
        self.config = config
        self.device = device
        # corpus_separator(filepath)
        self.title = data.Field(tokenize=lambda x: x.split(' '),
                                lower=True,
                                batch_first=True,
                                include_lengths=True)
        self.label = data.Field(lower=True, batch_first=True)
        fields = [('label', self.label), ('title', self.title)]
        self.train_data, self.valid_data, self.test_data = data.TabularDataset.splits(
            path=self.config['cls_dir_path'],
            train='train_tokenized.ynat',
            validation='val_tokenized.ynat',
            test='test_tokenized.ynat',
            format='tsv',
            fields=fields)

        self.build_vocab()

        print('number of training data : {}'.format(len(self.train_data)))
        print('number of valid data : {}'.format(len(self.valid_data)))
        print('number of test data : {}'.format(len(self.test_data)))

        self.train_iterator, self.valid_iterator, self.test_iterator = data.BucketIterator.splits(
            (self.train_data, self.valid_data, self.test_data),
            sort=True,
            sort_within_batch=True,
            batch_size=self.config['cls_batch_size'],
            device=self.device,
            sort_key=lambda x: len(x.title))
Example #13
    def __init__(self) -> None:
        self.SRC = data.Field(pad_token=Constants.PAD,
                              unk_token=Constants.UNK,
                              batch_first=True)
        self.TGT = data.Field(init_token=Constants.START,
                              eos_token=Constants.END,
                              pad_token=Constants.PAD,
                              unk_token=Constants.UNK,
                              batch_first=True)
Example #14
def main():
    global WORD
    WORD = data.Field(include_lengths=True,
                      batch_first=True,
                      eos_token=None,
                      init_token=None)
    LABEL = data.Field(sequential=False, batch_first=True)
    TREE = data.RawField(postprocessing=ListOpsDataset.tree_field(WORD))
    TREE.is_target = False
    train = ListOpsDataset(
        "data/train_d20s.tsv",
        (("word", WORD), ("label", LABEL), ("tree", TREE)),
        filter_pred=lambda x: 5 < len(x.word) < config["train_len"],
    )
    WORD.build_vocab(train)
    LABEL.build_vocab(train)
    valid = ListOpsDataset(
        "data/test_d20s.tsv",
        (("word", WORD), ("label", LABEL), ("tree", TREE)),
        filter_pred=lambda x: 5 < len(x.word) < 150,
    )

    train_iter = TokenBucket(train,
                             batch_size=1500,
                             device="cuda:0",
                             key=lambda x: len(x.word))
    train_iter.repeat = False
    valid_iter = data.BucketIterator(valid,
                                     batch_size=50,
                                     train=False,
                                     sort=False,
                                     device="cuda:0")

    NT = 1
    T = len(WORD.vocab)
    V = T

    if True:
        tree_lstm = TreeLSTM(config["H"],
                             len(WORD.vocab) + 100, len(LABEL.vocab)).cuda()
        for p in tree_lstm.parameters():
            if p.dim() > 1:
                torch.nn.init.xavier_uniform_(p)

        model = SpanLSTM(NT, len(WORD.vocab), config["H"]).cuda()
        for p in model.parameters():
            if p.dim() > 1:
                torch.nn.init.xavier_uniform_(p)

        wandb.watch((model, tree_lstm))
        print(wandb.config)
        tree = run_train(train_iter, valid_iter, model, tree_lstm, V)
    else:
        print("loading")
        model, tree_lstm = torch.load("cp.yoyo.model")
        print(valid_sup(valid_iter, model, tree_lstm, V))
Example #15
    def test_subword_trec(self):
        TEXT = data.SubwordField()
        LABEL = data.Field(sequential=False)
        RAW = data.Field(sequential=False, use_vocab=False)
        raw, _ = TREC.splits(RAW, LABEL)
        cooked, _ = TREC.splits(TEXT, LABEL)
        LABEL.build_vocab(cooked)
        TEXT.build_vocab(cooked, max_size=100)
        TEXT.segment(cooked)
        print(cooked[0].text)
        batch = next(iter(data.Iterator(cooked, 1, shuffle=False)))
        self.assertEqual(TEXT.reverse(batch.text.data)[0], raw[0].text)
Example #16
    def test_tabular_simple_data(self):
        for data_format in ["csv", "tsv", "json"]:
            self.write_test_ppid_dataset(data_format=data_format)

            if data_format == "json":
                question_field = data.Field(sequential=True)
                label_field = data.Field(sequential=False)
                fields = {
                    "question1": ("q1", question_field),
                    "question2": ("q2", question_field),
                    "label": ("label", label_field)
                }
            else:
                question_field = data.Field(sequential=True)
                label_field = data.Field(sequential=False)
                fields = [("id", None), ("q1", question_field),
                          ("q2", question_field), ("label", label_field)]

            dataset = data.TabularDataset(path=self.test_ppid_dataset_path,
                                          format=data_format,
                                          fields=fields)

            assert len(dataset) == 3

            expected_examples = [
                (["When", "do", "you", "use", "シ", "instead", "of", "し?"], [
                    "When", "do", "you", "use", "\"&\"", "instead", "of",
                    "\"and\"?"
                ], "0"),
                (["Where", "was", "Lincoln", "born?"],
                 ["Which", "location", "was", "Abraham", "Lincoln",
                  "born?"], "1"), (["What", "is", "2+2"], ["2+2=?"], "1")
            ]

            # Ensure examples have correct contents / test __getitem__
            for i in range(len(dataset)):
                self.assertEqual(dataset[i].q1, expected_examples[i][0])
                self.assertEqual(dataset[i].q2, expected_examples[i][1])
                self.assertEqual(dataset[i].label, expected_examples[i][2])

            # Test __getattr__
            for i, (q1, q2, label) in enumerate(
                    zip(dataset.q1, dataset.q2, dataset.label)):
                self.assertEqual(q1, expected_examples[i][0])
                self.assertEqual(q2, expected_examples[i][1])
                self.assertEqual(label, expected_examples[i][2])

            # Test __iter__
            for i, example in enumerate(dataset):
                self.assertEqual(example.q1, expected_examples[i][0])
                self.assertEqual(example.q2, expected_examples[i][1])
                self.assertEqual(example.label, expected_examples[i][2])
Example #17
def main(config):
    saved_data = torch.load(config.model_fn,
                            map_location='cpu' if config.gpu_id < 0 else
                            'cuda:{}'.format(config.gpu_id))

    model_dict = saved_data['model']
    train_config = saved_data['config']
    vocab = saved_data['vocab']
    label = saved_data['label']

    text_field = data.Field(batch_first=True, unk_token='<unk>')
    label_field = data.Field(sequential=False)

    text_field.vocab = vocab
    label_field.vocab = label

    lines = open_file(train_config)

    with torch.no_grad():
        model = DisasterClassifier(input_size=len(vocab),
                                   embedding_dim=train_config.embedding_dim,
                                   num_layers=train_config.num_layers,
                                   hidden_size=train_config.hidden_size,
                                   dropout=train_config.dropout,
                                   n_classes=len(label))
        model.load_state_dict(model_dict)

        model.eval()

        y_hat = []
        for i in range(0, len(lines), config.batch_size):
            x = text_field.numericalize(
                text_field.pad(lines[i:i + config.batch_size]),
                device='cpu'
                if config.gpu_id < 0 else 'cuda:{}'.format(config.gpu_id))
            y_hat.append(model(x).cpu())

        y_hat = torch.cat(y_hat, dim=0)

        probs, indices = torch.topk(y_hat, config.top_k, dim=-1)

        with open('{}_prediction.tsv'.format(config.model_fn[:-4]),
                  'w',
                  -1,
                  encoding='utf-8') as f:
            for i in range(len(lines)):
                f.write('{}\t{}\n'.format(
                    ' '.join(label.itos[indices[i][j]]
                             for j in range(config.top_k)),
                    ' '.join(lines[i])))
Example #18
    def _preprocess_splits(self, h5py_file: h5py.File):
        TEXT = data.Field(lower=True, include_lengths=True, batch_first=True)
        LABEL = data.Field(sequential=False)
        with tempfile.TemporaryDirectory() as tmpdirname:
            train_set, test_set = datasets.trec.TREC.splits(TEXT,
                                                            LABEL,
                                                            root=tmpdirname,
                                                            fine_grained=True)
            self._preprocess_split(h5py_file,
                                   split_name="train",
                                   dataset_split=train_set)
            self._preprocess_split(h5py_file,
                                   split_name="test",
                                   dataset_split=test_set)
Example #19
    def __init__(self,
                 max_vocab=9999,
                 min_freq=1,
                 init_token='<bos>',
                 eos_token='<eos>'):
        super().__init__()

        self.max_vocab = max_vocab
        self.min_freq = min_freq
        self.label = data.Field(sequential=False, unk_token=None)
        self.text = data.Field(
            init_token=init_token,
            eos_token=eos_token,
            batch_first=True,
        )
Example #20
def filter_init(ex_val1, ex_val2, ex_val3):
    text_field = data.Field(sequential=True)
    label_field = data.Field(sequential=False)
    fields = [("text1", text_field), ("text2", text_field),
              ("label", label_field)]

    example1 = data.Example.fromlist(ex_val1, fields)
    example2 = data.Example.fromlist(ex_val2, fields)
    example3 = data.Example.fromlist(ex_val3, fields)
    examples = [example1, example2, example3]

    dataset = data.Dataset(examples, fields)
    text_field.build_vocab(dataset)

    return dataset, text_field
Example #21
def get_essentials(train_df, max_seq_length=128, train_batch_size=16):

    X, y = train_df.iloc[:, 0].values, train_df.iloc[:, 1].values

    text_field = data.Field()
    text_field.build_vocab(X, max_size=10000)

    X_split = [t.split() for t in X]

    # pad
    X_pad = [pad(s, max_seq_length) for s in X_split]

    # to index
    X_index = [to_indexes(text_field.vocab, s) for s in X_pad]

    train_dataset = to_dataset(X_index, y)

    train_sampler = SequentialSampler(train_dataset)
    train_loader = DataLoader(
        train_dataset,
        sampler=train_sampler,
        batch_size=train_batch_size,
        drop_last=True,
    )

    return text_field, train_loader
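get_essentials relies on pad, to_indexes, and to_dataset helpers that are not shown. Plausible sketches follow; their behaviour is an assumption inferred from how they are called above:

import torch
from torch.utils.data import TensorDataset

def pad(tokens, max_seq_length, pad_token='<pad>'):
    # Assumed behaviour: truncate or right-pad a token list to a fixed length.
    return tokens[:max_seq_length] + [pad_token] * max(0, max_seq_length - len(tokens))

def to_indexes(vocab, tokens):
    # Map tokens to vocabulary indices; in legacy torchtext, stoi is a
    # defaultdict, so unknown tokens fall back to the <unk> index.
    return [vocab.stoi[t] for t in tokens]

def to_dataset(x_index, y):
    # Wrap the index matrix and labels in a TensorDataset for the DataLoader.
    return TensorDataset(torch.tensor(x_index, dtype=torch.long),
                         torch.tensor(y, dtype=torch.long))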
Example #22
    def test_numericalize_include_lengths(self):
        self.write_test_ppid_dataset(data_format="tsv")
        question_field = data.Field(sequential=True, include_lengths=True)
        tsv_fields = [("id", None), ("q1", question_field),
                      ("q2", question_field), ("label", None)]
        tsv_dataset = data.TabularDataset(
            path=self.test_ppid_dataset_path, format="tsv",
            fields=tsv_fields)
        question_field.build_vocab(tsv_dataset)

        test_example_data = [["When", "do", "you", "use", "シ",
                              "instead", "of", "し?"],
                             ["What", "is", "2+2", "<pad>", "<pad>",
                              "<pad>", "<pad>", "<pad>"],
                             ["Here", "is", "a", "sentence", "with",
                              "some", "oovs", "<pad>"]]
        test_example_lengths = [8, 3, 7]

        # Test with include_lengths
        include_lengths_numericalized = question_field.numericalize(
            (test_example_data, test_example_lengths))
        verify_numericalized_example(question_field,
                                     test_example_data,
                                     include_lengths_numericalized,
                                     test_example_lengths)
Example #23
    def test_numericalize_postprocessing(self):
        self.write_test_ppid_dataset(data_format="tsv")

        def reverse_postprocess(arr, vocab):
            return [list(reversed(sentence)) for sentence in arr]

        question_field = data.Field(sequential=True,
                                    postprocessing=reverse_postprocess)
        tsv_fields = [("id", None), ("q1", question_field),
                      ("q2", question_field), ("label", None)]

        tsv_dataset = data.TabularDataset(
            path=self.test_ppid_dataset_path, format="tsv",
            fields=tsv_fields)
        question_field.build_vocab(tsv_dataset)

        test_example_data = [["When", "do", "you", "use", "シ",
                              "instead", "of", "し?"],
                             ["What", "is", "2+2", "<pad>", "<pad>",
                              "<pad>", "<pad>", "<pad>"],
                             ["Here", "is", "a", "sentence", "with",
                              "some", "oovs", "<pad>"]]
        reversed_test_example_data = [list(reversed(sentence)) for sentence in
                                      test_example_data]

        postprocessed_numericalized = question_field.numericalize(
            (test_example_data))
        verify_numericalized_example(question_field,
                                     reversed_test_example_data,
                                     postprocessed_numericalized)
Example #24
    def test_numericalize_stop_words(self):
        # Based on request from #354
        self.write_test_ppid_dataset(data_format="tsv")
        question_field = data.Field(sequential=True, batch_first=True,
                                    stop_words=set(["do", "you"]))
        tsv_fields = [("id", None), ("q1", question_field),
                      ("q2", question_field), ("label", None)]
        tsv_dataset = data.TabularDataset(
            path=self.test_ppid_dataset_path, format="tsv",
            fields=tsv_fields)
        question_field.build_vocab(tsv_dataset)

        test_example_data = question_field.pad(
            [question_field.preprocess(x) for x in
             [["When", "do", "you", "use", "シ",
               "instead", "of", "し?"],
              ["What", "is", "2+2", "<pad>", "<pad>",
               "<pad>", "<pad>", "<pad>"],
              ["Here", "is", "a", "sentence", "with",
               "some", "oovs", "<pad>"]]]
        )

        # Test with batch_first
        stopwords_removed_numericalized = question_field.numericalize(test_example_data)
        verify_numericalized_example(question_field,
                                     test_example_data,
                                     stopwords_removed_numericalized,
                                     batch_first=True)
Example #25
    def test_serialization_built_vocab(self):
        self.write_test_ppid_dataset(data_format="tsv")
        question_field = data.Field(sequential=True)
        tsv_fields = [("id", None), ("q1", question_field),
                      ("q2", question_field), ("label", None)]
        tsv_dataset = data.TabularDataset(
            path=self.test_ppid_dataset_path, format="tsv",
            fields=tsv_fields)

        question_field.build_vocab(tsv_dataset)

        question_pickle_filename = "question.pl"
        question_pickle_path = os.path.join(self.test_dir, question_pickle_filename)
        torch.save(question_field, question_pickle_path)

        loaded_question_field = torch.load(question_pickle_path)

        assert loaded_question_field == question_field

        test_example_data = [["When", "do", "you", "use", "シ",
                              "instead", "of", "し?"],
                             ["What", "is", "2+2", "<pad>", "<pad>",
                              "<pad>", "<pad>", "<pad>"],
                             ["Here", "is", "a", "sentence", "with",
                              "some", "oovs", "<pad>"]]

        # Test results of numericalization
        original_numericalization = question_field.numericalize(test_example_data)
        pickled_numericalization = loaded_question_field.numericalize(test_example_data)

        assert torch.all(torch.eq(original_numericalization, pickled_numericalization))
Example #26
    def test_preprocess(self):
        nesting_field = data.Field(
            tokenize=list, preprocessing=lambda xs: [x.upper() for x in xs])
        field = data.NestedField(nesting_field, preprocessing=lambda xs: reversed(xs))
        preprocessed = field.preprocess("john loves mary")

        assert preprocessed == [list("MARY"), list("LOVES"), list("JOHN")]
Example #27
    def test_errors(self):
        # Ensure that trying to retrieve a key not in JSON data errors
        self.write_test_ppid_dataset(data_format="json")

        question_field = data.Field(sequential=True)
        label_field = data.Field(sequential=False)
        fields = {
            "qeustion1": ("q1", question_field),
            "question2": ("q2", question_field),
            "label": ("label", label_field)
        }

        with self.assertRaises(ValueError):
            data.TabularDataset(path=self.test_ppid_dataset_path,
                                format="json",
                                fields=fields)
Example #28
    def test_serialization(self):
        nesting_field = data.Field(batch_first=True)
        field = data.NestedField(nesting_field)
        ex1 = data.Example.fromlist(["john loves mary"], [("words", field)])
        ex2 = data.Example.fromlist(["mary cries"], [("words", field)])
        dataset = data.Dataset([ex1, ex2], [("words", field)])
        field.build_vocab(dataset)
        examples_data = [
            [
                ["<w>", "<s>", "</w>"] + ["<cpad>"] * 4,
                ["<w>"] + list("john") + ["</w>", "<cpad>"],
                ["<w>"] + list("loves") + ["</w>"],
                ["<w>"] + list("mary") + ["</w>", "<cpad>"],
                ["<w>", "</s>", "</w>"] + ["<cpad>"] * 4,
            ],
            [
                ["<w>", "<s>", "</w>"] + ["<cpad>"] * 4,
                ["<w>"] + list("mary") + ["</w>", "<cpad>"],
                ["<w>"] + list("cries") + ["</w>"],
                ["<w>", "</s>", "</w>"] + ["<cpad>"] * 4,
                ["<cpad>"] * 7,
            ]
        ]

        field_pickle_filename = "char_field.pl"
        field_pickle_path = os.path.join(self.test_dir, field_pickle_filename)
        torch.save(field, field_pickle_path)

        loaded_field = torch.load(field_pickle_path)
        assert loaded_field == field

        original_numericalization = field.numericalize(examples_data)
        pickled_numericalization = loaded_field.numericalize(examples_data)

        assert torch.all(torch.eq(original_numericalization, pickled_numericalization))
Example #29
    def __init__(self, config, w2v_stoi, w2v_vectors, device):
        self.config = config
        self.w2v_stoi = w2v_stoi
        self.w2v_vectors = w2v_vectors
        print(self.w2v_vectors.shape)
        self.device = device
        self.SRC = data.Field(tokenize=lambda x: x.split(),
                              unk_token='<unk>',
                              pad_token='<pad>',
                              lower=True,
                              batch_first=True,
                              include_lengths=True)
        self.TRG = data.Field(
            tokenize=lambda x: x.split(),
            unk_token='<unk>',
            pad_token='<pad>',
            lower=True,
            batch_first=True,
        )
        self.train_data = TranslationDataset(
            path='dataset/klue-ner-v1_train_cleaned_tokenized',
            exts=('.src', '.trg'),
            fields=(self.SRC, self.TRG))
        self.test_data = TranslationDataset(
            path='dataset/klue-ner-v1_dev_cleaned_tokenized',
            exts=('.src', '.trg'),
            fields=(self.SRC, self.TRG))

        self.build_vocab()

        print('number of training data : {}'.format(len(self.train_data)))
        print('number of test data : {}'.format(len(self.test_data)))

        self.train_iterator = data.BucketIterator(
            self.train_data,
            batch_size=self.config['batch_size'],
            device=device,
            sort_key=lambda x: len(x.src),
            sort_within_batch=True)
        self.test_iterator = data.BucketIterator(
            self.test_data,
            batch_size=self.config['batch_size'],
            device=device,
            sort_key=lambda x: len(x.src),
            sort_within_batch=True)
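This constructor stores pretrained word2vec lookups (w2v_stoi, w2v_vectors) and then calls an unshown build_vocab. One plausible sketch, assuming the vectors are attached with Vocab.set_vectors and an illustrative min_freq value:

    def build_vocab(self):
        # Hypothetical helper (not shown above): build vocabularies from the
        # training split, then copy the pretrained word2vec rows into the
        # source vocabulary.
        self.SRC.build_vocab(self.train_data, min_freq=1)  # assumed min_freq
        self.TRG.build_vocab(self.train_data, min_freq=1)
        dim = self.w2v_vectors.shape[1]
        self.SRC.vocab.set_vectors(self.w2v_stoi,
                                   torch.as_tensor(self.w2v_vectors),
                                   dim)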
Example #30
    def test_csv_file_with_header(self):
        example_with_header = [("text", "label"), ("HELLO WORLD", "0"),
                               ("goodbye world", "1")]

        TEXT = data.Field(lower=True, tokenize=lambda x: x.split())
        fields = {
            "label": ("label", data.Field(use_vocab=False, sequential=False)),
            "text": ("text", TEXT)
        }

        for format_, delim in zip(["csv", "tsv"], [",", "\t"]):
            with open(self.test_has_header_dataset_path, "wt") as f:
                for line in example_with_header:
                    f.write("{}\n".format(delim.join(line)))

            # check that an error is raised here if a non-existent field is specified
            with self.assertRaises(ValueError):
                data.TabularDataset(
                    path=self.test_has_header_dataset_path,
                    format=format_,
                    fields={"non_existent": ("label", data.Field())})

            dataset = data.TabularDataset(
                path=self.test_has_header_dataset_path,
                format=format_,
                skip_header=False,
                fields=fields)

            TEXT.build_vocab(dataset)

            for i, example in enumerate(dataset):
                self.assertEqual(example.text,
                                 example_with_header[i + 1][0].lower().split())
                self.assertEqual(example.label, example_with_header[i + 1][1])

            # check that the vocabulary is built correctly (#225)
            expected_freqs = {"hello": 1, "world": 2, "goodbye": 1, "text": 0}
            for k, v in expected_freqs.items():
                self.assertEqual(TEXT.vocab.freqs[k], v)

            data_iter = data.Iterator(dataset,
                                      batch_size=1,
                                      sort_within_batch=False,
                                      repeat=False)
            next(data_iter.__iter__())