def __init__(self, config, tokenize_src, tokenize_trg, device):
    self.config = config
    self.device = device

    self.SRC = data.Field(tokenize=tokenize_src,
                          init_token='<sos>',
                          eos_token='<eos>',
                          pad_token='<pad>',
                          lower=True,
                          batch_first=True)
    self.TRG = data.Field(tokenize=tokenize_trg,
                          init_token='<sos>',
                          eos_token='<eos>',
                          pad_token='<pad>',
                          lower=True,
                          batch_first=True)

    self.train_data, self.valid_data, self.test_data = Multi30k.splits(
        exts=(config['src_ext'], config['trg_ext']),
        fields=(self.SRC, self.TRG))

    self.build_vocab()

    print('number of training data : {}'.format(len(self.train_data)))
    print('number of valid data : {}'.format(len(self.valid_data)))
    print('number of test data : {}'.format(len(self.test_data)))

    self.train_iterator, self.valid_iterator, self.test_iterator = data.BucketIterator.splits(
        (self.train_data, self.valid_data, self.test_data),
        batch_size=self.config['batch_size'],
        device=self.device)

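# The constructor above calls self.build_vocab(), which is not part of this
# snippet. Below is a minimal sketch of what such a method typically looks like
# with the legacy torchtext Field API; the 'min_freq' config key and its
# default value are assumptions, not taken from the original code.
def build_vocab(self):
    # Field.build_vocab counts tokens over the given splits and builds a Vocab
    # (stoi/itos) that includes the special tokens declared on the Field.
    self.SRC.build_vocab(self.train_data, min_freq=self.config.get('min_freq', 2))
    self.TRG.build_vocab(self.train_data, min_freq=self.config.get('min_freq', 2))
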
def load_line_as_data(self, line, level, lowercase, src_vocab, trg_vocab):
    tok_fun = lambda s: list(s) if level == "char" else s.split()

    src_field = data.Field(init_token=None,
                           eos_token=EOS_TOKEN,  # FIXME
                           pad_token=PAD_TOKEN,
                           tokenize=tok_fun,
                           batch_first=True,
                           lower=lowercase,
                           unk_token=UNK_TOKEN,
                           include_lengths=True)
    trg_field = data.Field(init_token=BOS_TOKEN,
                           eos_token=EOS_TOKEN,
                           pad_token=PAD_TOKEN,
                           tokenize=tok_fun,
                           unk_token=UNK_TOKEN,
                           batch_first=True,
                           lower=lowercase,
                           include_lengths=True)

    test_data = MonoLineDataset(line=line, field=src_field)
    src_field.vocab = src_vocab
    trg_field.vocab = trg_vocab

    return test_data, src_vocab, trg_vocab

def get_data():
    TEXT = data.Field(lower=True)
    UD_TAGS = data.Field(unk_token=None)
    PTB_TAGS = data.Field(unk_token=None)
    fields = (("text", TEXT), ("udtags", UD_TAGS), ("ptbtags", PTB_TAGS))

    train_data, valid_data, test_data = datasets.UDPOS.splits(fields)

    print(f"Number of training examples: {len(train_data)}")
    print(f"Number of validation examples: {len(valid_data)}")
    print(f"Number of testing examples: {len(test_data)}")

    MIN_FREQ = 2
    TEXT.build_vocab(train_data,
                     min_freq=MIN_FREQ,
                     vectors="glove.6B.100d",
                     unk_init=torch.Tensor.normal_)
    UD_TAGS.build_vocab(train_data)
    PTB_TAGS.build_vocab(train_data)

    print(f"Unique tokens in TEXT vocabulary: {len(TEXT.vocab)}")
    print(f"Unique tokens in UD_TAG vocabulary: {len(UD_TAGS.vocab)}")
    print(f"Unique tokens in PTB_TAG vocabulary: {len(PTB_TAGS.vocab)}")

    return TEXT, PTB_TAGS, train_data, test_data, valid_data

def test_csv_dataset_quotechar(self):
    # Based on issue #349
    example_data = [("text", "label"),
                    ('" hello world', "0"),
                    ('goodbye " world', "1"),
                    ('this is a pen " ', "0")]

    with tempfile.NamedTemporaryFile(dir=self.test_dir) as f:
        for example in example_data:
            f.write("{}\n".format(",".join(example)).encode("latin-1"))

        TEXT = data.Field(lower=True, tokenize=lambda x: x.split())
        fields = {
            "label": ("label", data.Field(use_vocab=False, sequential=False)),
            "text": ("text", TEXT)
        }

        f.seek(0)

        dataset = data.TabularDataset(
            path=f.name, format="csv",
            skip_header=False, fields=fields,
            csv_reader_params={"quotechar": None})

        TEXT.build_vocab(dataset)

        self.assertEqual(len(dataset), len(example_data) - 1)

        for i, example in enumerate(dataset):
            self.assertEqual(example.text,
                             example_data[i + 1][0].lower().split())
            self.assertEqual(example.label, example_data[i + 1][1])

def __init__(self, config, filepath, tokenize, device):
    self.config = config
    self.device = device

    self.SRC = data.Field(tokenize=tokenize,
                          init_token='<sos>',
                          eos_token='<eos>',
                          pad_token='<pad>',
                          lower=True,
                          batch_first=True)
    self.TRG = data.Field(tokenize=tokenize,
                          init_token='<sos>',
                          eos_token='<eos>',
                          pad_token='<pad>',
                          lower=True,
                          batch_first=True)

    self.train_data, self.valid_data, self.test_data = \
        datasets.TranslationDataset.splits(path=filepath,
                                           exts=('.src', '.trg'),
                                           fields=(self.SRC, self.TRG))

    self.train_iterator, self.valid_iterator, self.test_iterator = data.BucketIterator.splits(
        (self.train_data, self.valid_data, self.test_data),
        batch_size=self.config['batch_size'],
        device=self.device)

    self.build_vocab()

def test_pad_when_fix_length_is_not_none(self):
    nesting_field = data.Field(tokenize=list, unk_token="<cunk>",
                               pad_token="<cpad>", init_token="<w>",
                               eos_token="</w>")
    CHARS = data.NestedField(nesting_field, init_token="<s>",
                             eos_token="</s>", fix_length=3)
    minibatch = [
        ["john", "loves", "mary"],
        ["mary", "cries"]
    ]
    expected = [
        [
            ["<w>", "<s>", "</w>"] + ["<cpad>"] * 4,
            ["<w>"] + list("john") + ["</w>", "<cpad>"],
            ["<w>", "</s>", "</w>"] + ["<cpad>"] * 4,
        ],
        [
            ["<w>", "<s>", "</w>"] + ["<cpad>"] * 4,
            ["<w>"] + list("mary") + ["</w>", "<cpad>"],
            ["<w>", "</s>", "</w>"] + ["<cpad>"] * 4,
        ]
    ]

    assert CHARS.pad(minibatch) == expected

    # test include_lengths
    nesting_field = data.Field(tokenize=list, unk_token="<cunk>",
                               pad_token="<cpad>", init_token="<w>",
                               eos_token="</w>")
    CHARS = data.NestedField(nesting_field, init_token="<s>",
                             eos_token="</s>", include_lengths=True,
                             fix_length=3)

    arr, seq_len, words_len = CHARS.pad(minibatch)

    assert arr == expected
    assert seq_len == [3, 3]
    assert words_len == [[3, 6, 3], [3, 6, 3]]

def test_json_valid_and_invalid_nested_key(self):
    self.write_test_nested_key_json_dataset()
    valid_fields = {
        'foods.vegetables.name': ('vegs', data.Field()),
        'foods.fruits': ('fruits', data.Field())
    }
    invalid_fields = {'foods.vegetables.color': ('vegs', data.Field())}

    expected_examples = [{
        "fruits": ["Apple", "Banana"],
        "vegs": ["Broccoli", "Cabbage"]
    }, {
        "fruits": ["Cherry", "Grape", "Lemon"],
        "vegs": ["Cucumber", "Lettuce"]
    }, {
        "fruits": ["Orange", "Pear", "Strawberry"],
        "vegs": ["Marrow", "Spinach"]
    }]
    dataset = data.TabularDataset(
        path=self.test_nested_key_json_dataset_path,
        format="json",
        fields=valid_fields)

    # check results
    for example, expect in zip(dataset.examples, expected_examples):
        self.assertEqual(example.vegs, expect['vegs'])
        self.assertEqual(example.fruits, expect['fruits'])

    with self.assertRaises(ValueError):
        data.TabularDataset(path=self.test_nested_key_json_dataset_path,
                            format="json",
                            fields=invalid_fields)

def create_fields(opt):
    spacy_langs = ['en', 'fr', 'de', 'es', 'pt', 'it', 'nl']
    if opt.src_lang not in spacy_langs:
        print('invalid src language: ' + opt.src_lang +
              '; supported languages: ' + ', '.join(spacy_langs))
    if opt.trg_lang not in spacy_langs:
        print('invalid trg language: ' + opt.trg_lang +
              '; supported languages: ' + ', '.join(spacy_langs))

    print("loading spacy tokenizers...")

    t_src = tokenize(opt.src_lang)
    t_trg = tokenize(opt.trg_lang)

    TRG = data.Field(lower=True, tokenize=t_trg.tokenizer,
                     init_token='<sos>', eos_token='<eos>')
    SRC = data.Field(lower=True, tokenize=t_src.tokenizer)

    if opt.load_weights is not None:
        try:
            print("loading presaved fields...")
            SRC = pickle.load(open(f'{opt.load_weights}/SRC.pkl', 'rb'))
            TRG = pickle.load(open(f'{opt.load_weights}/TRG.pkl', 'rb'))
        except:
            print("error opening SRC.pkl and TRG.pkl field files, "
                  "please ensure they are in " + opt.load_weights + "/")
            quit()

    return SRC, TRG

def __init__(self, max_size=999999, min_freq=1):
    super().__init__()

    self.label = data.Field(sequential=False)
    self.text = data.Field(batch_first=True, unk_token='<unk>')

    self.max_size = max_size
    self.min_freq = min_freq

def get_files(path, train_size, max_doc_len, seed, tokenizer):
    # including lengths makes the text var a tuple containing the tweet and its length
    Text = data.Field(preprocessing=tweet_cleanup, tokenize=tokenizer,
                      batch_first=True, include_lengths=True,
                      fix_length=max_doc_len, lower=True)
    Label = data.Field(sequential=False, use_vocab=False,
                       pad_token=None, unk_token=None)
    fields = [('text', Text), ('labels', Label)]

    # builds a pytorch dataset from the given training and testing files
    train_data, test_data = data.TabularDataset.splits(
        path=path,
        train='../data/train_bin_labels.csv',
        test='../data/test_bin_labels.csv',
        format='csv',
        fields=fields,
        skip_header=True)

    train_data, val_data = train_data.split(split_ratio=train_size,
                                            random_state=random.seed(seed))

    print(f'Number of training examples: {len(train_data)}')
    print(f'Number of validation examples: {len(val_data)}')
    print(f'Number of testing examples: {len(test_data)}')

    return train_data, val_data, test_data, Text, Label

def test_batch_iter(self):
    self.write_test_numerical_features_dataset()
    FLOAT = data.Field(use_vocab=False, sequential=False, dtype=torch.float)
    INT = data.Field(use_vocab=False, sequential=False, is_target=True)
    TEXT = data.Field(sequential=False)

    dst = data.TabularDataset(
        path=self.test_numerical_features_dataset_path,
        format="tsv", skip_header=False,
        fields=[("float", FLOAT), ("int", INT), ("text", TEXT)])
    TEXT.build_vocab(dst)
    itr = data.Iterator(dst, batch_size=2, device=-1, shuffle=False)
    fld_order = [k for k, v in dst.fields.items()
                 if v is not None and not v.is_target]
    batch = next(iter(itr))
    (x1, x2), y = batch
    x = (x1, x2)[fld_order.index("float")]

    self.assertEqual(y.data[0], 1)
    self.assertEqual(y.data[1], 12)
    self.assertAlmostEqual(x.data[0], 0.1, places=4)
    self.assertAlmostEqual(x.data[1], 0.5, places=4)

def __init__(self, config, device):
    self.config = config
    self.device = device

    # corpus_separator(filepath)

    self.title = data.Field(tokenize=lambda x: x.split(' '),
                            lower=True,
                            batch_first=True,
                            include_lengths=True)
    self.label = data.Field(lower=True, batch_first=True)

    fields = [('label', self.label), ('title', self.title)]

    self.train_data, self.valid_data, self.test_data = data.TabularDataset.splits(
        path=self.config['cls_dir_path'],
        train='train_tokenized.ynat',
        validation='val_tokenized.ynat',
        test='test_tokenized.ynat',
        format='tsv',
        fields=fields)

    self.build_vocab()

    print('number of training data : {}'.format(len(self.train_data)))
    print('number of valid data : {}'.format(len(self.valid_data)))
    print('number of test data : {}'.format(len(self.test_data)))

    self.train_iterator, self.valid_iterator, self.test_iterator = data.BucketIterator.splits(
        (self.train_data, self.valid_data, self.test_data),
        sort=True,
        sort_within_batch=True,
        batch_size=self.config['cls_batch_size'],
        device=self.device,
        sort_key=lambda x: len(x.title))

def __init__(self) -> None:
    self.SRC = data.Field(pad_token=Constants.PAD,
                          unk_token=Constants.UNK,
                          batch_first=True)
    self.TGT = data.Field(init_token=Constants.START,
                          eos_token=Constants.END,
                          pad_token=Constants.PAD,
                          unk_token=Constants.UNK,
                          batch_first=True)

def main():
    global WORD
    WORD = data.Field(include_lengths=True, batch_first=True,
                      eos_token=None, init_token=None)
    LABEL = data.Field(sequential=False, batch_first=True)
    TREE = data.RawField(postprocessing=ListOpsDataset.tree_field(WORD))
    TREE.is_target = False
    train = ListOpsDataset(
        "data/train_d20s.tsv",
        (("word", WORD), ("label", LABEL), ("tree", TREE)),
        filter_pred=lambda x: 5 < len(x.word) < config["train_len"],
    )
    WORD.build_vocab(train)
    LABEL.build_vocab(train)
    valid = ListOpsDataset(
        "data/test_d20s.tsv",
        (("word", WORD), ("label", LABEL), ("tree", TREE)),
        filter_pred=lambda x: 5 < len(x.word) < 150,
    )

    train_iter = TokenBucket(train, batch_size=1500,
                             device="cuda:0", key=lambda x: len(x.word))
    train_iter.repeat = False
    # evaluation iterator over the held-out split
    valid_iter = data.BucketIterator(valid, batch_size=50, train=False,
                                     sort=False, device="cuda:0")

    NT = 1
    T = len(WORD.vocab)
    V = T

    if True:
        tree_lstm = TreeLSTM(config["H"],
                             len(WORD.vocab) + 100,
                             len(LABEL.vocab)).cuda()
        for p in tree_lstm.parameters():
            if p.dim() > 1:
                torch.nn.init.xavier_uniform_(p)
        model = SpanLSTM(NT, len(WORD.vocab), config["H"]).cuda()
        for p in model.parameters():
            if p.dim() > 1:
                torch.nn.init.xavier_uniform_(p)

        wandb.watch((model, tree_lstm))
        print(wandb.config)
        tree = run_train(train_iter, valid_iter, model, tree_lstm, V)
    else:
        print("loading")
        model, tree_lstm = torch.load("cp.yoyo.model")
        print(valid_sup(valid_iter, model, tree_lstm, V))

def test_subword_trec(self):
    TEXT = data.SubwordField()
    LABEL = data.Field(sequential=False)
    RAW = data.Field(sequential=False, use_vocab=False)
    raw, _ = TREC.splits(RAW, LABEL)
    cooked, _ = TREC.splits(TEXT, LABEL)
    LABEL.build_vocab(cooked)
    TEXT.build_vocab(cooked, max_size=100)
    TEXT.segment(cooked)
    print(cooked[0].text)
    batch = next(iter(data.Iterator(cooked, 1, shuffle=False)))
    self.assertEqual(TEXT.reverse(batch.text.data)[0], raw[0].text)

def test_tabular_simple_data(self):
    for data_format in ["csv", "tsv", "json"]:
        self.write_test_ppid_dataset(data_format=data_format)

        if data_format == "json":
            question_field = data.Field(sequential=True)
            label_field = data.Field(sequential=False)
            fields = {
                "question1": ("q1", question_field),
                "question2": ("q2", question_field),
                "label": ("label", label_field)
            }
        else:
            question_field = data.Field(sequential=True)
            label_field = data.Field(sequential=False)
            fields = [("id", None), ("q1", question_field),
                      ("q2", question_field), ("label", label_field)]

        dataset = data.TabularDataset(
            path=self.test_ppid_dataset_path,
            format=data_format,
            fields=fields)

        assert len(dataset) == 3

        expected_examples = [
            (["When", "do", "you", "use", "シ", "instead", "of", "し?"],
             ["When", "do", "you", "use", "\"&\"", "instead", "of", "\"and\"?"],
             "0"),
            (["Where", "was", "Lincoln", "born?"],
             ["Which", "location", "was", "Abraham", "Lincoln", "born?"],
             "1"),
            (["What", "is", "2+2"], ["2+2=?"], "1")
        ]

        # Ensure examples have correct contents / test __getitem__
        for i in range(len(dataset)):
            self.assertEqual(dataset[i].q1, expected_examples[i][0])
            self.assertEqual(dataset[i].q2, expected_examples[i][1])
            self.assertEqual(dataset[i].label, expected_examples[i][2])

        # Test __getattr__
        for i, (q1, q2, label) in enumerate(
                zip(dataset.q1, dataset.q2, dataset.label)):
            self.assertEqual(q1, expected_examples[i][0])
            self.assertEqual(q2, expected_examples[i][1])
            self.assertEqual(label, expected_examples[i][2])

        # Test __iter__
        for i, example in enumerate(dataset):
            self.assertEqual(example.q1, expected_examples[i][0])
            self.assertEqual(example.q2, expected_examples[i][1])
            self.assertEqual(example.label, expected_examples[i][2])

def main(config):
    saved_data = torch.load(
        config.model_fn,
        map_location='cpu' if config.gpu_id < 0 else 'cuda:{}'.format(config.gpu_id))

    model_dict = saved_data['model']
    train_config = saved_data['config']
    vocab = saved_data['vocab']
    label = saved_data['label']

    text_field = data.Field(batch_first=True, unk_token='<unk>')
    label_field = data.Field(sequential=False)
    text_field.vocab = vocab
    label_field.vocab = label

    lines = open_file(train_config)

    with torch.no_grad():
        model = DisasterClassifier(input_size=len(vocab),
                                   embedding_dim=train_config.embedding_dim,
                                   num_layers=train_config.num_layers,
                                   hidden_size=train_config.hidden_size,
                                   dropout=train_config.dropout,
                                   n_classes=len(label))
        model.load_state_dict(model_dict)
        model.eval()

        y_hat = []
        for i in range(0, len(lines), config.batch_size):
            x = text_field.numericalize(
                text_field.pad(lines[i:i + config.batch_size]),
                device='cpu' if config.gpu_id < 0 else 'cuda:{}'.format(config.gpu_id))
            y_hat.append(model(x).cpu())

        y_hat = torch.cat(y_hat, dim=0)
        probs, indices = torch.topk(y_hat, config.top_k, dim=-1)

        with open('{}_prediction.tsv'.format(config.model_fn[:-4]),
                  'w', -1, encoding='utf-8') as f:
            for i in range(len(lines)):
                f.write('{}\t{}\n'.format(
                    ' '.join(label.itos[indices[i][j]]
                             for j in range(config.top_k)),
                    ' '.join(lines[i])))

def _preprocess_splits(self, h5py_file: h5py.File):
    TEXT = data.Field(lower=True, include_lengths=True, batch_first=True)
    LABEL = data.Field(sequential=False)
    with tempfile.TemporaryDirectory() as tmpdirname:
        train_set, test_set = datasets.trec.TREC.splits(
            TEXT, LABEL, root=tmpdirname, fine_grained=True)
        self._preprocess_split(h5py_file, split_name="train",
                               dataset_split=train_set)
        self._preprocess_split(h5py_file, split_name="test",
                               dataset_split=test_set)

def __init__(self, max_vocab=9999, min_freq=1,
             init_token='<bos>', eos_token='<eos>'):
    super().__init__()

    self.max_vocab = max_vocab
    self.min_freq = min_freq
    self.label = data.Field(sequential=False, unk_token=None)
    self.text = data.Field(
        init_token=init_token,
        eos_token=eos_token,
        batch_first=True,
    )

def filter_init(ex_val1, ex_val2, ex_val3):
    text_field = data.Field(sequential=True)
    label_field = data.Field(sequential=False)
    fields = [("text1", text_field), ("text2", text_field),
              ("label", label_field)]

    example1 = data.Example.fromlist(ex_val1, fields)
    example2 = data.Example.fromlist(ex_val2, fields)
    example3 = data.Example.fromlist(ex_val3, fields)
    examples = [example1, example2, example3]

    dataset = data.Dataset(examples, fields)
    text_field.build_vocab(dataset)

    return dataset, text_field

def get_essentials(train_df, max_seq_length=128, train_batch_size=16):
    X, y = train_df.iloc[:, 0].values, train_df.iloc[:, 1].values
    text_field = data.Field()
    text_field.build_vocab(X, max_size=10000)

    X_split = [t.split() for t in X]

    # pad
    X_pad = [pad(s, max_seq_length) for s in X_split]

    # to index
    X_index = [to_indexes(text_field.vocab, s) for s in X_pad]

    train_dataset = to_dataset(X_index, y)
    train_sampler = SequentialSampler(train_dataset)
    train_loader = DataLoader(
        train_dataset,
        sampler=train_sampler,
        batch_size=train_batch_size,
        drop_last=True,
    )
    return text_field, train_loader

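# get_essentials relies on helpers (pad, to_indexes, to_dataset) that are not
# part of this snippet. The sketches below are assumptions about their shape,
# not the original implementations: pad truncates/right-pads a token list,
# to_indexes maps tokens to vocab ids, and to_dataset wraps everything for the
# DataLoader above (assuming integer class labels).
import torch
from torch.utils.data import TensorDataset

def pad(tokens, max_seq_length, pad_token='<pad>'):
    # truncate to max_seq_length, then right-pad with the pad token
    tokens = tokens[:max_seq_length]
    return tokens + [pad_token] * (max_seq_length - len(tokens))

def to_indexes(vocab, tokens):
    # map each token to its vocab index; the legacy torchtext Vocab.stoi is a
    # defaultdict, so unknown tokens fall back to the <unk> index
    return [vocab.stoi[t] for t in tokens]

def to_dataset(x_index, y):
    # pack padded index sequences and labels into a TensorDataset
    return TensorDataset(torch.tensor(x_index, dtype=torch.long),
                         torch.tensor(y, dtype=torch.long))
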
def test_numericalize_include_lengths(self):
    self.write_test_ppid_dataset(data_format="tsv")
    question_field = data.Field(sequential=True, include_lengths=True)
    tsv_fields = [("id", None), ("q1", question_field),
                  ("q2", question_field), ("label", None)]
    tsv_dataset = data.TabularDataset(
        path=self.test_ppid_dataset_path, format="tsv",
        fields=tsv_fields)
    question_field.build_vocab(tsv_dataset)

    test_example_data = [["When", "do", "you", "use", "シ",
                          "instead", "of", "し?"],
                         ["What", "is", "2+2", "<pad>", "<pad>",
                          "<pad>", "<pad>", "<pad>"],
                         ["Here", "is", "a", "sentence", "with",
                          "some", "oovs", "<pad>"]]
    test_example_lengths = [8, 3, 7]

    # Test with include_lengths
    include_lengths_numericalized = question_field.numericalize(
        (test_example_data, test_example_lengths))
    verify_numericalized_example(question_field,
                                 test_example_data,
                                 include_lengths_numericalized,
                                 test_example_lengths)

def test_numericalize_postprocessing(self):
    self.write_test_ppid_dataset(data_format="tsv")

    def reverse_postprocess(arr, vocab):
        return [list(reversed(sentence)) for sentence in arr]

    question_field = data.Field(sequential=True,
                                postprocessing=reverse_postprocess)
    tsv_fields = [("id", None), ("q1", question_field),
                  ("q2", question_field), ("label", None)]

    tsv_dataset = data.TabularDataset(
        path=self.test_ppid_dataset_path, format="tsv",
        fields=tsv_fields)
    question_field.build_vocab(tsv_dataset)

    test_example_data = [["When", "do", "you", "use", "シ",
                          "instead", "of", "し?"],
                         ["What", "is", "2+2", "<pad>", "<pad>",
                          "<pad>", "<pad>", "<pad>"],
                         ["Here", "is", "a", "sentence", "with",
                          "some", "oovs", "<pad>"]]
    reversed_test_example_data = [list(reversed(sentence))
                                  for sentence in test_example_data]

    postprocessed_numericalized = question_field.numericalize(
        test_example_data)
    verify_numericalized_example(question_field,
                                 reversed_test_example_data,
                                 postprocessed_numericalized)

def test_numericalize_stop_words(self):
    # Based on request from #354
    self.write_test_ppid_dataset(data_format="tsv")
    question_field = data.Field(sequential=True, batch_first=True,
                                stop_words=set(["do", "you"]))
    tsv_fields = [("id", None), ("q1", question_field),
                  ("q2", question_field), ("label", None)]
    tsv_dataset = data.TabularDataset(
        path=self.test_ppid_dataset_path, format="tsv",
        fields=tsv_fields)
    question_field.build_vocab(tsv_dataset)

    test_example_data = question_field.pad(
        [question_field.preprocess(x)
         for x in [["When", "do", "you", "use", "シ",
                    "instead", "of", "し?"],
                   ["What", "is", "2+2", "<pad>", "<pad>",
                    "<pad>", "<pad>", "<pad>"],
                   ["Here", "is", "a", "sentence", "with",
                    "some", "oovs", "<pad>"]]]
    )

    # Test with batch_first
    stopwords_removed_numericalized = question_field.numericalize(
        test_example_data)
    verify_numericalized_example(question_field,
                                 test_example_data,
                                 stopwords_removed_numericalized,
                                 batch_first=True)

def test_serialization_built_vocab(self):
    self.write_test_ppid_dataset(data_format="tsv")
    question_field = data.Field(sequential=True)
    tsv_fields = [("id", None), ("q1", question_field),
                  ("q2", question_field), ("label", None)]
    tsv_dataset = data.TabularDataset(
        path=self.test_ppid_dataset_path, format="tsv",
        fields=tsv_fields)

    question_field.build_vocab(tsv_dataset)

    question_pickle_filename = "question.pl"
    question_pickle_path = os.path.join(self.test_dir,
                                        question_pickle_filename)
    torch.save(question_field, question_pickle_path)

    loaded_question_field = torch.load(question_pickle_path)

    assert loaded_question_field == question_field

    test_example_data = [["When", "do", "you", "use", "シ",
                          "instead", "of", "し?"],
                         ["What", "is", "2+2", "<pad>", "<pad>",
                          "<pad>", "<pad>", "<pad>"],
                         ["Here", "is", "a", "sentence", "with",
                          "some", "oovs", "<pad>"]]

    # Test results of numericalization
    original_numericalization = question_field.numericalize(test_example_data)
    pickled_numericalization = loaded_question_field.numericalize(test_example_data)

    assert torch.all(torch.eq(original_numericalization,
                              pickled_numericalization))

def test_preprocess(self):
    nesting_field = data.Field(
        tokenize=list, preprocessing=lambda xs: [x.upper() for x in xs])
    field = data.NestedField(nesting_field,
                             preprocessing=lambda xs: reversed(xs))
    preprocessed = field.preprocess("john loves mary")

    assert preprocessed == [list("MARY"), list("LOVES"), list("JOHN")]

def test_errors(self):
    # Ensure that trying to retrieve a key not in JSON data errors
    self.write_test_ppid_dataset(data_format="json")

    question_field = data.Field(sequential=True)
    label_field = data.Field(sequential=False)
    # "qeustion1" is deliberately misspelled so that the key is absent from the data
    fields = {
        "qeustion1": ("q1", question_field),
        "question2": ("q2", question_field),
        "label": ("label", label_field)
    }
    with self.assertRaises(ValueError):
        data.TabularDataset(path=self.test_ppid_dataset_path,
                            format="json", fields=fields)

def test_serialization(self):
    nesting_field = data.Field(batch_first=True)
    field = data.NestedField(nesting_field)
    ex1 = data.Example.fromlist(["john loves mary"], [("words", field)])
    ex2 = data.Example.fromlist(["mary cries"], [("words", field)])
    dataset = data.Dataset([ex1, ex2], [("words", field)])
    field.build_vocab(dataset)
    examples_data = [
        [
            ["<w>", "<s>", "</w>"] + ["<cpad>"] * 4,
            ["<w>"] + list("john") + ["</w>", "<cpad>"],
            ["<w>"] + list("loves") + ["</w>"],
            ["<w>"] + list("mary") + ["</w>", "<cpad>"],
            ["<w>", "</s>", "</w>"] + ["<cpad>"] * 4,
        ],
        [
            ["<w>", "<s>", "</w>"] + ["<cpad>"] * 4,
            ["<w>"] + list("mary") + ["</w>", "<cpad>"],
            ["<w>"] + list("cries") + ["</w>"],
            ["<w>", "</s>", "</w>"] + ["<cpad>"] * 4,
            ["<cpad>"] * 7,
        ]
    ]

    field_pickle_filename = "char_field.pl"
    field_pickle_path = os.path.join(self.test_dir, field_pickle_filename)
    torch.save(field, field_pickle_path)

    loaded_field = torch.load(field_pickle_path)
    assert loaded_field == field

    original_numericalization = field.numericalize(examples_data)
    pickled_numericalization = loaded_field.numericalize(examples_data)

    assert torch.all(torch.eq(original_numericalization,
                              pickled_numericalization))

def __init__(self, config, w2v_stoi, w2v_vectors, device):
    self.config = config
    self.w2v_stoi = w2v_stoi
    self.w2v_vectors = w2v_vectors
    print(self.w2v_vectors.shape)
    self.device = device

    self.SRC = data.Field(tokenize=lambda x: x.split(),
                          unk_token='<unk>',
                          pad_token='<pad>',
                          lower=True,
                          batch_first=True,
                          include_lengths=True)
    self.TRG = data.Field(tokenize=lambda x: x.split(),
                          unk_token='<unk>',
                          pad_token='<pad>',
                          lower=True,
                          batch_first=True)

    self.train_data = TranslationDataset(
        path='dataset/klue-ner-v1_train_cleaned_tokenized',
        exts=('.src', '.trg'),
        fields=(self.SRC, self.TRG))
    self.test_data = TranslationDataset(
        path='dataset/klue-ner-v1_dev_cleaned_tokenized',
        exts=('.src', '.trg'),
        fields=(self.SRC, self.TRG))

    self.build_vocab()

    print('number of training data : {}'.format(len(self.train_data)))
    print('number of test data : {}'.format(len(self.test_data)))

    self.train_iterator = data.BucketIterator(
        self.train_data,
        batch_size=self.config['batch_size'],
        device=device,
        sort_key=lambda x: len(x.src),
        sort_within_batch=True)
    self.test_iterator = data.BucketIterator(
        self.test_data,
        batch_size=self.config['batch_size'],
        device=device,
        sort_key=lambda x: len(x.src),
        sort_within_batch=True)

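# As in the earlier loaders, self.build_vocab() is not shown here. A plausible
# sketch follows, assuming the pretrained word2vec rows in w2v_vectors are meant
# to be attached to the source vocabulary via the legacy Vocab.set_vectors API;
# the conversion of w2v_vectors with torch.tensor is also an assumption.
def build_vocab(self):
    self.SRC.build_vocab(self.train_data)
    self.TRG.build_vocab(self.train_data)
    # copy pretrained embeddings for every token found in w2v_stoi;
    # tokens missing from w2v_stoi keep the default (zero) initialization
    self.SRC.vocab.set_vectors(self.w2v_stoi,
                               torch.tensor(self.w2v_vectors),
                               dim=self.w2v_vectors.shape[1])
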
def test_csv_file_with_header(self):
    example_with_header = [("text", "label"),
                           ("HELLO WORLD", "0"),
                           ("goodbye world", "1")]

    TEXT = data.Field(lower=True, tokenize=lambda x: x.split())
    fields = {
        "label": ("label", data.Field(use_vocab=False, sequential=False)),
        "text": ("text", TEXT)
    }

    for format_, delim in zip(["csv", "tsv"], [",", "\t"]):
        with open(self.test_has_header_dataset_path, "wt") as f:
            for line in example_with_header:
                f.write("{}\n".format(delim.join(line)))

        # check that an error is raised here if a non-existent field is specified
        with self.assertRaises(ValueError):
            data.TabularDataset(
                path=self.test_has_header_dataset_path,
                format=format_,
                fields={"non_existent": ("label", data.Field())})

        dataset = data.TabularDataset(
            path=self.test_has_header_dataset_path,
            format=format_,
            skip_header=False,
            fields=fields)

        TEXT.build_vocab(dataset)

        for i, example in enumerate(dataset):
            self.assertEqual(example.text,
                             example_with_header[i + 1][0].lower().split())
            self.assertEqual(example.label, example_with_header[i + 1][1])

        # check that the vocabulary is built correctly (#225)
        expected_freqs = {"hello": 1, "world": 2, "goodbye": 1, "text": 0}
        for k, v in expected_freqs.items():
            self.assertEqual(TEXT.vocab.freqs[k], v)

        data_iter = data.Iterator(dataset, batch_size=1,
                                  sort_within_batch=False, repeat=False)
        next(data_iter.__iter__())