def test_json_valid_and_invalid_nested_key(self):
    """Nested JSON keys load when present and raise ValueError when absent."""
    self.write_test_nested_key_json_dataset()
    ok_fields = {
        'foods.vegetables.name': ('vegs', data.Field()),
        'foods.fruits': ('fruits', data.Field()),
    }
    bad_fields = {'foods.vegetables.color': ('vegs', data.Field())}
    expected = [
        {"fruits": ["Apple", "Banana"],
         "vegs": ["Broccoli", "Cabbage"]},
        {"fruits": ["Cherry", "Grape", "Lemon"],
         "vegs": ["Cucumber", "Lettuce"]},
        {"fruits": ["Orange", "Pear", "Strawberry"],
         "vegs": ["Marrow", "Spinach"]},
    ]
    dataset = data.TabularDataset(
        path=self.test_nested_key_json_dataset_path,
        format="json",
        fields=ok_fields)
    # Each parsed example must match the expected nested-key extraction.
    for got, want in zip(dataset.examples, expected):
        self.assertEqual(got.vegs, want['vegs'])
        self.assertEqual(got.fruits, want['fruits'])
    # Requesting a key that does not exist in the JSON is a hard error.
    with self.assertRaises(ValueError):
        data.TabularDataset(path=self.test_nested_key_json_dataset_path,
                            format="json", fields=bad_fields)
def test_serialization_built_vocab(self):
    """A Field with a built vocab survives a torch.save/torch.load round trip."""
    self.write_test_ppid_dataset(data_format="tsv")
    q_field = data.Field(sequential=True)
    fields = [("id", None), ("q1", q_field),
              ("q2", q_field), ("label", None)]
    dataset = data.TabularDataset(
        path=self.test_ppid_dataset_path, format="tsv", fields=fields)
    q_field.build_vocab(dataset)

    pickle_path = os.path.join(self.test_dir, "question.pl")
    torch.save(q_field, pickle_path)
    restored = q_field
    restored = torch.load(pickle_path)

    assert restored == q_field

    examples = [
        ["When", "do", "you", "use", "シ", "instead", "of", "し?"],
        ["What", "is", "2+2", "<pad>", "<pad>", "<pad>", "<pad>", "<pad>"],
        ["Here", "is", "a", "sentence", "with", "some", "oovs", "<pad>"],
    ]

    # Numericalization must produce identical tensors before and after
    # the serialization round trip.
    before = q_field.numericalize(examples)
    after = restored.numericalize(examples)
    assert torch.all(torch.eq(before, after))
def get_loaders(self, config):
    """Split a TSV file into train/valid BucketIterators and fit the vocabs."""
    fields = [('label', self.label), ('text', self.text)]
    train, valid = data.TabularDataset(
        path=config.file_path,
        format='tsv',
        fields=fields,
    ).split(split_ratio=config.train_ratio)

    target_device = ('cuda:{}'.format(config.gpu_id)
                     if config.gpu_id >= 0 else 'cpu')
    self.train_loader, self.valid_loader = data.BucketIterator.splits(
        (train, valid),
        batch_size=config.batch_size,
        device=target_device,
        shuffle=True,
        sort_key=lambda x: len(x.text),
        sort_within_batch=True)

    # Vocabularies are fit on the training split only.
    self.label.build_vocab(train)
    self.text.build_vocab(train,
                          max_size=self.max_vocab,
                          min_freq=self.min_freq)
    return self.train_loader, self.valid_loader
def test_batch_iter(self):
    """Batches expose target fields as ``y`` and inputs in declared field order."""
    self.write_test_numerical_features_dataset()
    FLOAT = data.Field(use_vocab=False, sequential=False, dtype=torch.float)
    INT = data.Field(use_vocab=False, sequential=False, is_target=True)
    TEXT = data.Field(sequential=False)

    dst = data.TabularDataset(
        path=self.test_numerical_features_dataset_path,
        format="tsv", skip_header=False,
        fields=[("float", FLOAT), ("int", INT), ("text", TEXT)])
    TEXT.build_vocab(dst)
    itr = data.Iterator(dst, batch_size=2, device=-1, shuffle=False)
    # Non-target fields, in the order the iterator yields them.
    fld_order = [k for k, v in dst.fields.items()
                 if v is not None and not v.is_target]
    batch = next(iter(itr))
    (x1, x2), y = batch
    x = (x1, x2)[fld_order.index("float")]
    # Fix: assertEquals is a deprecated alias (removed in Python 3.12);
    # use assertEqual instead.
    self.assertEqual(y.data[0], 1)
    self.assertEqual(y.data[1], 12)
    self.assertAlmostEqual(x.data[0], 0.1, places=4)
    self.assertAlmostEqual(x.data[1], 0.5, places=4)
def test_numericalize_stop_words(self):
    # Regression test for #354: stop words are removed before numericalizing.
    self.write_test_ppid_dataset(data_format="tsv")
    q_field = data.Field(sequential=True, batch_first=True,
                         stop_words=set(["do", "you"]))
    fields = [("id", None), ("q1", q_field),
              ("q2", q_field), ("label", None)]
    dataset = data.TabularDataset(
        path=self.test_ppid_dataset_path, format="tsv", fields=fields)
    q_field.build_vocab(dataset)

    raw_examples = [
        ["When", "do", "you", "use", "シ", "instead", "of", "し?"],
        ["What", "is", "2+2", "<pad>", "<pad>", "<pad>", "<pad>", "<pad>"],
        ["Here", "is", "a", "sentence", "with", "some", "oovs", "<pad>"],
    ]
    padded = q_field.pad([q_field.preprocess(ex) for ex in raw_examples])

    # Test with batch_first
    numericalized = q_field.numericalize(padded)
    verify_numericalized_example(q_field, padded, numericalized,
                                 batch_first=True)
def test_csv_dataset_quotechar(self):
    # Regression test for #349: csv_reader_params={"quotechar": None}
    # disables quote handling so bare quotes survive tokenization.
    raw_rows = [("text", "label"),
                ('" hello world', "0"),
                ('goodbye " world', "1"),
                ('this is a pen " ', "0")]
    with tempfile.NamedTemporaryFile(dir=self.test_dir) as f:
        for row in raw_rows:
            f.write("{}\n".format(",".join(row)).encode("latin-1"))

        TEXT = data.Field(lower=True, tokenize=lambda x: x.split())
        fields = {
            "label": ("label", data.Field(use_vocab=False, sequential=False)),
            "text": ("text", TEXT),
        }
        f.seek(0)
        dataset = data.TabularDataset(
            path=f.name, format="csv", skip_header=False,
            fields=fields, csv_reader_params={"quotechar": None})
        TEXT.build_vocab(dataset)
        # Header row counts as data here (skip_header=False is deliberate);
        # the dataset still has one example per non-header row checked below.
        self.assertEqual(len(dataset), len(raw_rows) - 1)
        for idx, example in enumerate(dataset):
            self.assertEqual(example.text,
                             raw_rows[idx + 1][0].lower().split())
            self.assertEqual(example.label, raw_rows[idx + 1][1])
def test_numericalize_include_lengths(self):
    """numericalize accepts a (data, lengths) pair when include_lengths is set."""
    self.write_test_ppid_dataset(data_format="tsv")
    q_field = data.Field(sequential=True, include_lengths=True)
    fields = [("id", None), ("q1", q_field),
              ("q2", q_field), ("label", None)]
    dataset = data.TabularDataset(
        path=self.test_ppid_dataset_path, format="tsv", fields=fields)
    q_field.build_vocab(dataset)

    padded = [
        ["When", "do", "you", "use", "シ", "instead", "of", "し?"],
        ["What", "is", "2+2", "<pad>", "<pad>", "<pad>", "<pad>", "<pad>"],
        ["Here", "is", "a", "sentence", "with", "some", "oovs", "<pad>"],
    ]
    lengths = [8, 3, 7]

    # Test with include_lengths
    numericalized = q_field.numericalize((padded, lengths))
    verify_numericalized_example(q_field, padded, numericalized, lengths)
def test_numericalize_postprocessing(self):
    """A postprocessing hook runs on the raw batch before tensor creation."""
    self.write_test_ppid_dataset(data_format="tsv")

    def reverse_postprocess(arr, vocab):
        return [list(reversed(sentence)) for sentence in arr]

    q_field = data.Field(sequential=True,
                         postprocessing=reverse_postprocess)
    fields = [("id", None), ("q1", q_field),
              ("q2", q_field), ("label", None)]
    dataset = data.TabularDataset(
        path=self.test_ppid_dataset_path, format="tsv", fields=fields)
    q_field.build_vocab(dataset)

    batch = [
        ["When", "do", "you", "use", "シ", "instead", "of", "し?"],
        ["What", "is", "2+2", "<pad>", "<pad>", "<pad>", "<pad>", "<pad>"],
        ["Here", "is", "a", "sentence", "with", "some", "oovs", "<pad>"],
    ]
    # The hook reverses every sentence, so expect reversed token lists.
    reversed_batch = [list(reversed(sentence)) for sentence in batch]

    numericalized = q_field.numericalize(batch)
    verify_numericalized_example(q_field, reversed_batch, numericalized)
def test_csv_file_with_header(self):
    """CSV/TSV files with a header row load correctly and build a vocab."""
    rows = [("text", "label"), ("HELLO WORLD", "0"), ("goodbye world", "1")]
    TEXT = data.Field(lower=True, tokenize=lambda x: x.split())
    fields = {
        "label": ("label", data.Field(use_vocab=False, sequential=False)),
        "text": ("text", TEXT),
    }
    for format_, delim in zip(["csv", "tsv"], [",", "\t"]):
        with open(self.test_has_header_dataset_path, "wt") as f:
            for row in rows:
                f.write("{}\n".format(delim.join(row)))

        # check that an error is raised here if a non-existent field is specified
        with self.assertRaises(ValueError):
            data.TabularDataset(
                path=self.test_has_header_dataset_path,
                format=format_,
                fields={"non_existent": ("label", data.Field())})

        dataset = data.TabularDataset(
            path=self.test_has_header_dataset_path,
            format=format_, skip_header=False, fields=fields)
        TEXT.build_vocab(dataset)

        for idx, example in enumerate(dataset):
            self.assertEqual(example.text,
                             rows[idx + 1][0].lower().split())
            self.assertEqual(example.label, rows[idx + 1][1])

        # check that the vocabulary is built correctly (#225)
        expected_freqs = {"hello": 1, "world": 2, "goodbye": 1, "text": 0}
        for token, freq in expected_freqs.items():
            self.assertEqual(TEXT.vocab.freqs[token], freq)

        data_iter = data.Iterator(dataset, batch_size=1,
                                  sort_within_batch=False, repeat=False)
        next(data_iter.__iter__())
def load_dataset(config, train_pos='train.hh', train_neg='train.fb',
                 dev_pos='dev.hh', dev_neg='dev.fb',
                 test_pos='test.hh', test_neg='test.fb'):
    """Load positive/negative style-transfer splits and wrap them in iterators.

    Returns:
        (train_iters, dev_iters, test_iters, vocab) — one DatasetIterator per
        split (pairing the pos and neg iterators) and the shared text vocab,
        which is built from the two training files only.
    """
    logger = logging.getLogger(__name__)
    root = config.data_path
    TEXT = data.Field(batch_first=True, eos_token='<eos>')

    def make_dataset(name):
        # One tab-separated 'text' column per line.
        return data.TabularDataset(path=root + name, format='tsv',
                                   fields=[('text', TEXT)])

    train_pos_set, train_neg_set = map(make_dataset, [train_pos, train_neg])
    dev_pos_set, dev_neg_set = map(make_dataset, [dev_pos, dev_neg])
    test_pos_set, test_neg_set = map(make_dataset, [test_pos, test_neg])

    TEXT.build_vocab(train_pos_set, train_neg_set, min_freq=config.min_freq)

    if config.load_pretrained_embed:
        start = time.time()
        vectors = torchtext.vocab.GloVe('6B', dim=config.embed_size,
                                        cache=config.pretrained_embed_path)
        TEXT.vocab.set_vectors(vectors.stoi, vectors.vectors, vectors.dim)
        # Fix: the original passed the size as a positional logging argument
        # with no format placeholder ("logger.info('vectors', size)"), which
        # triggers a formatting error inside logging. Use lazy %-style args.
        logger.info('vectors %s', TEXT.vocab.vectors.size())
        logger.info('load embedding took %.2f s.', time.time() - start)

    vocab = TEXT.vocab

    def make_iter(dataset, train):
        # Training iterators shuffle and repeat; eval iterators do neither.
        return data.BucketIterator(
            dataset=dataset, batch_size=config.batch_size,
            shuffle=train, repeat=train,
            sort_key=lambda x: len(x.text),
            sort_within_batch=False, device=config.device)

    train_pos_iter, train_neg_iter = (make_iter(d, True)
                                      for d in [train_pos_set, train_neg_set])
    dev_pos_iter, dev_neg_iter = (make_iter(d, False)
                                  for d in [dev_pos_set, dev_neg_set])
    test_pos_iter, test_neg_iter = (make_iter(d, False)
                                    for d in [test_pos_set, test_neg_set])

    train_iters = DatasetIterator(train_pos_iter, train_neg_iter)
    dev_iters = DatasetIterator(dev_pos_iter, dev_neg_iter)
    test_iters = DatasetIterator(test_pos_iter, test_neg_iter)
    return train_iters, dev_iters, test_iters, vocab
def test_tabular_simple_data(self):
    """Every supported format (csv/tsv/json) yields the same parsed examples."""
    expected_examples = [
        (["When", "do", "you", "use", "シ", "instead", "of", "し?"],
         ["When", "do", "you", "use", "\"&\"", "instead", "of", "\"and\"?"],
         "0"),
        (["Where", "was", "Lincoln", "born?"],
         ["Which", "location", "was", "Abraham", "Lincoln", "born?"],
         "1"),
        (["What", "is", "2+2"], ["2+2=?"], "1"),
    ]
    for data_format in ["csv", "tsv", "json"]:
        self.write_test_ppid_dataset(data_format=data_format)
        question_field = data.Field(sequential=True)
        label_field = data.Field(sequential=False)
        if data_format == "json":
            # JSON maps source keys to (attribute, field) pairs.
            fields = {
                "question1": ("q1", question_field),
                "question2": ("q2", question_field),
                "label": ("label", label_field),
            }
        else:
            # Delimited formats take a positional (attribute, field) list.
            fields = [("id", None), ("q1", question_field),
                      ("q2", question_field), ("label", label_field)]
        dataset = data.TabularDataset(path=self.test_ppid_dataset_path,
                                      format=data_format, fields=fields)
        assert len(dataset) == 3

        # Ensure examples have correct contents / test __getitem__
        for i in range(len(dataset)):
            want_q1, want_q2, want_label = expected_examples[i]
            self.assertEqual(dataset[i].q1, want_q1)
            self.assertEqual(dataset[i].q2, want_q2)
            self.assertEqual(dataset[i].label, want_label)

        # Test __getattr__
        for i, (q1, q2, label) in enumerate(
                zip(dataset.q1, dataset.q2, dataset.label)):
            self.assertEqual(q1, expected_examples[i][0])
            self.assertEqual(q2, expected_examples[i][1])
            self.assertEqual(label, expected_examples[i][2])

        # Test __iter__
        for i, example in enumerate(dataset):
            self.assertEqual(example.q1, expected_examples[i][0])
            self.assertEqual(example.q2, expected_examples[i][1])
            self.assertEqual(example.label, expected_examples[i][2])
def create_dataset(opt, SRC, TRG):
    """Build the training iterator from opt.src_data/opt.trg_data.

    Writes a temporary CSV, loads it as a TabularDataset, optionally builds
    the SRC/TRG vocabularies (and pickles them when checkpointing), and
    returns the training iterator. Sets opt.src_pad/trg_pad/train_len.
    """
    print("creating dataset and iterator... ")
    raw_data = {
        'src': [line for line in opt.src_data],
        'trg': [line for line in opt.trg_data]
    }
    df = pd.DataFrame(raw_data, columns=["src", "trg"])
    # Drop pairs where either side exceeds max_strlen (counted in spaces).
    mask = ((df['src'].str.count(' ') < opt.max_strlen)
            & (df['trg'].str.count(' ') < opt.max_strlen))
    df = df.loc[mask]

    df.to_csv("translate_transformer_temp.csv", index=False)
    data_fields = [('src', SRC), ('trg', TRG)]
    train = data.TabularDataset('./translate_transformer_temp.csv',
                                format='csv', fields=data_fields)
    train_iter = MyIterator(train, batch_size=opt.batchsize,
                            device=opt.device, repeat=False,
                            sort_key=lambda x: (len(x.src), len(x.trg)),
                            batch_size_fn=batch_size_fn, train=True,
                            shuffle=True)
    os.remove('translate_transformer_temp.csv')

    if opt.load_weights is None:
        SRC.build_vocab(train)
        TRG.build_vocab(train)
        if opt.checkpoint > 0:
            try:
                os.mkdir("weights")
            except FileExistsError:
                # Fix: was a bare `except:` which also swallowed permission
                # errors etc.; only an already-existing folder should trigger
                # this message-and-quit path.
                print("weights folder already exists, run program with "
                      "-load_weights weights to load them")
                quit()
            # Fix: use context managers so the pickle files are closed
            # (the originals leaked open file handles).
            with open('weights/SRC.pkl', 'wb') as src_file:
                pickle.dump(SRC, src_file)
            with open('weights/TRG.pkl', 'wb') as trg_file:
                pickle.dump(TRG, trg_file)

    opt.src_pad = SRC.vocab.stoi['<pad>']
    opt.trg_pad = TRG.vocab.stoi['<pad>']
    opt.train_len = get_len(train_iter)
    return train_iter
def test_numerical_features_no_vocab(self):
    """Numeric fields with use_vocab=False convert raw strings directly."""
    self.write_test_numerical_features_dataset()
    int_strings = ["1", "0", "1", "3", "19"]
    float_strings = ["1.1", "0.1", "3.91", "0.2", "10.2"]

    # Test basic usage
    int_field = data.Field(sequential=False, use_vocab=False)
    float_field = data.Field(sequential=False, use_vocab=False,
                             dtype=torch.float)
    field_list = [("int", int_field), ("float", float_field),
                  ("string", None)]
    dataset = data.TabularDataset(
        path=self.test_numerical_features_dataset_path,
        format="tsv", fields=field_list)
    int_field.build_vocab(dataset)
    float_field.build_vocab(dataset)

    self.assertEqual(int_field.numericalize(int_strings).data,
                     [1, 0, 1, 3, 19])
    self.assertEqual(float_field.numericalize(float_strings).data,
                     [1.1, 0.1, 3.91, 0.2, 10.2])

    # Test with postprocessing applied
    int_field = data.Field(sequential=False, use_vocab=False,
                           postprocessing=lambda arr, _: [x + 1
                                                          for x in arr])
    float_field = data.Field(sequential=False, use_vocab=False,
                             dtype=torch.float,
                             postprocessing=lambda arr, _: [x * 0.5
                                                            for x in arr])
    field_list = [("int", int_field), ("float", float_field),
                  ("string", None)]
    dataset = data.TabularDataset(
        path=self.test_numerical_features_dataset_path,
        format="tsv", fields=field_list)
    int_field.build_vocab(dataset)
    float_field.build_vocab(dataset)

    self.assertEqual(int_field.numericalize(int_strings).data,
                     [2, 1, 2, 4, 20])
    self.assertEqual(float_field.numericalize(float_strings).data,
                     [0.55, 0.05, 1.955, 0.1, 5.1])
def test_batch_with_missing_field(self):
    # smoke test to see if batches with missing attributes are shown properly
    with open(self.test_missing_field_dataset_path, "wt") as f:
        f.write("text,label\n1,0")
    dataset = data.TabularDataset(
        path=self.test_missing_field_dataset_path,
        format="csv", skip_header=True,
        fields=[("text", data.Field(use_vocab=False, sequential=False)),
                ("label", None)])
    iterator = data.Iterator(dataset, batch_size=64)
    # Rendering the batch must not raise even though 'label' has no field.
    str(next(iterator.__iter__()))
def test_errors(self):
    # Ensure that trying to retrieve a key not in JSON data errors
    self.write_test_ppid_dataset(data_format="json")
    q_field = data.Field(sequential=True)
    l_field = data.Field(sequential=False)
    # NOTE: "qeustion1" is deliberately misspelled — it must not match any
    # key in the JSON payload so that the ValueError fires.
    fields = {
        "qeustion1": ("q1", q_field),
        "question2": ("q2", q_field),
        "label": ("label", l_field),
    }
    with self.assertRaises(ValueError):
        data.TabularDataset(path=self.test_ppid_dataset_path,
                            format="json", fields=fields)
def vocabulary_generator():
    """Build the tweet vocabulary from the training CSV and pickle the fields."""
    tweet = data.Field(sequential=True, tokenize='spacy',
                       tokenizer_language='en_core_web_sm',
                       include_lengths=True)
    target = data.Field(sequential=False, use_vocab=False)
    fields = {'Tweets': ('t', tweet), 'Target': ('s', target)}
    train_data = data.TabularDataset(path="./clean_train_csv.csv",
                                     format="csv", fields=fields)
    tweet.build_vocab(train_data,
                      max_size=10000,
                      vectors="glove.6B.100d",
                      unk_init=torch.Tensor.normal_,
                      min_freq=1)
    # Persist both fields with dill so the built vocab can be reloaded later.
    for path, field in (("./TEXT.Field", tweet), ("./TEST.Field", target)):
        with open(path, "wb") as f:
            dill.dump(field, f)
def test_intent():
    """Evaluate the saved TextCnn snapshot on the test CSV."""
    config = tm.Config()
    text_field = data.Field(lower=True, tokenize=tokenize)
    label_field = data.Field(sequential=False)
    fields = [('text', text_field), ('label', label_field)]

    train_dataset, val_dataset = data.TabularDataset.splits(
        path='./', format='csv', skip_header=True,
        train=train_data_path, test=val_data_path, fields=fields)
    vectors = Vectors(name="./model/word2vec")
    # Vocabularies cover both train and validation splits.
    text_field.build_vocab(train_dataset, val_dataset,
                           min_freq=1, vectors=vectors)
    label_field.build_vocab(train_dataset, val_dataset)

    test_dataset = data.TabularDataset(path=test_data_path, format='csv',
                                       fields=fields, skip_header=True)
    test_iter = data.Iterator(test_dataset, batch_size=config.batch_size,
                              sort_key=lambda x: len(x.text))

    print('Loading model from {}...'.format(config.snapshot))
    embed_num = len(text_field.vocab)
    class_num = len(label_field.vocab) - 1  # exclude the <unk> slot
    kernel_sizes = [int(k) for k in config.kernel_sizes.split(',')]
    config.snapshot = './model/snapshot/best_steps_200.pt'
    cnn = tm.TextCnn(embed_num, config.embed_dim, class_num,
                     config.kernel_num, kernel_sizes, config.dropout)
    cnn.load_state_dict(tm.torch.load(config.snapshot))
    summary_predict(cnn, text_field, label_field)
def test_errors(self):
    # Passing plain data (rather than a (data, lengths) tuple) to
    # numericalize when Field.include_lengths = True must raise ValueError.
    with self.assertRaises(ValueError):
        self.write_test_ppid_dataset(data_format="tsv")
        q_field = data.Field(sequential=True, include_lengths=True)
        fields = [("id", None), ("q1", q_field),
                  ("q2", q_field), ("label", None)]
        dataset = data.TabularDataset(
            path=self.test_ppid_dataset_path, format="tsv", fields=fields)
        q_field.build_vocab(dataset)
        batch = [
            ["When", "do", "you", "use", "シ", "instead", "of", "し?"],
            ["What", "is", "2+2", "<pad>", "<pad>",
             "<pad>", "<pad>", "<pad>"],
            ["Here", "is", "a", "sentence", "with",
             "some", "oovs", "<pad>"],
        ]
        q_field.numericalize(batch)
def test_input_with_newlines_in_text(self):
    # Smoke test for ensuring that TabularDataset works with files with newlines
    rows = [("\"hello \n world\"", "1"),
            ("\"there is a \n newline\"", "0"),
            ("\"there is no newline\"", "1")]
    fields = [("text", data.Field(lower=True)),
              ("label", data.Field(sequential=False))]
    for delim in [",", "\t"]:
        with open(self.test_newline_dataset_path, "wt") as f:
            for row in rows:
                f.write("{}\n".format(delim.join(row)))
        format_ = "csv" if delim == "," else "tsv"
        dataset = data.TabularDataset(path=self.test_newline_dataset_path,
                                      format=format_, fields=fields)
        # if the newline is not parsed correctly, this should raise an error
        for example in dataset:
            self.assertTrue(hasattr(example, "text"))
            self.assertTrue(hasattr(example, "label"))
def test_vocab_size(self):
    """LabelField builds its vocab without the <unk>/<pad> specials."""
    # Set up fields
    q_field = data.Field(sequential=True)
    label_field = data.LabelField()

    # Copied from test_build_vocab with minor changes
    # Write TSV dataset and construct a Dataset
    self.write_test_ppid_dataset(data_format="tsv")
    fields = [("id", None), ("q1", q_field),
              ("q2", q_field), ("label", label_field)]
    dataset = data.TabularDataset(
        path=self.test_ppid_dataset_path, format="tsv", fields=fields)

    # Skipping json dataset as we can rely on the original build vocab test
    label_field.build_vocab(dataset)
    assert label_field.vocab.freqs == Counter({'1': 2, '0': 1})
    expected_stoi = {'1': 0, '0': 1}  # No <unk>
    assert dict(label_field.vocab.stoi) == expected_stoi
    # itos is stoi inverted, ordered by index.
    expected_itos = [token for token, _ in
                     sorted(expected_stoi.items(), key=lambda kv: kv[1])]
    assert label_field.vocab.itos == expected_itos
def generate_best_model(df_train, df_valid):
    """Train a CNN text classifier on df_train, early-save on df_valid loss.

    Writes train.tsv/valid.tsv from the given frames, trains for 10 epochs,
    checkpoints the best model to 'model.pt', and returns
    (model, TEXT.vocab, LABEL.vocab.itos).
    NOTE(review): relies on module-level `fields`, `TEXT`, `LABEL`,
    `vectors`, `device`, `CNN`, and `tqdm` — confirm they are defined
    elsewhere in this file.
    """
    # Dump the training frame to TSV with the columns the fields expect.
    tsv_train = pd.DataFrame()
    tsv_train['text'] = df_train['text']
    tsv_train['label'] = df_train['class']
    tsv_train['metadata'] = df_train['metadata']
    tsv_train.to_csv('train.tsv', sep='\t', index=False)
    # Same for the validation frame.
    tsv_valid = pd.DataFrame()
    tsv_valid['text'] = df_valid['text']
    tsv_valid['label'] = df_valid['class']
    tsv_valid['metadata'] = df_valid['metadata']
    tsv_valid.to_csv('valid.tsv', sep='\t', index=False)

    # Fixed seed + deterministic cuDNN for reproducible runs.
    SEED = 1234
    torch.manual_seed(SEED)
    torch.backends.cudnn.deterministic = True

    train_data = data.TabularDataset(path='train.tsv', format='tsv',
                                     fields=fields, skip_header=True)
    valid_data = data.TabularDataset(path='valid.tsv', format='tsv',
                                     fields=fields, skip_header=True)
    MAX_VOCAB_SIZE = 25_000
    # Vocab is built on the training split only.
    TEXT.build_vocab(train_data, max_size=MAX_VOCAB_SIZE, vectors=vectors,
                     unk_init=torch.Tensor.normal_)
    LABEL.build_vocab(train_data)
    BATCH_SIZE = 32
    train_iterator, valid_iterator = data.BucketIterator.splits(
        (train_data, valid_data), batch_size=BATCH_SIZE, device=device,
        sort=False)

    # Model hyperparameters.
    INPUT_DIM = len(TEXT.vocab)
    EMBEDDING_DIM = 100
    N_FILTERS = 100
    FILTER_SIZES = [2, 3, 4]
    OUTPUT_DIM = len(LABEL.vocab)
    DROPOUT = 0.5
    PAD_IDX = TEXT.vocab.stoi[TEXT.pad_token]
    model = CNN(INPUT_DIM, EMBEDDING_DIM, N_FILTERS, FILTER_SIZES,
                OUTPUT_DIM, DROPOUT)
    # Zero the <unk> and <pad> embedding rows so they carry no signal.
    UNK_IDX = TEXT.vocab.stoi[TEXT.unk_token]
    model.embedding.weight.data[UNK_IDX] = torch.zeros(EMBEDDING_DIM)
    model.embedding.weight.data[PAD_IDX] = torch.zeros(EMBEDDING_DIM)
    optimizer = optim.Adam(model.parameters())
    criterion = nn.CrossEntropyLoss()
    model = model.to(device)
    criterion = criterion.to(device)

    def categorical_accuracy(preds, y):
        # Fraction of rows whose argmax class matches the label.
        top_pred = preds.argmax(1, keepdim=True)
        correct = top_pred.eq(y.view_as(top_pred)).sum()
        acc = correct.float() / y.shape[0]
        return acc

    def train(model, iterator, optimizer, criterion):
        # One optimization pass over the iterator; returns mean loss/acc.
        epoch_loss = 0
        epoch_acc = 0
        model.train()
        for batch in tqdm(iterator):
            optimizer.zero_grad()
            predictions = model(batch.text, batch.metadata)
            loss = criterion(predictions, batch.label)
            acc = categorical_accuracy(predictions, batch.label)
            loss.backward()
            optimizer.step()
            epoch_loss += loss.item()
            epoch_acc += acc.item()
        return epoch_loss / len(iterator), epoch_acc / len(iterator)

    def evaluate(model, iterator, criterion):
        # Gradient-free pass; returns mean loss/acc over the iterator.
        epoch_loss = 0
        epoch_acc = 0
        model.eval()
        with torch.no_grad():
            for batch in iterator:
                predictions = model(batch.text, batch.metadata)
                loss = criterion(predictions, batch.label)
                acc = categorical_accuracy(predictions, batch.label)
                epoch_loss += loss.item()
                epoch_acc += acc.item()
        return epoch_loss / len(iterator), epoch_acc / len(iterator)

    def epoch_time(start_time, end_time):
        # Split an elapsed span into whole minutes and seconds for display.
        elapsed_time = end_time - start_time
        elapsed_mins = int(elapsed_time / 60)
        elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
        return elapsed_mins, elapsed_secs

    N_EPOCHS = 10
    best_valid_loss = float('inf')
    for epoch in range(N_EPOCHS):
        start_time = time.time()
        train_loss, train_acc = train(model, train_iterator, optimizer,
                                      criterion)
        valid_loss, valid_acc = evaluate(model, valid_iterator, criterion)
        end_time = time.time()
        epoch_mins, epoch_secs = epoch_time(start_time, end_time)
        # Checkpoint whenever validation loss improves.
        if valid_loss < best_valid_loss:
            best_valid_loss = valid_loss
            torch.save(model.state_dict(), 'model.pt')
        print(
            f'Epoch: {epoch + 1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
        print(
            f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc * 100:.2f}%'
        )
        print(
            f'\t Val. Loss: {valid_loss:.3f} | Val. Acc: {valid_acc * 100:.2f}%'
        )
    # Reload the best checkpoint before returning.
    model.load_state_dict(torch.load('model.pt'))
    return model, TEXT.vocab, LABEL.vocab.itos
def __init__(
    self,
    root,
    train_path,
    test_path,
    predict_path,
    batch_size=64,
    valid_ratio=.2,
    max_vocab=999999,
    min_freq=1,
    use_eos=False,
    shuffle=True,
    rm = re.compile('[:;\'\"\[\]\(\)\.,@]')  # special characters to strip
):
    """Load train/valid/test/predict TSV/CSV splits into BucketIterators.

    Builds the id/text/label Fields, reads the three files under `root`,
    and exposes train/valid/test/predict loaders plus built vocabularies.
    NOTE(review): the `rm` regex parameter is not used in this method —
    presumably consumed elsewhere; confirm.
    """
    super().__init__()
    # Preprocessing is handled here.
    # Define the data Fields.
    self.id = data.Field(  # column not used for training
        sequential=False,
        use_vocab=False,
        unk_token=None
    )
    self.text = data.Field(
        use_vocab=True,
        tokenize=word_tokenize,
        batch_first=True,
        include_lengths=False,
        eos_token='<EOS>' if use_eos else None
    )
    self.label = data.Field(
        sequential=False,  # 0 or 1
        use_vocab=False,
        unk_token=None,
        is_target=True
    )

    # Read the data.
    # ratings_train.txt : train + valid
    train, valid = data.TabularDataset(
        path = root + train_path,
        format ='tsv',
        fields = [
            ('id', self.id),
            ('text', self.text),
            ('label', self.label)],
        skip_header=True
    ).split(split_ratio=(1 - valid_ratio))
    # ratings_test.txt : test
    test = data.TabularDataset(
        path = root + test_path,
        format='tsv',
        fields=[
            ('id', self.id),
            ('text', self.text),
            ('label', self.label)],
        skip_header=True
    )
    # ko_data.csv : Kaggle submission
    predict = data.TabularDataset(
        path = root + predict_path,
        format='csv',
        fields=[
            ('id', self.id),
            ('text', self.text)],
        skip_header=True
    )

    # Batchify (put the splits onto data loaders).
    # train + valid loader
    self.train_loader, self.valid_loader = data.BucketIterator.splits(
        (train, valid),
        batch_size=batch_size,
        device=torch.device('cuda' if torch.cuda.is_available() else 'cpu'),
        shuffle=shuffle,
        sort_key=lambda x: len(x.text),  # sort by length, then split into batches
        sort_within_batch=True,  # sort within each mini-batch
    )
    # test_loader
    self.test_loader = data.BucketIterator(
        test,
        batch_size=batch_size,
        device=torch.device('cuda' if torch.cuda.is_available() else 'cpu'),
        shuffle=False,
        sort_key=lambda x: len(x.text),
        sort_within_batch=False,
    )
    # predict_loader
    self.predict_loader = data.BucketIterator(
        predict,
        batch_size=batch_size,
        device=torch.device('cuda' if torch.cuda.is_available() else 'cpu'),
        shuffle=False
    )
    # Build the vocabulary sets from the training split only.
    self.label.build_vocab(train)
    self.text.build_vocab(train, max_size=max_vocab, min_freq=min_freq)
# Per-problem label files for the validation split (testP) — presumably one
# file per sub-problem p1..p3; confirm against the writer of these files.
fpLabelP1Valid = fopOutputML + 'testP.label.p1.txt'
fpLabelP2Valid = fopOutputML + 'testP.label.p2.txt'
fpLabelP3Valid = fopOutputML + 'testP.label.p3.txt'
# fpTest = fopRoot + 'test.csv'
# fpTextTest = fopRoot + 'testW.text.txt'
# Per-problem label files for the test split (testW).
fpLabelP1Test = fopOutputML + 'testW.label.p1.txt'
fpLabelP2Test = fopOutputML + 'testW.label.p2.txt'
fpLabelP3Test = fopOutputML + 'testW.label.p3.txt'
# Redirect all subsequent prints into the result-details file.
# NOTE(review): this file handle is never closed/restored here — confirm
# intentional for the lifetime of the script.
sys.stdout = open(fpResultDetails, 'w')
TEXT = data.Field(tokenize='spacy', batch_first=True, include_lengths=True)
LABEL = data.LabelField(dtype=torch.long, batch_first=True, use_vocab=True)
fields = [('label', LABEL), ('text', TEXT)]
# loading custom dataset p1
train_data = data.TabularDataset(path=fpLabelP1Train, format='csv',
                                 fields=fields, skip_header=True)
valid_data = data.TabularDataset(path=fpLabelP1Valid, format='csv',
                                 fields=fields, skip_header=True)
test_data = data.TabularDataset(path=fpLabelP1Test, format='csv',
                                fields=fields, skip_header=True)
acc_p1 = trainAndEval(train_data, valid_data, test_data)
# Fresh Fields for p2 so its vocabulary is built independently of p1.
TEXT = data.Field(tokenize='spacy', batch_first=True, include_lengths=True)
LABEL = data.LabelField(dtype=torch.long, batch_first=True, use_vocab=True)
fields = [('label', LABEL), ('text', TEXT)]
# loading custom dataset p2
def main():
    """Train the student network on train.json and report validation metrics.

    Reads everything it needs from the module-level `student` object and
    `device`; saves the trained weights to savedModel.pth.
    """
    print("Using device: {}"
          "\n".format(str(device)))

    # Load the training dataset, and create a dataloader to generate a batch.
    textField = data.Field(lower=True,
                           include_lengths=True,
                           batch_first=True,
                           tokenize=student.tokenise,
                           preprocessing=student.preprocessing,
                           postprocessing=student.postprocessing,
                           stop_words=student.stopWords)
    labelField = data.Field(sequential=False, use_vocab=False, is_target=True)

    # JSON keys -> example attributes; both targets share labelField.
    dataset = data.TabularDataset(
        'train.json', 'json', {
            'reviewText': ('reviewText', textField),
            'rating': ('rating', labelField),
            'businessCategory': ('businessCategory', labelField)
        })
    textField.build_vocab(dataset, vectors=student.wordVectors)

    # Allow training on the entire dataset, or split it for training and validation.
    if student.trainValSplit == 1:
        trainLoader = data.BucketIterator(dataset,
                                          shuffle=True,
                                          batch_size=student.batchSize,
                                          sort_key=lambda x: len(x.reviewText),
                                          sort_within_batch=True)
    else:
        train, validate = dataset.split(split_ratio=student.trainValSplit)
        trainLoader, valLoader = data.BucketIterator.splits(
            (train, validate),
            shuffle=True,
            batch_size=student.batchSize,
            sort_key=lambda x: len(x.reviewText),
            sort_within_batch=True)

    # Get model and optimiser from student.
    net = student.net.to(device)
    lossFunc = student.lossFunc
    optimiser = student.optimiser

    # Train.
    for epoch in range(student.epochs):
        runningLoss = 0
        for i, batch in enumerate(trainLoader):
            # Get a batch and potentially send it to GPU memory.
            inputs = textField.vocab.vectors[batch.reviewText[0]].to(device)
            length = batch.reviewText[1].to(device)
            rating = batch.rating.to(device)
            businessCategory = batch.businessCategory.to(device)

            # PyTorch calculates gradients by accumulating contributions to them
            # (useful for RNNs). Hence we must manually set them to zero before
            # calculating them.
            optimiser.zero_grad()

            # Forward pass through the network.
            ratingOutput, categoryOutput = net(inputs, length)
            loss = lossFunc(ratingOutput, categoryOutput, rating,
                            businessCategory)

            # Calculate gradients.
            loss.backward()

            # Minimise the loss according to the gradient.
            optimiser.step()

            runningLoss += loss.item()
            # Report the mean loss every 32 batches.
            if i % 32 == 31:
                print("Epoch: %2d, Batch: %4d, Loss: %.3f" %
                      (epoch + 1, i + 1, runningLoss / 32))
                runningLoss = 0

    # Save model.
    torch.save(net.state_dict(), 'savedModel.pth')
    print("\n"
          "Model saved to savedModel.pth")

    # Test on validation data if it exists.
    if student.trainValSplit != 1:
        net.eval()
        correctRatingOnlySum = 0
        correctCategoryOnlySum = 0
        bothCorrectSum = 0
        with torch.no_grad():
            for batch in valLoader:
                # Get a batch and potentially send it to GPU memory.
                inputs = textField.vocab.vectors[batch.reviewText[0]].to(
                    device)
                length = batch.reviewText[1].to(device)
                rating = batch.rating.to(device)
                businessCategory = batch.businessCategory.to(device)

                # Convert network output to integer values.
                ratingOutputs, categoryOutputs = student.convertNetOutput(
                    *net(inputs, length))

                # Calculate performance
                correctRating = rating == ratingOutputs.flatten()
                correctCategory = businessCategory == categoryOutputs.flatten()

                correctRatingOnlySum += torch.sum(correctRating
                                                  & ~correctCategory).item()
                correctCategoryOnlySum += torch.sum(correctCategory
                                                    & ~correctRating).item()
                bothCorrectSum += torch.sum(correctRating
                                            & correctCategory).item()

        correctRatingOnlyPercent = correctRatingOnlySum / len(validate)
        correctCategoryOnlyPercent = correctCategoryOnlySum / len(validate)
        bothCorrectPercent = bothCorrectSum / len(validate)
        neitherCorrectPer = 1 - correctRatingOnlyPercent \
            - correctCategoryOnlyPercent \
            - bothCorrectPercent

        # Weighted score: category-only is worth half, rating-only a tenth.
        score = 100 * (bothCorrectPercent + 0.5 * correctCategoryOnlyPercent
                       + 0.1 * correctRatingOnlyPercent)

        print("\n"
              "Rating incorrect, business category incorrect: {:.2%}\n"
              "Rating correct, business category incorrect: {:.2%}\n"
              "Rating incorrect, business category correct: {:.2%}\n"
              "Rating correct, business category correct: {:.2%}\n"
              "\n"
              "Weighted score: {:.2f}".format(neitherCorrectPer,
                                              correctRatingOnlyPercent,
                                              correctCategoryOnlyPercent,
                                              bothCorrectPercent, score))
def caption_iterator(cfg, batch_size, phase):
    """Build the caption vocab (from the training table) and a BucketIterator
    over the table for the requested phase."""
    print(f'Contructing caption_iterator for "{phase}" phase')
    spacy_en = spacy.load('en')

    def tokenize_en(txt):
        return [token.text for token in spacy_en.tokenizer(txt)]

    CAPTION = data.ReversibleField(tokenize='spacy',
                                   init_token=cfg.start_token,
                                   eos_token=cfg.end_token,
                                   pad_token=cfg.pad_token,
                                   lower=True,
                                   batch_first=True,
                                   is_target=True)
    INDEX = data.Field(sequential=False, use_vocab=False, batch_first=True)

    # the order has to be the same as in the table
    fields = [
        ('video_id', None),
        ('caption', CAPTION),
        ('start', None),
        ('end', None),
        ('duration', None),
        ('phase', None),
        ('idx', INDEX),
    ]

    dataset = data.TabularDataset(
        path=cfg.train_meta_path,
        format='tsv',
        skip_header=True,
        fields=fields,
    )
    # The vocabulary is always built from the training captions, regardless
    # of which phase's table is iterated below.
    CAPTION.build_vocab(dataset.caption, min_freq=cfg.min_freq_caps,
                        vectors=cfg.word_emb_caps)
    train_vocab = CAPTION.vocab

    # For evaluation phases, swap in that phase's table.
    phase_paths = {
        'val_1': cfg.val_1_meta_path,
        'val_2': cfg.val_2_meta_path,
        'learned_props': cfg.val_prop_meta_path,
    }
    if phase in phase_paths:
        dataset = data.TabularDataset(path=phase_paths[phase], format='tsv',
                                      skip_header=True, fields=fields)

    # sort_key is a constant, so bucketing degenerates to plain shuffling.
    datasetloader = data.BucketIterator(dataset, batch_size,
                                        sort_key=lambda x: 0,
                                        device=torch.device(cfg.device),
                                        repeat=False, shuffle=True)
    return train_vocab, datasetloader
def test_build_vocab(self):
    """build_vocab across two datasets: merged frequency counts, stoi/itos
    ordering (specials first, then descending frequency, ties broken by
    lexicographic string order), and max_size/min_freq kwargs forwarded to
    Vocab."""
    # Set up fields
    question_field = data.Field(sequential=True)
    label_field = data.Field(sequential=False)

    # Write TSV dataset and construct a Dataset
    self.write_test_ppid_dataset(data_format="tsv")
    tsv_fields = [("id", None), ("q1", question_field),
                  ("q2", question_field), ("label", label_field)]
    tsv_dataset = data.TabularDataset(
        path=self.test_ppid_dataset_path, format="tsv",
        fields=tsv_fields)

    # Write JSON dataset and construct a Dataset
    self.write_test_ppid_dataset(data_format="json")
    json_fields = {"question1": ("q1", question_field),
                   "question2": ("q2", question_field),
                   "label": ("label", label_field)}
    json_dataset = data.TabularDataset(
        path=self.test_ppid_dataset_path, format="json",
        fields=json_fields)

    # Test build_vocab default
    # Both datasets contribute counts; the extra '<space>' special is
    # inserted right after the default '<unk>' and '<pad>'.
    question_field.build_vocab(tsv_dataset, json_dataset,
                               specials=['<space>'])
    assert question_field.vocab.freqs == Counter(
        {'When': 4, 'do': 4, 'you': 4, 'use': 4, 'instead': 4,
         'of': 4, 'was': 4, 'Lincoln': 4, 'born?': 4, 'シ': 2,
         'し?': 2, 'Where': 2, 'What': 2, 'is': 2, '2+2': 2,
         '"&"': 2, '"and"?': 2, 'Which': 2, 'location': 2,
         'Abraham': 2, '2+2=?': 2})
    expected_stoi = {'<unk>': 0, '<pad>': 1, '<space>': 2,
                     'Lincoln': 3, 'When': 4, 'born?': 5, 'do': 6,
                     'instead': 7, 'of': 8, 'use': 9, 'was': 10,
                     'you': 11, '"&"': 12, '"and"?': 13, '2+2': 14,
                     '2+2=?': 15, 'Abraham': 16, 'What': 17,
                     'Where': 18, 'Which': 19, 'is': 20,
                     'location': 21, 'し?': 22, 'シ': 23}
    assert dict(question_field.vocab.stoi) == expected_stoi
    # Turn the stoi dictionary into an itos list
    expected_itos = [x[0] for x in sorted(expected_stoi.items(),
                                          key=lambda tup: tup[1])]
    assert question_field.vocab.itos == expected_itos

    # Non-sequential label field: only '<unk>' is prepended.
    label_field.build_vocab(tsv_dataset, json_dataset)
    assert label_field.vocab.freqs == Counter({'1': 4, '0': 2})
    expected_stoi = {'1': 1, '0': 2, '<unk>': 0}
    assert dict(label_field.vocab.stoi) == expected_stoi
    # Turn the stoi dictionary into an itos list
    expected_itos = [x[0] for x in sorted(expected_stoi.items(),
                                          key=lambda tup: tup[1])]
    assert label_field.vocab.itos == expected_itos

    # Test build_vocab default
    # Rebuilding without the extra special shifts every index down by one.
    question_field.build_vocab(tsv_dataset, json_dataset)
    assert question_field.vocab.freqs == Counter(
        {'When': 4, 'do': 4, 'you': 4, 'use': 4, 'instead': 4,
         'of': 4, 'was': 4, 'Lincoln': 4, 'born?': 4, 'シ': 2,
         'し?': 2, 'Where': 2, 'What': 2, 'is': 2, '2+2': 2,
         '"&"': 2, '"and"?': 2, 'Which': 2, 'location': 2,
         'Abraham': 2, '2+2=?': 2})
    expected_stoi = {'<unk>': 0, '<pad>': 1, 'Lincoln': 2, 'When': 3,
                     'born?': 4, 'do': 5, 'instead': 6, 'of': 7,
                     'use': 8, 'was': 9, 'you': 10, '"&"': 11,
                     '"and"?': 12, '2+2': 13, '2+2=?': 14,
                     'Abraham': 15, 'What': 16, 'Where': 17,
                     'Which': 18, 'is': 19, 'location': 20,
                     'し?': 21, 'シ': 22}
    assert dict(question_field.vocab.stoi) == expected_stoi
    # Turn the stoi dictionary into an itos list
    expected_itos = [x[0] for x in sorted(expected_stoi.items(),
                                          key=lambda tup: tup[1])]
    assert question_field.vocab.itos == expected_itos

    label_field.build_vocab(tsv_dataset, json_dataset)
    assert label_field.vocab.freqs == Counter({'1': 4, '0': 2})
    expected_stoi = {'1': 1, '0': 2, '<unk>': 0}
    assert dict(label_field.vocab.stoi) == expected_stoi
    # Turn the stoi dictionary into an itos list
    expected_itos = [x[0] for x in sorted(expected_stoi.items(),
                                          key=lambda tup: tup[1])]
    assert label_field.vocab.itos == expected_itos

    # Test build_vocab with extra kwargs passed to Vocab
    # max_size=8 caps the vocab at 8 tokens beyond the two default
    # specials (10 entries total); min_freq=3 drops every count-2 token,
    # while freqs still records all observed counts.
    question_field.build_vocab(tsv_dataset, json_dataset, max_size=8,
                               min_freq=3)
    assert question_field.vocab.freqs == Counter(
        {'When': 4, 'do': 4, 'you': 4, 'use': 4, 'instead': 4,
         'of': 4, 'was': 4, 'Lincoln': 4, 'born?': 4, 'シ': 2,
         'し?': 2, 'Where': 2, 'What': 2, 'is': 2, '2+2': 2,
         '"&"': 2, '"and"?': 2, 'Which': 2, 'location': 2,
         'Abraham': 2, '2+2=?': 2})
    expected_stoi = {'<unk>': 0, '<pad>': 1, 'Lincoln': 2, 'When': 3,
                     'born?': 4, 'do': 5, 'instead': 6, 'of': 7,
                     'use': 8, 'was': 9}
    assert dict(question_field.vocab.stoi) == expected_stoi
    # Turn the stoi dictionary into an itos list
    expected_itos = [x[0] for x in sorted(expected_stoi.items(),
                                          key=lambda tup: tup[1])]
    assert question_field.vocab.itos == expected_itos
def test_stratified_dataset_split(self):
    """Stratified Dataset.split: per-class sizes for even and uneven
    strata, explicit strata_field, and rejection of an unknown field."""
    example_count, class_count = 30, 3
    self.write_test_splitting_dataset(num_examples=example_count,
                                      num_labels=class_count)
    text_field = data.Field()
    label_field = data.LabelField()
    fields = [('text', text_field), ('label', label_field)]
    dataset = data.TabularDataset(path=self.test_dataset_splitting_path,
                                  format="csv", fields=fields)

    def check_sizes(splits, expected_sizes):
        # Each returned split must have the expected number of examples.
        for split, size in zip(splits, expected_sizes):
            assert len(split) == size

    # Default split ratio (70/30).
    check_sizes(dataset.split(stratified=True), (21, 9))

    # An explicit list ratio with the same proportions behaves identically.
    ratio = [0.7, 0.3]
    check_sizes(dataset.split(split_ratio=ratio, stratified=True), (21, 9))

    # Explicit strata_field argument.
    check_sizes(dataset.split(split_ratio=ratio, stratified=True,
                              strata_field='label'), (21, 9))

    # An unknown strata field must raise.
    with pytest.raises(ValueError):
        dataset.split(split_ratio=ratio, stratified=True,
                      strata_field='dummy')

    # Uneven strata: 28 examples over 3 classes
    # (10 examples for class 1, 9 each for classes 2 and 3).
    example_count, class_count = 28, 3
    self.write_test_splitting_dataset(num_examples=example_count,
                                      num_labels=class_count)
    dataset = data.TabularDataset(path=self.test_dataset_splitting_path,
                                  format="csv", fields=fields)
    check_sizes(dataset.split(split_ratio=ratio, stratified=True),
                (7 + 6 + 6, 3 + 3 + 3))
    ratio = [0.7, 0.3]
    check_sizes(dataset.split(split_ratio=ratio, stratified=True),
                (7 + 6 + 6, 3 + 3 + 3))

    # Three-way split including a validation set.
    ratio = [0.6, 0.3, 0.1]
    check_sizes(dataset.split(split_ratio=ratio, stratified=True),
                (6 + 5 + 5, 1 + 1 + 1, 3 + 3 + 3))
def test_dataset_split_arguments(self):
    """Dataset.split: ratio forms (default float, list, unnormalized),
    collapse of a vanishing third split, and malformed-ratio errors."""
    self.write_test_splitting_dataset(num_examples=30, num_labels=3)
    text_field = data.Field()
    label_field = data.LabelField()
    fields = [('text', text_field), ('label', label_field)]
    dataset = data.TabularDataset(path=self.test_dataset_splitting_path,
                                  format="csv", fields=fields)

    # Default ratio is 0.7 -> 21 train / 9 test.
    train, test = dataset.split()
    assert len(train) == 21
    assert len(test) == 9

    # A two-element list with the same proportions behaves identically.
    train, test = dataset.split(split_ratio=[0.7, 0.3])
    assert len(train) == 21
    assert len(test) == 9

    # A three-element ratio adds a validation split.
    train, valid, test = dataset.split(split_ratio=[0.6, 0.3, 0.1])
    assert len(train) == 18
    assert len(valid) == 3
    assert len(test) == 9

    # Ratios that do not sum to one are normalized.
    train, valid, test = dataset.split(split_ratio=[6, 3, 1])
    assert len(train) == 18
    assert len(valid) == 3
    assert len(test) == 9

    # A vanishing validation share collapses to two splits.
    assert len(dataset.split(split_ratio=[0.66, 0.33, 0.01])) == 2

    # Malformed numeric/list ratios are rejected with AssertionError.
    for bad_ratio in (1.1, -1., [0.7], [1, 2, 3, 4]):
        with pytest.raises(AssertionError):
            dataset.split(split_ratio=bad_ratio)
    # A non-numeric ratio is rejected with ValueError.
    with pytest.raises(ValueError):
        dataset.split(split_ratio="string")
# use_vocab: 是否使用Vocab,否则Field的对象是数字类型的 # pad_token: 用于填充文本的关键字 # unk_token: 用于填充不在词汇表中的关键字 TEXT = data.Field(sequential=True, tokenize=customize_tokensize, include_lengths=True, use_vocab=True, batch_first=True, fix_length=200) LABEL = data.Field(sequential=False, use_vocab=False, pad_token=None, unk_token=None) train_fileds = [("text", TEXT), ("label", LABEL)] train_data = data.TabularDataset(path=r"./imdb_data.csv", format='csv', skip_header=True, fields=train_fileds) train_data_real, val_data_real = train_data.split(split_ratio=0.7) vec = Vectors("glove.6B.100d.txt", "./Emotion") # 将训练集转换为词向量 TEXT.build_vocab(train_data_real, max_size=20000, vectors=vec) LABEL.build_vocab(train_data_real) # print(TEXT.vocab.freqs.most_common(n=10)) # print("类别标签情况: ", LABEL.vocab.freqs) # print("词典个数: ", len(TEXT.vocab.itos)) # 定义加载器 train_iter = data.BucketIterator(train_data_real, batch_size=BATCH_SIZE) val_iter = data.BucketIterator(val_data_real, batch_size=BATCH_SIZE) INPUT_DIM = len(TEXT.vocab) # 词典数量
from torchtext.legacy import data
from tqdm import tqdm
import torch
import torch.nn as nn
from sklearn.metrics import accuracy_score, confusion_matrix

device = "cpu"

# dataset: CSV with a "body" text column and a "label" column;
# posts are tokenized with spaCy's English pipeline and lowercased.
LABEL = data.LabelField()
POST = data.Field(tokenize="spacy", lower=True,
                  tokenizer_language="en_core_web_sm")
fields = [("body", POST), ("label", LABEL)]
dataset = data.TabularDataset(path="pytorch_data.csv", format="CSV",
                              fields=fields)
# 80/20 train/test split.
train, test = dataset.split(split_ratio=[0.8, 0.2])

# vocabulary: built on the training split only, capped at 10k tokens.
POST.build_vocab(train, max_size=10000)  # , vectors = 'glove.6B.200d')
LABEL.build_vocab(train)  # fixes `"LabelField" has no attribute "vocab"`

# data loaders
train_iterator, test_iterator = data.BucketIterator.splits(
    (train, test),
    batch_size=32,
    device=device,
    sort_key=lambda x: x.body,  # fixes weird error
    sort_within_batch=True,  # fixes weird error
)