def get_iterator(self, dataset):
    # Legacy torchtext convention: device=-1 places batches on the CPU.
    if self.cuda:
        iterator = data.BPTTIterator(dataset, sort_key=None,
                                     bptt_len=self.seq_len,
                                     batch_size=self.batch_size)
    else:
        iterator = data.BPTTIterator(dataset, sort_key=None,
                                     bptt_len=self.seq_len,
                                     batch_size=self.batch_size,
                                     device=-1)
    iterator.repeat = False
    print("Created Iterator with {num} batches".format(num=len(iterator)))
    return iterator
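# A minimal consumption sketch for the iterator above (the `lm` and
# `train_dataset` names are illustrative, not from the original code).
# BPTTIterator yields batches whose `target` is `text` shifted one step
# forward, which is exactly the (input, label) pair a language model needs.
iterator = lm.get_iterator(train_dataset)
for batch in iterator:
    inputs, targets = batch.text, batch.target  # each [bptt_len, batch_size]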
def get_test_iter(self, file_path: str, batch_size: int) -> BatchIterator:
    """
    Get a test data iterator from a test data file.

    Args:
        file_path (str): Path to the test data file.
        batch_size (int): Batch size.

    Returns:
        BatchIterator: An instance of BatchIterator to iterate over the
            supplied test data file.
    """
    test_data = self.gen_dataset_from_path(file_path)
    return BatchIterator(
        textdata.BPTTIterator(
            test_data,
            batch_size=batch_size,
            bptt_len=self.bptt_len,
            # fixed typo: torch.cuda.current_device()
            device="cuda:{}".format(torch.cuda.current_device())
            if cuda_utils.CUDA_ENABLED else "cpu",
            sort=True,
            repeat=False,
            train=False,
            sort_key=self.sort_key,
        ),
        self._postprocess_batch,
    )
def _get_train_iter(
    self,
    train_dataset: textdata.Dataset,
    batch_size: int,
    rank: int = 0,
    world_size: int = 1,
) -> BatchIterator:
    dataset_shard, max_num_examples = self._get_dataset_shard(
        train_dataset, rank, world_size)
    # Compute the per-worker batch size
    assert batch_size >= world_size, \
        "batch size needs to be >= the distributed world size"
    batch_size = batch_size // world_size
    return BatchIterator(
        textdata.BPTTIterator(
            dataset_shard,
            batch_size=batch_size,
            bptt_len=self.bptt_len,
            # fixed typo: torch.cuda.current_device()
            device="cuda:{}".format(torch.cuda.current_device())
            if cuda_utils.CUDA_ENABLED else "cpu",
            sort_within_batch=True,
            repeat=False,
            sort_key=self.sort_key,
        ),
        self._postprocess_batch,
        num_batches=math.ceil(max_num_examples / float(batch_size)),
    )
def __init__(self, root_dir, batch_size=32, length=100):
    self.root_dir = root_dir
    self.field = data.Field(sequential=True, lower=False)
    all_datasets = datasets.PennTreebank.splits(text_field=self.field,
                                                root=self.root_dir)
    self.train, self.valid, self.test = all_datasets
    self.train_iter = data.BPTTIterator(dataset=self.train,
                                        batch_size=batch_size,
                                        bptt_len=length)
    # Numericalization happens lazily at iteration time, so building the
    # vocabulary after constructing the iterator is safe.
    self.field.build_vocab(self.train)
def _make_iter(cls, dataset, batch_size, bptt_len):
    if dataset:
        _iter = data.BPTTIterator(
            dataset,
            batch_size=batch_size,
            bptt_len=bptt_len,  # this is where we specify the sequence length
            repeat=False,
            shuffle=True,
        )
    else:
        _iter = []
    return _iter
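# Hedged usage sketch for `_make_iter` (the class name `LMLoader` and the
# `TEXT` field below are assumptions for illustration; any torchtext
# LanguageModelingDataset works):
TEXT = data.Field(lower=True)
train_data, _, _ = datasets.WikiText2.splits(TEXT)
TEXT.build_vocab(train_data)
train_iter = LMLoader._make_iter(train_data, batch_size=32, bptt_len=35)
for batch in train_iter:
    x, y = batch.text, batch.target  # y is x shifted one step forward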
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--data', default='data/ptb_char')
    parser.add_argument('--model', required=True)
    parser.add_argument('--config', required=True)
    parser.add_argument('--gpu', default=-1, type=int)
    args = parser.parse_args()

    with open(args.config, 'r') as f:
        # safe_load avoids PyYAML's unsafe default loader
        config = yaml.safe_load(f)
    pprint(config)

    text_field = PTBCharTextField()
    train_dataset, test_dataset = PTBChar.splits(path=args.data,
                                                 validation=None,
                                                 text_field=text_field)
    text_field.build_vocab(train_dataset)
    test_loader = data.BPTTIterator(dataset=test_dataset, batch_size=1,
                                    bptt_len=2000, train=False,
                                    device=args.gpu)

    model = PTBModel(num_chars=len(text_field.vocab), **config['model'])
    model.load_state_dict(torch.load(args.model))
    print(model)
    num_params = sum(p.numel() for p in model.parameters())
    print(f'Total parameters: {num_params}')
    if args.gpu > -1:
        model.cuda(args.gpu)
    model.eval()

    state = hyper_state = None
    test_bpc_sum = test_bpc_denom = 0
    for test_batch in tqdm(test_loader):
        test_inputs = test_batch.text
        test_targets = test_batch.target
        test_logits, state, hyper_state = model(inputs=test_inputs,
                                                state=state,
                                                hyper_state=hyper_state)
        test_loss = sequence_cross_entropy(logits=test_logits,
                                           targets=test_targets)
        # `.item()` replaces the deprecated `.data[0]` indexing
        test_bpc_sum += (test_loss.item() / np.log(2)) * test_inputs.size(0)
        test_bpc_denom += test_inputs.size(0)
    test_bpc = test_bpc_sum / test_bpc_denom
    print(f'Test BPC = {test_bpc:.6f}')
def init_dataloaders(self):
    print('Initializing dataloaders')
    project_path = self.config.firelab.project_path
    data_path_train = os.path.join(project_path, self.config.data.train)
    data_path_val = os.path.join(project_path, self.config.data.val)

    data_train = open(data_path_train).read().splitlines()
    data_val = open(data_path_val).read().splitlines()[:self.config.val_set_size]

    self.eos = '|'
    field = Field(eos_token=self.eos, batch_first=True,
                  tokenize=char_tokenize)
    train_examples = [
        Example.fromlist([self.eos.join(data_train)], [('text', field)])
    ]
    val_examples = [
        Example.fromlist([s], [('text', field)]) for s in data_val
    ]
    self.train_ds = Dataset(train_examples, [('text', field)])
    self.val_ds = Dataset(val_examples, [('text', field)])

    field.build_vocab(self.train_ds)
    self.vocab = field.vocab

    self.train_dataloader = data.BPTTIterator(self.train_ds,
                                              self.config.hp.batch_size,
                                              self.config.hp.batch_len,
                                              repeat=False)
    self.val_dataloader = data.BucketIterator(self.val_ds, 1,
                                              shuffle=False, repeat=False)
    print('Dataloaders initialized!')
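# `char_tokenize` is used above but not defined in this excerpt; a minimal
# sketch consistent with character-level modelling would be:
def char_tokenize(s):
    # split a string into a list of single characters
    return list(s)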
def __init__(self, config, lm_config, device):
    # define all fields
    TEXT = data.ReversibleField(sequential=True, tokenize=self.tokenizer,
                                lower=False, include_lengths=False)
    POS = data.ReversibleField(sequential=True, lower=False,
                               include_lengths=True)
    NER = data.ReversibleField(sequential=True, lower=False,
                               include_lengths=True)
    LABEL = data.Field(sequential=False, use_vocab=False)
    IN_Q = data.Field(sequential=True, use_vocab=False,
                      include_lengths=True, postprocessing=self.to_numeric)
    IN_C = data.Field(sequential=True, use_vocab=False,
                      include_lengths=True, postprocessing=self.to_numeric)
    LEMMA_IN_Q = data.Field(sequential=True, use_vocab=False,
                            include_lengths=True,
                            postprocessing=self.to_numeric)
    LEMMA_IN_C = data.Field(sequential=True, use_vocab=False,
                            include_lengths=True,
                            postprocessing=self.to_numeric)
    TF = data.Field(sequential=True, use_vocab=False, include_lengths=True,
                    postprocessing=self.to_numeric)
    REL = data.ReversibleField(sequential=True, lower=False,
                               include_lengths=True)

    # load LM data first
    lm_train = datasets.LanguageModelingDataset(
        os.path.join(lm_config.file_path, lm_config.train_f), TEXT,
        newline_eos=False)
    lm_dev = datasets.LanguageModelingDataset(
        os.path.join(lm_config.file_path, lm_config.dev_f), TEXT,
        newline_eos=False)

    # load actual data
    # we have keys: 'id', 'd_words', 'd_pos', 'd_ner', 'q_words', 'q_pos',
    # 'c_words', 'label', 'in_q', 'in_c', 'lemma_in_q', 'tf',
    # 'p_q_relation', 'p_c_relation'
    train, val, test = data.TabularDataset.splits(
        path=config.data_dir, train=config.train_fname,
        validation=config.dev_fname, test=config.test_fname,
        format='json',
        fields={'d_words': ('d_words', TEXT),
                'd_pos': ('d_pos', POS),
                'd_ner': ('d_ner', NER),
                'q_words': ('q_words', TEXT),
                'q_pos': ('q_pos', POS),
                'c_words': ('c_words', TEXT),
                'label': ('label', LABEL),
                'in_q': ('in_q', IN_Q),
                'in_c': ('in_c', IN_C),
                'lemma_in_q': ('lemma_in_q', LEMMA_IN_Q),
                'lemma_in_c': ('lemma_in_c', LEMMA_IN_C),
                'tf': ('tf', TF),
                'p_q_relation': ('p_q_relation', REL),
                'p_c_relation': ('p_c_relation', REL)})
    print('train: %d, val: %d, test: %d' % (len(train), len(val), len(test)))

    # construct vocabulary
    TEXT.build_vocab(train, val, test, lm_train, lm_dev,
                     vectors=config.vectors)
    POS.build_vocab(train, val, test)
    NER.build_vocab(train, val, test)
    REL.build_vocab(train, val, test)
    print('vocab size: %d' % len(TEXT.vocab))
    print('pos size: %d' % len(POS.vocab))
    print('ner size: %d' % len(NER.vocab))
    print('rel size: %d' % len(REL.vocab))
    self.TEXT = TEXT

    # iterators
    self.lm_train_iter = data.BPTTIterator(lm_train,
                                           batch_size=lm_config.batch_size,
                                           bptt_len=lm_config.bptt_len,
                                           repeat=False)
    self.lm_dev_iter = data.BPTTIterator(lm_dev,
                                         batch_size=lm_config.batch_size,
                                         bptt_len=lm_config.bptt_len,
                                         repeat=False)
    print('lm train batch num: %d, lm dev batch num: %d'
          % (len(self.lm_train_iter), len(self.lm_dev_iter)))
    self.train_iter = data.BucketIterator(dataset=train,
                                          batch_size=config.batch_size_train,
                                          sort_key=lambda x: len(x.d_words),
                                          device=device, shuffle=True,
                                          sort_within_batch=False,
                                          repeat=False)
    self.val_iter = data.Iterator(dataset=val,
                                  batch_size=config.batch_size_eval,
                                  sort_key=lambda x: len(x.d_words),
                                  train=False, shuffle=False,
                                  sort_within_batch=False, device=device,
                                  repeat=False)
    self.test_iter = data.Iterator(dataset=test,
                                   batch_size=config.batch_size_test,
                                   sort_key=lambda x: len(x.d_words),
                                   train=False, shuffle=False,
                                   sort_within_batch=False, device=device,
                                   repeat=False)
    print('train batch num: %d, dev batch num: %d'
          % (len(self.train_iter), len(self.val_iter)))

    # Create embeddings
    embedding = nn.Embedding(len(TEXT.vocab), config.embed_dim)
    embedding.weight.data.copy_(TEXT.vocab.vectors)
    embedding.weight.requires_grad = False  # freeze the pretrained vectors
    self.embedding = embedding.to(device)

    embedding_pos = nn.Embedding(len(POS.vocab), config.embed_dim_pos)
    embedding_pos.weight.data.normal_(0, 0.1)
    self.embedding_pos = embedding_pos.to(device)

    embedding_ner = nn.Embedding(len(NER.vocab), config.embed_dim_ner)
    embedding_ner.weight.data.normal_(0, 0.1)
    self.embedding_ner = embedding_ner.to(device)

    embedding_rel = nn.Embedding(len(REL.vocab), config.embed_dim_rel)
    embedding_rel.weight.data.normal_(0, 0.1)
    self.embedding_rel = embedding_rel.to(device)

    print('embedding', self.embedding)
    print('embedding_pos', self.embedding_pos)
    print('embedding_ner', self.embedding_ner)
    print('embedding_rel', self.embedding_rel)

    self.vocab_size = len(TEXT.vocab)
    print('vocab_size is', self.vocab_size)
def train(flags):
    my_tok = spacy.load('en')

    def spacy_tok(x):
        return [tok.text for tok in my_tok.tokenizer(x)]

    TEXT = data.Field(lower=True, tokenize=spacy_tok)
    dataset = torchtext.datasets.LanguageModelingDataset(flags.train_file,
                                                         TEXT)
    # Train on the reversed token stream.
    dataset[0].text = dataset[0].text[::-1]

    if flags.custom_embeddings:
        custom_embeddings = torchtext.vocab.Vectors(
            name=os.path.abspath(flags.custom_embeddings))
        TEXT.build_vocab(dataset, vectors=custom_embeddings)
    else:
        TEXT.build_vocab(dataset, vectors="glove.6B.300d")
    weight_matrix = TEXT.vocab.vectors
    vocab = TEXT.vocab

    os.makedirs(flags.save_dir, exist_ok=True)
    with open(os.path.join(flags.save_dir, 'vocab.pkl'), 'wb') as vocab_file:
        pickle.dump(vocab, vocab_file)

    # Pick the device first so the iterator and the model agree on placement
    # (the original hard-coded "cuda:0" here, which fails on CPU-only hosts).
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    train_iter = data.BPTTIterator(
        dataset,
        batch_size=flags.batch_size,
        bptt_len=flags.seq_size,  # this is where we specify the sequence length
        device=device,
        repeat=False)

    n_vocab, emb_size = weight_matrix.shape
    net = LSTMModel(n_vocab, emb_size, flags.lstm_size, flags.lstm_layers)
    net.embedding.weight.data.copy_(weight_matrix)
    net.set_vocab(vocab)
    net = net.to(device)

    criterion, optimizer = get_loss_and_train_op(net, flags.learning_rate)
    iteration = 0

    for e in range(flags.n_epoch):
        state_h, state_c = net.zero_state(flags.batch_size)
        # Transfer the recurrent state to the training device
        state_h = state_h.to(device)
        state_c = state_c.to(device)

        for batch in train_iter:
            x, y = batch.text, batch.target
            iteration += 1
            # Tell it we are in training mode
            net.train()
            # Reset all gradients
            optimizer.zero_grad()
            # The iterator already numericalizes on `device`; `.to` is a
            # no-op safeguard (re-wrapping an existing tensor with
            # torch.tensor() is deprecated).
            x = x.to(device)
            y = y.to(device)

            logits, (state_h, state_c) = net(x, (state_h, state_c))
            loss = criterion(logits.transpose(1, 2), y)

            # Detach the state so gradients stop at the BPTT window boundary
            state_h = state_h.detach()
            state_c = state_c.detach()

            loss_value = loss.item()
            # Perform back-propagation
            loss.backward()
            _ = torch.nn.utils.clip_grad_norm_(net.parameters(),
                                               flags.gradients_norm)
            # Update the network's parameters
            optimizer.step()

            if iteration % 100 == 0:
                print('Epoch: {}/{}'.format(e, flags.n_epoch),
                      'Iteration: {}'.format(iteration),
                      'Loss: {}'.format(loss_value))
            if iteration % 1000 == 0:
                predict(device, net, ['the end'], vocab, top_k=2)
                torch.save(net.state_dict(),
                           os.path.join(flags.save_dir, 'model-last.pth'))
TEXT = data.Field(eos_token=EOS_TOKEN, init_token=BOS_TOKEN,
                  unk_token=UNK_TOKEN, batch_first=False)
train_data = datasets.LanguageModelingDataset(
    path=os.path.join(DATA_BASE_PATH, DATA_DIR, DATA_TRAIN_FILE_NAME),
    text_field=TEXT)
valid_data = datasets.LanguageModelingDataset(
    path=os.path.join(DATA_BASE_PATH, DATA_DIR, DATA_VALID_FILE_NAME),
    text_field=TEXT)
test_data = datasets.LanguageModelingDataset(
    path=os.path.join(DATA_BASE_PATH, DATA_DIR, DATA_TEST_FILE_NAME),
    text_field=TEXT)
TEXT.build_vocab(train_data)

train_iter = data.BPTTIterator(dataset=train_data, batch_size=BATCH_SIZE,
                               bptt_len=BPTT_LEN, device=device)
valid_iter = data.BPTTIterator(dataset=valid_data, batch_size=BATCH_SIZE,
                               bptt_len=BPTT_LEN, device=device)
test_iter = data.BPTTIterator(dataset=test_data, batch_size=BATCH_SIZE,
                              bptt_len=BPTT_LEN, device=device)

# build model
MODEL_SAVE_BASE_PATH = '/home/ubuntu/likun/nlp-practice/language_model'
MODEL_NAME = "PTB-RNN-KERNEL.pt"
MODEL_SAVE_PATH = os.path.join(MODEL_SAVE_BASE_PATH, 'save_models',
                               MODEL_NAME)
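# A hedged evaluation sketch for the iterators above (`model` and
# `criterion` are assumptions here, since the original constructs its model
# after this point; also assumes `import math`). Perplexity is the exp of
# the mean cross-entropy over all BPTT batches.
def evaluate(model, iterator, criterion):
    model.eval()
    total_loss = 0.0
    with torch.no_grad():
        for batch in iterator:
            output, _ = model(batch.text)  # assumed (logits, state) signature
            total_loss += criterion(output.view(-1, output.size(-1)),
                                    batch.target.view(-1)).item()
    return math.exp(total_loss / len(iterator))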
train_ptb, valid_ptb, test_ptb = datasets.PennTreebank.splits(
    TEXT, root="treebank.data")
print("PTB datasets constructed.")
TEXT.build_vocab(train_ptb, valid_ptb, test_ptb, train_tweets,
                 valid_tweets)  # 9,733 types with only PTB, 27,780 in total
print("Vocabulary built.")

# create model
model = Mikolov(len(TEXT.vocab))
# last_checkpoint = torch.load("saved/treebank_for_20_epochs.pt")
# model.load_state_dict(last_checkpoint['model_state_dict'])

# create iterators for training
# here we train on the Twitter dataset, but training on PTB is equivalent
train_iter = data.BPTTIterator(train_tweets, batch_size=1, bptt_len=64)
valid_iter = data.BPTTIterator(valid_tweets, batch_size=1, bptt_len=64)

epochs = 0
valid_losses = []

# train indefinitely, validating after every epoch
while True:
    print("Training epoch #" + str(epochs + 1) + " starts.")
    bptt_trainer(model, train_iter)
    epochs += 1
    print("Epoch #" + str(epochs) + " completed.")
    valid_loss = validator(model, valid_iter)
    print("Averaged loss on validation set: " + str(valid_loss))
    valid_losses.append(valid_loss)
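# The loop above never terminates on its own; a hedged early-stopping helper
# built on the collected `valid_losses` (the patience value is an assumption):
def should_stop(valid_losses, patience=3):
    # Stop once the best loss has not improved within the last `patience` epochs.
    if len(valid_losses) <= patience:
        return False
    return min(valid_losses[-patience:]) >= min(valid_losses[:-patience])

# inside the while-loop, after appending valid_loss:
#     if should_stop(valid_losses):
#         break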
# Fields are added by column, left to right, in the underlying table
fields = [('name', NAMES), ('label', LABELS), ('text', TEXT)]
train, dev, test = data.TabularDataset.splits(
    path='.', format='CSV', fields=fields,
    train='train.csv', validation='dev.csv', test='test.csv')

TEXT.build_vocab(train)
# TEXT.vocab.itos[1] ... '<pad>'
# TEXT.vocab.itos[0] ... '<unk>'
LABELS.build_vocab(train)

a = next(iter(data.BPTTIterator(train, 20, 20)))

# BPTTIterator.splits expects the datasets themselves,
# not lists of examples extracted from them
train_iter, dev_iter, test_iter = data.BPTTIterator.splits(
    (train, dev, test),
    bptt_len=13, batch_size=7,
    sort_key=lambda x: len(x.text), device='cpu')

# https://mlexplained.com/2018/02/15/language-modeling-tutorial-in-torchtext-practical-torchtext-part-2/
from torchtext.datasets import WikiText2
train, valid, test = WikiText2.splits(TEXT)

# loading custom data
# Other Stuff
TEST_TEXT = data.Field(lower=True, tokenize=spacy_tok)
trainSet, valid, test = datasets.WikiText2.splits(TEST_TEXT)
# raw string: "\T" and "\." in the path would otherwise be mis-read as escapes
myTestSet = datasets.LanguageModelingDataset(
    path=r"F:\Πτυχιακη\Lab4A\.data\wikitext-2\wikitext-2\TestMine.tokens",
    text_field=TEST_TEXT)

# The vocabulary is constructed from the dataset, so we load the large one
# for more variety.
TEST_TEXT.build_vocab(trainSet, vectors="glove.6B.200d")
myWeight_matrix = TEST_TEXT.vocab.vectors

# Create an iterator over the input
myTestIter = data.BPTTIterator(dataset=myTestSet, batch_size=batchSize,
                               bptt_len=1, device=torch.device("cuda:0"),
                               repeat=False)

# Load model
# model = RNNModel(myWeight_matrix.size(0), myWeight_matrix.size()[1], 200, 1, 1)

# Model for special case in tokenizer
model = RNN_GRUModel(28869, myWeight_matrix.size()[1], 201, 1, batchSize)

# Model for no special case in tokenizer
# model = RNNModel(28870, myWeight_matrix.size()[1], 200, 1, bsz=1)
# model.encoder.weight.data.copy_(myWeight_matrix)  # Comment if no model exists
from nntoolbox.metrics import *

MAX_VOCAB_SIZE = 25000
BATCH_SIZE = 16

TEXT = data.Field(tokenize='spacy')
LABEL = data.LabelField(dtype=torch.float)

# train_iterator, val_iterator, test_iterator = WikiText2.iters()
# for tmp in train_iterator:
#     print(tmp)

train_data, val_data, test_data = WikiText2.splits(TEXT)
train_iterator = data.BPTTIterator(train_data, batch_size=BATCH_SIZE,
                                   sort_within_batch=True,
                                   device=get_device(), bptt_len=35,
                                   shuffle=True)
val_iterator = data.BPTTIterator(val_data, batch_size=BATCH_SIZE,
                                 sort_within_batch=True,
                                 device=get_device(), bptt_len=35,
                                 shuffle=True)
TEXT.build_vocab(train_data, max_size=MAX_VOCAB_SIZE,
                 vectors="glove.6B.100d")

embedding = AdditiveContextEmbedding(num_embeddings=len(TEXT.vocab),
                                     embedding_dim=100)
load_embedding(embedding, TEXT.vocab.vectors)
def main(args):
    device = torch.device('cuda' if args.gpu else 'cpu')

    if args.re_training is None:
        TEXT = data.Field(lower=True, init_token='<bos>', eos_token='<eos>')
    else:
        basedir, _ = os.path.split(args.re_training)
        path = os.path.join(basedir, 'text.field')
        TEXT = utils.load_field(path)

    fields = [('text', TEXT)] if args.task in monolingual_tasks \
        else [('src', TEXT), ('tgt', TEXT)]

    slen_filter = lambda x: args.src_minlen <= len(x.src) <= args.src_maxlen \
        and args.tgt_minlen <= len(x.tgt) <= args.tgt_maxlen

    # load training data
    if args.task == 'translation':
        train_data = data.TabularDataset(
            path=args.train,
            format='tsv',
            fields=fields,
            filter_pred=slen_filter,
        )
    else:  # `causal`, `masked`
        train_data = datasets.LanguageModelingDataset(
            path=args.train, text_field=TEXT, newline_eos=True)

    # set Vocabulary object
    if args.re_training is None:
        TEXT.build_vocab(
            train_data,
            min_freq=args.min_freq,
            specials=['<sep>', '<mask>'],
        )
        if args.embed_path:
            vectors = utils.load_vector(args.embed_path)
            TEXT.vocab.load_vectors(vectors)

    if not os.path.exists(args.savedir):
        os.mkdir(args.savedir)

    # save a field object
    with open(os.path.join(args.savedir, 'text.field'), 'wb') as fout:
        dill.dump(TEXT, fout)
    utils.save_vocab(args.savedir, TEXT)

    # set training iterator
    if args.task == 'translation':
        train_iter = data.BucketIterator(
            train_data,
            batch_size=args.batch_size,
            sort_within_batch=True,
            sort_key=lambda x: len(x.src),
            repeat=False,
        )
    else:  # `causal`, `masked`
        train_iter = data.BPTTIterator(
            train_data,
            batch_size=args.batch_size,
            bptt_len=args.bptt_len,
            train=True,
            repeat=False,
            shuffle=True,
        )

    print(f'| [text] Dictionary: {len(TEXT.vocab.itos)} types')
    print('')
    print(f'train: {args.train}')
    for name, field in fields:
        n_tokens, n_unk = utils.get_statics(train_iter, name, field)
        print(f'| [{name}] {n_tokens} tokens,', end='')
        print(f' coverage: {100*(n_tokens-n_unk)/n_tokens:.{4}}%')
    print('')

    # build a model
    model_class = get_model(args.task)
    if args.re_training is None:
        epoch = 1
        iteration = 0
        best_loss = math.inf
        model = model_class(TEXT, args).to(device)
    else:
        load_vars = torch.load(args.re_training)
        epoch = load_vars['epoch'] + 1
        iteration = load_vars['iteration']
        best_loss = load_vars['best_loss']
        lm_args, lm_weights = load_vars['args'], load_vars['weights']
        model = model_class(TEXT, lm_args)
        model.load_state_dict(lm_weights)
        model.to(device)

    criterion = nn.CrossEntropyLoss(ignore_index=TEXT.vocab.stoi['<pad>'])
    optimizer_fn = utils.get_optimizer(args.optimizer)
    optimizer = optimizer_fn(model.parameters(), lr=args.lr)
    trainer = Trainer(model, criterion, optimizer, args.clip, iteration)

    # show the details of the model and optimizer
    print('=============== MODEL ===============')
    print(model)
    print('')
    print('=============== OPTIMIZER ===============')
    print(optimizer)
    print('')

    max_epoch = args.max_epoch or math.inf
    max_update = args.max_update or math.inf
    assert not (max_epoch == math.inf and max_update == math.inf), \
        'Please set `--max-epoch` or `--max-update`.'
    while epoch <= max_epoch and trainer.n_updates <= max_update:
        # training
        with tqdm(train_iter, dynamic_ncols=True) as pbar:
            train_loss = 0.0
            trainer.model.train()
            for samples in pbar:
                if args.task in monolingual_tasks:
                    srcs = samples.text.to(device)
                    tgts = None
                    refs = None if args.task == 'masked' \
                        else samples.target.to(device)
                else:
                    srcs = samples.src.to(device)
                    tgts = samples.tgt.to(device)
                    refs = None
                loss = trainer.step(srcs, tgts, refs)
                train_loss += loss.item()

                # progress bar bookkeeping
                pbar.set_description(f'epoch {str(epoch).zfill(3)}')
                progress_state = OrderedDict(
                    task=args.task,
                    loss=loss.item(),
                    ppl=math.exp(loss.item()),
                    bsz=srcs.size(1),
                    lr=trainer.get_lr(),
                    clip=args.clip,
                    num_updates=trainer.n_updates)
                pbar.set_postfix(progress_state)
        train_loss /= len(train_iter)

        print(f'| epoch {str(epoch).zfill(3)} | train ', end='')
        print(f'| loss {train_loss:.{4}} ', end='')
        print(f'| ppl {math.exp(train_loss):.{4}} ', end='')
        print(f'| lr {trainer.get_lr():.1e} ', end='')
        print(f'| clip {args.clip} ', end='')
        print(f'| num_updates {trainer.n_updates} |')

        # saving the model
        save_vars = {
            'epoch': epoch,
            'iteration': trainer.n_updates,
            'best_loss': train_loss if train_loss < best_loss else best_loss,
            'args': args,
            'weights': model.state_dict(),
        }

        if train_loss < best_loss:
            best_loss = train_loss
            filename = os.path.join(args.savedir, 'checkpoint_best.pt')
            torch.save(save_vars, filename)
        if epoch % args.save_epoch == 0:
            filename = os.path.join(args.savedir, f'checkpoint_{epoch}.pt')
            torch.save(save_vars, filename)
        filename = os.path.join(args.savedir, 'checkpoint_last.pt')
        torch.save(save_vars, filename)

        # update
        epoch += 1