def main(args):
    print(args)

    # tokenizer = Tokenizer(tokenize_type='nltk', lowercase=True)
    # tokenizer = Tokenizer(tokenize_type='basic', lowercase=True)
    tokenizer = Tokenizer(tokenize_type='basic', lowercase=True)
    train_corpus = readCorpus(args.train_file, tokenizer)
    # val_corpus = readCorpus(args.val_file, tokenizer)

    # Print the top 10 words in the corpus.
    # IMPORTANT: complete the function within the Tokenizer class in data.py first.
    # print("Top 10 words: %s" % (tokenizer.countTopWords(train_corpus, k=10)))
    # tokenizer.plot(train_corpus)

    # Instantiate the language model.
    lm = LanguageModel(tokenizer.vocab, n=2, smoothing=args.smoothing,
                       smoothing_param=args.smoothing_param)

    # Figure out the index for a specific fraction of the train corpus to use.
    train_idx = int(args.train_fraction * len(train_corpus))
    print("TRAINING FRACTION IS {}".format(args.train_fraction))
    lm.train(train_corpus[:train_idx])
def model(corpus):
    tokenizer = Tokenizer(tokenize_type='basic', lowercase=True)
    train_corpus = readCorpus(corpus, tokenizer)
    lm = LanguageModel(tokenizer.vocab, n=2, smoothing='addAlpha',
                       smoothing_param=0.001)
    train_idx = int(1.0 * len(train_corpus))
    lm.train(train_corpus[:train_idx])
    return tokenizer, lm
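# --- Illustrative sketch (not part of the original code) ---
# LanguageModel is defined elsewhere in this repo; as a rough illustration only,
# a minimal add-alpha (Lidstone) smoothed bigram model consistent with the
# constructor call LanguageModel(vocab, n=2, smoothing='addAlpha',
# smoothing_param=alpha) might look like the class below. The class name,
# corpus format (a list of token lists), and method signatures are assumptions.
from collections import Counter, defaultdict


class AddAlphaBigramLMSketch:
    def __init__(self, vocab, alpha=0.001):
        self.vocab = list(vocab)
        self.alpha = alpha
        self.bigram_counts = defaultdict(Counter)   # bigram_counts[w1][w2]
        self.unigram_counts = Counter()

    def train(self, corpus):
        # corpus: iterable of token lists (one list per sentence/document).
        for tokens in corpus:
            for w1, w2 in zip(tokens, tokens[1:]):
                self.bigram_counts[w1][w2] += 1
                self.unigram_counts[w1] += 1

    def prob(self, w1, w2):
        # P(w2 | w1) with add-alpha smoothing over the full vocabulary.
        num = self.bigram_counts[w1][w2] + self.alpha
        den = self.unigram_counts[w1] + self.alpha * len(self.vocab)
        return num / den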
def main(args): """ Main function for training, evaluating, and checkpointing. Args: args: `argparse` object. """ # Print arguments. print('\nusing arguments:') _print_arguments(args) print() # Check if GPU is available. if not args.use_gpu and torch.cuda.is_available(): print('warning: GPU is available but args.use_gpu = False') print() local_rank = args.local_rank # world_size = torch.cuda.device_count() # assume all local GPUs # Set up distributed process group rank = setup_dist(local_rank) # Set up datasets. train_dataset = QADataset(args, args.train_path) dev_dataset = QADataset(args, args.dev_path) # Create vocabulary and tokenizer. vocabulary = Vocabulary(train_dataset.samples, args.vocab_size) tokenizer = Tokenizer(vocabulary) for dataset in (train_dataset, dev_dataset): dataset.register_tokenizer(tokenizer) args.vocab_size = len(vocabulary) args.pad_token_id = tokenizer.pad_token_id print(f'vocab words = {len(vocabulary)}') # Print number of samples. print(f'train samples = {len(train_dataset)}') print(f'dev samples = {len(dev_dataset)}') print() # Select model. model = _select_model(args) #model = model.to(rank) #model = DDP(model, device_ids=[rank], output_device=rank) num_pretrained = model.load_pretrained_embeddings( vocabulary, args.embedding_path ) pct_pretrained = round(num_pretrained / len(vocabulary) * 100., 2) print(f'using pre-trained embeddings from \'{args.embedding_path}\'') print( f'initialized {num_pretrained}/{len(vocabulary)} ' f'embeddings ({pct_pretrained}%)' ) print() # device = torch.device(f'cuda:{rank}') model = model.to(rank) model = DDP(model, device_ids=[rank], output_device=rank) # if args.use_gpu: # model = cuda(args, model) if args.resume and args.model_path: map_location = {"cuda:0": "cuda:{}".format(rank)} model.load_state_dict(torch.load(args.model_path, map_location=map_location)) params = sum(p.numel() for p in model.parameters() if p.requires_grad) print(f'using model \'{args.model}\' ({params} params)') print(model) print() if args.do_train: # Track training statistics for checkpointing. eval_history = [] best_eval_loss = float('inf') # Begin training. for epoch in range(1, args.epochs + 1): # Perform training and evaluation steps. try: train_loss = train(args, epoch, model, train_dataset) except RuntimeError: print(f'NCCL Wait Timeout, rank: \'{args.local_rank}\' (exit)') exit(1) eval_loss = evaluate(args, epoch, model, dev_dataset) # If the model's evaluation loss yields a global improvement, # checkpoint the model. if rank == 0: eval_history.append(eval_loss < best_eval_loss) if eval_loss < best_eval_loss: best_eval_loss = eval_loss torch.save(model.state_dict(), args.model_path) print( f'epoch = {epoch} | ' f'train loss = {train_loss:.6f} | ' f'eval loss = {eval_loss:.6f} | ' f"{'saving model!' if eval_history[-1] else ''}" ) # If early stopping conditions are met, stop training. if _early_stop(args, eval_history): suffix = 's' if args.early_stop > 1 else '' print( f'no improvement after {args.early_stop} epoch{suffix}. ' 'early stopping...' ) print() cleanup_dist() break if args.do_test and rank == 0: # Write predictions to the output file. Use the printed command # below to obtain official EM/F1 metrics. write_predictions(args, model, dev_dataset) eval_cmd = ( 'python3 evaluate.py ' f'--dataset_path {args.dev_path} ' f'--output_path {args.output_path}' ) print() print(f'predictions written to \'{args.output_path}\'') print(f'compute EM/F1 with: \'{eval_cmd}\'') print()
def main(args): """ Main function for training, evaluating, and checkpointing. Args: args: `argparse` object. """ # Print arguments. print('\nusing arguments:') _print_arguments(args) # Check if GPU is available. if not args.use_gpu and torch.cuda.is_available(): print('warning: GPU is available but args.use_gpu = False') print() # Set up datasets. train_dataset = QADataset(args, args.train_path, is_train=True) dev_dataset = QADataset(args, args.dev_path, is_train=False) print("Start creating vocabulary and tokenizer") # Create vocabulary and tokenizer. vocabulary = Vocabulary( train_dataset.samples + train_dataset.culled_samples, args.vocab_size) tokenizer = Tokenizer(vocabulary) for dataset in (train_dataset, dev_dataset): dataset.register_tokenizer(tokenizer) args.vocab_size = len(vocabulary) args.pad_token_id = tokenizer.pad_token_id print(f'vocab words = {len(vocabulary)}') # Print number of samples. print(f'train samples = {len(train_dataset)}') print(f'dev samples = {len(dev_dataset)}') print() # Select model. model = _select_model(args) num_pretrained = model.load_pretrained_embeddings(vocabulary, args.embedding_path) pct_pretrained = round(num_pretrained / len(vocabulary) * 100., 2) print(f'using pre-trained embeddings from \'{args.embedding_path}\'') print(f'initialized {num_pretrained}/{len(vocabulary)} ' f'embeddings ({pct_pretrained}%)') print() if args.use_gpu: model = cuda(args, model) params = sum(p.numel() for p in model.parameters() if p.requires_grad) print(f'using model \'{args.model}\' ({params} params)') if args.do_train: # Track training statistics for checkpointing. eval_history = [] best_eval_loss = float('inf') # Begin training. for epoch in range(1, args.epochs + 1): # Perform training and evaluation steps. train_loss = train(args, epoch, model, train_dataset) eval_loss = evaluate(args, epoch, model, dev_dataset) # If the model's evaluation loss yields a global improvement, # checkpoint the model. eval_history.append(eval_loss < best_eval_loss) if eval_loss < best_eval_loss: best_eval_loss = eval_loss torch.save(model.state_dict(), args.model_path) print(f'epoch = {epoch} | ' f'train loss = {train_loss:.6f} | ' f'eval loss = {eval_loss:.6f} | ' f"{'saving model!' if eval_history[-1] else ''}") # If early stopping conditions are met, stop training. if _early_stop(args, eval_history): suffix = 's' if args.early_stop > 1 else '' print(f'no improvement after {args.early_stop} epoch{suffix}. ' 'early stopping...') print() break if args.do_test: # Write predictions to the output file. Use the printed command # below to obtain official EM/F1 metrics. write_predictions(args, model, dev_dataset) eval_cmd = ('python3 evaluate.py ' f'--dataset_path {args.dev_path} ' f'--output_path {args.output_path}') print() print(f'predictions written to \'{args.output_path}\'') print(f'compute EM/F1 with: \'{eval_cmd}\'') print()
            # (Fragment from inside the Trainer's batch loop: `b` is the current
            # batch; the enclosing epoch/batch loops are not shown here.)
            # Closure-based optimizer step: torch.optim optimizers accept an
            # optional closure that recomputes the loss and calls backward().
            def closure():
                model.zero_grad()
                loss, acc = model.train_step(b)
                loss.backward()
                return loss

            model.optim.step(closure)
            self.num_steps += 1

        self.epoch_tqdm.close()


if __name__ == "__main__":
    print('prepare bpe tok ...')
    bpe_tok = src_tok = Tokenizer(
        'en',
        ['bpe:/pvc/minNMT/data/wmt14.en-de/bpe.37000.h100000/bpe.37000.share'],
        '/pvc/minNMT/data/wmt14.en-de/bpe.37000.h100000/bpe.37000.share.vocab')
    trg_tok = Tokenizer(
        'de',
        ['bpe:/pvc/minNMT/data/wmt14.en-de/bpe.37000.h100000/bpe.37000.share'],
        '/pvc/minNMT/data/wmt14.en-de/bpe.37000.h100000/bpe.37000.share.vocab')

    print("prepare model ...")
    model = GNMT(src_tok, trg_tok, 512, 0.1).cuda()

    print('setup dataset ...')
    dataset = Dataset(src_tok, trg_tok,
                      '/pvc/minNMT/data/wmt14.en-de/bpe.37000.h100000')

    trainer = Trainer()
    trainer.fit(model, dataset)
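# --- Illustrative sketch (not part of the original code) ---
# Minimal standalone example of the closure-based step pattern used above, with
# a plain nn.Linear model and SGD in place of the repo's GNMT model and
# model.optim (both of which are assumptions of this sketch, not shown here).
import torch
import torch.nn as nn


def closure_step_example():
    model = nn.Linear(4, 1)
    optim = torch.optim.SGD(model.parameters(), lr=0.1)
    x, y = torch.randn(8, 4), torch.randn(8, 1)

    def closure():
        # Re-evaluate the model, compute the loss, and backpropagate.
        optim.zero_grad()
        loss = nn.functional.mse_loss(model(x), y)
        loss.backward()
        return loss

    loss = optim.step(closure)  # step() returns the loss computed by the closure
    return float(loss)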
def main(args): """ Main function for training, evaluating, and checkpointing. Args: args: `argparse` object. """ # Print arguments. print('\nusing arguments:') _print_arguments(args) print() # Check if GPU is available. if not args.use_gpu and torch.cuda.is_available(): print('warning: GPU is available but args.use_gpu = False') print() # Set up datasets. train_dataset = QADataset(args, args.train_path) dev_dataset = QADataset(args, args.dev_path) # Create vocabulary and tokenizer. if args.vocab_path != None: print("loading vocabulary from file at {}".format(args.vocab_path)) vocabulary = Vocabulary(train_dataset.samples, args.vocab_size, load_from_file=True, filepath=args.vocab_path) else: print("constructing the vocab from dataset examples") vocabulary = Vocabulary(train_dataset.samples, args.vocab_size) tokenizer = Tokenizer(vocabulary) for dataset in (train_dataset, dev_dataset): dataset.register_tokenizer(tokenizer) args.vocab_size = len(vocabulary) args.pad_token_id = tokenizer.pad_token_id args.char_vocab_size = vocabulary.numCharacters() print(f'vocab words = {len(vocabulary)}') print(f'num characters = {args.char_vocab_size}') # Print number of samples. num_train_samples = len(train_dataset) print(f'train samples = {len(train_dataset)}') print(f'dev samples = {len(dev_dataset)}') print() # Select model. model = _select_model(args) num_pretrained = model.load_pretrained_embeddings(vocabulary, args.embedding_path) pct_pretrained = round(num_pretrained / len(vocabulary) * 100., 2) print(f'using pre-trained embeddings from \'{args.embedding_path}\'') print(f'initialized {num_pretrained}/{len(vocabulary)} ' f'embeddings ({pct_pretrained}%)') print() if args.use_gpu: model = cuda(args, model) # load the model from previous checkpoint if args.finetune >= 1: print("preparing to load {} as base model".format(args.init_model)) model.load_state_dict(torch.load(args.init_model, map_location='cpu')) params = sum(p.numel() for p in model.parameters() if p.requires_grad) print(f'using model \'{args.model}\' ({params} params)') print(model) print() if args.do_train: # create tensorboard summary writer train_writer = tb.SummaryWriter( log_dir=os.path.join(args.logdir, args.run + "_train")) valid_writer = tb.SummaryWriter( log_dir=os.path.join(args.logdir, args.run + "_valid")) # Track training statistics for checkpointing. eval_history = [] best_eval_loss = float('inf') # Begin training. for epoch in range(1, args.epochs + 1): # Perform training and evaluation steps. train_loss = train(args, epoch, model, train_dataset, train_writer, num_train_samples) eval_loss = evaluate(args, epoch, model, dev_dataset) # write the loss to tensorboard valid_writer.add_scalar("valid_loss", eval_loss, global_step=epoch) # If the model's evaluation loss yields a global improvement, # checkpoint the model. eval_history.append(eval_loss < best_eval_loss) if eval_loss < best_eval_loss: best_eval_loss = eval_loss torch.save(model.state_dict(), args.model_path) print(f'epoch = {epoch} | ' f'train loss = {train_loss:.6f} | ' f'eval loss = {eval_loss:.6f} | ' f"{'saving model!' if eval_history[-1] else ''}") # If early stopping conditions are met, stop training. if _early_stop(args, eval_history): suffix = 's' if args.early_stop > 1 else '' print(f'no improvement after {args.early_stop} epoch{suffix}. ' 'early stopping...') print() break if args.do_test: # Write predictions to the output file. Use the printed command # below to obtain official EM/F1 metrics. 
write_predictions(args, model, dev_dataset) eval_cmd = ('python3 evaluate.py ' f'--dataset_path {args.dev_path} ' f'--output_path {args.output_path}') print() print(f'predictions written to \'{args.output_path}\'') print(f'compute EM/F1 with: \'{eval_cmd}\'') print()
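# --- Illustrative sketch (not part of the original code) ---
# _early_stop() is defined elsewhere in this repo. A helper consistent with how
# it is called in the training loops above (eval_history is a list of per-epoch
# "did eval loss improve?" booleans, args.early_stop is a patience in epochs)
# might look roughly like this; the name is suffixed to mark it as hypothetical
# and the exact semantics are an assumption.
def _early_stop_sketch(args, eval_history):
    """Stop when none of the last `args.early_stop` epochs improved eval loss."""
    k = args.early_stop
    return len(eval_history) >= k and not any(eval_history[-k:])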
def main(args): """ Main function for training, evaluating, and checkpointing. Args: args: `argparse` object. """ # Print arguments. print('\nusing arguments:') _print_arguments(args) print("args type: ", type(args)) print() # Check if GPU is available. if not args.use_gpu and torch.cuda.is_available(): print('warning: GPU is available but args.use_gpu = False') print() if args.bio: print("training on bio dataset") train_dataset = QADataset(args, args.train_path) dev_dataset = QADataset(args, args.dev_path) bio_len = len(train_dataset.elems) # len == 1504 print("bio data size: ", bio_len) random.shuffle(train_dataset.elems) train_dataset.elems = train_dataset.elems[:int(bio_len / 2)] dev_dataset.elems = train_dataset.elems[int(bio_len / 2):] else: # Set up datasets train_dataset = QADataset( args, args.train_path) # len == 18885, vocab_size == 50004 dev_dataset = QADataset( args, args.dev_path) # len == 2067, vocab words == 24987 if args.domain_adaptive: # NewsQA dataset print("domain adaptive training") news_train = QADataset(args, "datasets/newsqa_train.jsonl.gz" ) # len == 11428, vocab words == 24989 news_dev = QADataset( args, "datasets/newsqa_dev.jsonl.gz") # len == 638, vocab words == 18713 bio = QADataset( args, "datasets/bioasq.jsonl.gz") # len == 1504, vocab words == 18715 train_dataset.elems = train_dataset.elems + news_train.elems + news_dev.elems print("total dataset size: ", len(train_dataset.elems)) # Create vocabulary and tokenizer. vocabulary = Vocabulary(train_dataset.samples, args.vocab_size) tokenizer = Tokenizer(vocabulary) for dataset in (train_dataset, dev_dataset): dataset.register_tokenizer(tokenizer) args.vocab_size = len(vocabulary) args.pad_token_id = tokenizer.pad_token_id print(f'vocab words = {len(vocabulary)}') # Print number of samples. print(f'train samples = {len(train_dataset)}') print(f'dev samples = {len(dev_dataset)}') print() # Select model. model = _select_model(args) num_pretrained = model.load_pretrained_embeddings(vocabulary, args.embedding_path) pct_pretrained = round(num_pretrained / len(vocabulary) * 100., 2) print(f'using pre-trained embeddings from \'{args.embedding_path}\'') print(f'initialized {num_pretrained}/{len(vocabulary)} ' f'embeddings ({pct_pretrained}%)') print() if args.use_gpu: model = cuda(args, model) params = sum(p.numel() for p in model.parameters() if p.requires_grad) print(f'using model \'{args.model}\' ({params} params)') print(model) print() if args.do_train: # Track training statistics for checkpointing. eval_history = [] best_eval_loss = float('inf') # Begin training. for epoch in range(1, args.epochs + 1): if args.use_EDA_aug: # randomly shuffle the data 1st print("shuffling dataset") random.shuffle(train_dataset.samples) # Perform augmentation on the training data print("performing augmentation on dataset...") train_dataset_copy = deepcopy(train_dataset) print("prob for char aug is: ", args.char_aug) augmented_train_dataset = EDA(train_dataset_copy, sr_prob=0.33, rd_prob=0.05, rs_prob=0.10, ri_prob=0.10, r_shuffle_prob=0.10, r_backtrans_prob=0.0, char_aug=args.char_aug) else: print("no augmentation") # Perform training and evaluation steps. 
if args.use_EDA_aug: # ADDITION: train for augmented training set, eval on the same old dev set print("training on augmented dataset") # print ("1st context of the augmented dataset looks like: ", augmented_train_dataset.elems[0]['context']) a = random.randint(0, 3) print("random num gen: ", a) print("context ex of augmented data: " + augmented_train_dataset.elems[a]['context']) print("sample ex of augmented data: " + " ".join( [token for token in augmented_train_dataset.samples[a][1]])) assert augmented_train_dataset != train_dataset train_loss = train(args, epoch, model, augmented_train_dataset) else: print("training on normal dataset") train_loss = train(args, epoch, model, train_dataset) eval_loss = evaluate(args, epoch, model, dev_dataset) # If the model's evaluation loss yields a global improvement, # checkpoint the model. eval_history.append(eval_loss < best_eval_loss) if eval_loss < best_eval_loss: best_eval_loss = eval_loss torch.save(model.state_dict(), args.model_path) print(f'epoch = {epoch} | ' f'train loss = {train_loss:.6f} | ' f'eval loss = {eval_loss:.6f} | ' f"{'saving model!' if eval_history[-1] else ''}") # If early stopping conditions are met, stop training. if _early_stop(args, eval_history): suffix = 's' if args.early_stop > 1 else '' print(f'no improvement after {args.early_stop} epoch{suffix}. ' 'early stopping...') print() break if args.do_test: # Write predictions to the output file. Use the printed command # below to obtain official EM/F1 metrics. write_predictions(args, model, dev_dataset) eval_cmd = ('python3 evaluate.py ' f'--dataset_path {args.dev_path} ' f'--output_path {args.output_path}') print() print(f'predictions written to \'{args.output_path}\'') print(f'compute EM/F1 with: \'{eval_cmd}\'') print()
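# --- Illustrative sketch (not part of the original code) ---
# The EDA() call above refers to an augmentation routine defined elsewhere in
# this repo; its sr/ri/rs/rd probabilities mirror the four Easy Data Augmentation
# operations (synonym replacement, random insertion, random swap, random
# deletion). As a rough illustration only, two of those operations applied to a
# plain token list might look like the functions below; the repo's version works
# on whole QADataset objects and presumably also handles answer-span bookkeeping,
# which this sketch does not.
import random


def random_deletion(tokens, p=0.05):
    # Drop each token independently with probability p, keeping at least one.
    kept = [t for t in tokens if random.random() > p]
    return kept if kept else [random.choice(tokens)]


def random_swap(tokens, n_swaps=1):
    # Swap two random positions n_swaps times (no-op for very short inputs).
    tokens = list(tokens)
    if len(tokens) < 2:
        return tokens
    for _ in range(n_swaps):
        i, j = random.sample(range(len(tokens)), 2)
        tokens[i], tokens[j] = tokens[j], tokens[i]
    return tokens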