def main(args): # Set up logging and devices args.save_dir = util.get_save_dir(args.save_dir, args.name, training=True) log = util.get_logger(args.save_dir, args.name) tbx = SummaryWriter(args.save_dir) device, args.gpu_ids = util.get_available_devices() log.info('Args: {}'.format(dumps(vars(args), indent=4, sort_keys=True))) args.batch_size *= max(1, len(args.gpu_ids)) # Set random seed log.info('Using random seed {}...'.format(args.seed)) random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) torch.cuda.manual_seed_all(args.seed) # Get embeddings # Args: word_vectors: word vector tensor of dimension [vocab_size * wemb_dim] log.info('Loading embeddings...') word_vectors = util.torch_from_json(args.word_emb_file) char_vectors = util.torch_from_json(args.char_emb_file) # Get Model log.info('Building Model...') model = QANet(word_vectors, char_vectors, args.para_limit, args.ques_limit, args.f_model, num_head=args.num_head, train_cemb = (not args.pretrained_char)) model = nn.DataParallel(model, args.gpu_ids) if args.load_path: log.info('Loading checkpoint from {}...'.format(args.load_path)) model, step = util.load_model(model, args.load_path, args.gpu_ids) else: step = 0 model = model.to(device) model.train() ema = util.EMA(model, args.ema_decay) # Get saver saver = util.CheckpointSaver(args.save_dir, max_checkpoints=args.max_checkpoints, metric_name=args.metric_name, maximize_metric=args.maximize_metric, log=log) # Get optimizer and scheduler parameters = filter(lambda p: p.requires_grad, model.parameters()) optimizer = optim.Adam( params=parameters, lr=args.lr, betas=(args.beta1, args.beta2), eps=1e-8, weight_decay=3e-7) cr = 1.0 / math.log(args.lr_warm_up_num) scheduler = optim.lr_scheduler.LambdaLR( optimizer, lr_lambda=lambda ee: cr * math.log(ee + 1) if ee < args.lr_warm_up_num else 1) loss_f = torch.nn.CrossEntropyLoss() # Get data loader log.info('Building dataset...') train_dataset = SQuAD(args.train_record_file, args.use_squad_v2) train_loader = data.DataLoader(train_dataset, batch_size=args.batch_size, shuffle=True, num_workers=args.num_workers, collate_fn=collate_fn) dev_dataset = SQuAD(args.dev_record_file, args.use_squad_v2) dev_loader = data.DataLoader(dev_dataset, batch_size=args.batch_size, shuffle=False, num_workers=args.num_workers, collate_fn=collate_fn) # Train log.info('Training...') steps_till_eval = args.eval_steps epoch = step // len(train_dataset) while epoch != args.num_epochs: epoch += 1 log.info('Starting epoch {}...'.format(epoch)) with torch.enable_grad(), \ tqdm(total=len(train_loader.dataset)) as progress_bar: for cw_idxs, cc_idxs, qw_idxs, qc_idxs, y1, y2, ids in train_loader: # Setup for forward cw_idxs = cw_idxs.to(device) qw_idxs = qw_idxs.to(device) cc_idxs = cc_idxs.to(device) qc_idxs = qc_idxs.to(device) batch_size = cw_idxs.size(0) optimizer.zero_grad() # Forward log_p1, log_p2 = model(cw_idxs, cc_idxs, qw_idxs, qc_idxs) y1, y2 = y1.to(device), y2.to(device) loss = torch.mean(loss_f(log_p1, y1) + loss_f(log_p2, y2)) loss_val = loss.item() # Backward loss.backward() nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm) optimizer.step() scheduler.step(step // batch_size) ema(model, step // batch_size) # Log info step += batch_size progress_bar.update(batch_size) progress_bar.set_postfix(epoch=epoch, NLL=loss_val) tbx.add_scalar('train/NLL', loss_val, step) tbx.add_scalar('train/LR', optimizer.param_groups[0]['lr'], step) steps_till_eval -= batch_size if steps_till_eval <= 0: steps_till_eval = args.eval_steps # Evaluate and save checkpoint log.info('Evaluating at step {}...'.format(step)) ema.assign(model) results, pred_dict = evaluate(model, dev_loader, device, args.dev_eval_file, args.max_ans_len, args.use_squad_v2) saver.save(step, model, results[args.metric_name], device) ema.resume(model) # Log to console results_str = ', '.join('{}: {:05.2f}'.format(k, v) for k, v in results.items()) log.info('Dev {}'.format(results_str)) # Log to TensorBoard log.info('Visualizing in TensorBoard...') for k, v in results.items(): tbx.add_scalar('dev/{}'.format(k), v, step) util.visualize(tbx, pred_dict=pred_dict, eval_path=args.dev_eval_file, step=step, split='dev', num_visuals=args.num_visuals)
def main(args): # Set up logging and devices args.save_dir = util.get_save_dir(args.save_dir, args.name, training=True) log = util.get_logger(args.save_dir, args.name) tbx = SummaryWriter(args.save_dir) device, args.gpu_ids = util.get_available_devices() log.info(f'Args: {dumps(vars(args), indent=4, sort_keys=True)}') args.batch_size *= max(1, len(args.gpu_ids)) # Set random seed log.info(f'Using random seed {args.seed}...') random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) torch.cuda.manual_seed_all(args.seed) # Get embeddings log.info('Loading embeddings...') word_vectors = util.torch_from_json(args.word_emb_file) char_vectors = util.torch_from_json(args.char_emb_file) # Get model log.info('Building model...') """ model = BiDAF(word_vectors=word_vectors, hidden_size=args.hidden_size, drop_prob=args.drop_prob) """ ''' model = charBiDAF(word_vectors=word_vectors, char_vectors=char_vectors, emb_size=char_vectors.size(1), hidden_size=args.hidden_size, drop_prob=args.drop_prob) ''' model = QANet(word_vectors=word_vectors, char_vectors=char_vectors, emb_size=char_vectors.size(1), hidden_size=args.hidden_size, drop_prob=args.drop_prob) print( "YOU ARE ENTERING THE QA NET MODEL TRAINING_____________________________________________________________________" ) model = nn.DataParallel(model, args.gpu_ids) if args.load_path: log.info(f'Loading checkpoint from {args.load_path}...') model, step = util.load_model(model, args.load_path, args.gpu_ids) else: step = 0 model = model.to(device) model.train() #print("Model status - ", cuda_or_cpu(model)) ema = util.EMA(model, args.ema_decay) # Get saver saver = util.CheckpointSaver(args.save_dir, max_checkpoints=args.max_checkpoints, metric_name=args.metric_name, maximize_metric=args.maximize_metric, log=log) # Get optimizer and scheduler optimizer = optim.Adadelta(model.parameters(), args.lr, weight_decay=args.l2_wd) scheduler = sched.LambdaLR(optimizer, lambda s: 1.) # Constant LR # Get data loader log.info('Building dataset...') train_dataset = SQuAD(args.train_record_file, args.use_squad_v2) #torch.utils.data._utils.MP_STATUS_CHECK_INTERVAL = 300 train_loader = data.DataLoader(train_dataset, batch_size=args.batch_size, shuffle=True, num_workers=args.num_workers, collate_fn=collate_fn) dev_dataset = SQuAD(args.dev_record_file, args.use_squad_v2) dev_loader = data.DataLoader(dev_dataset, batch_size=args.batch_size, shuffle=False, num_workers=0, collate_fn=collate_fn) # Train log.info('Training...') steps_till_eval = args.eval_steps epoch = step // len(train_dataset) while epoch != args.num_epochs: epoch += 1 log.info(f'Starting epoch {epoch}...') with torch.enable_grad(), \ tqdm(total=len(train_loader.dataset)) as progress_bar: for cw_idxs, cc_idxs, qw_idxs, qc_idxs, y1, y2, ids in train_loader: # Setup for forward cw_idxs = cw_idxs.to(device) cc_idxs = cc_idxs.to(device) qw_idxs = qw_idxs.to(device) qc_idxs = qc_idxs.to(device) batch_size = cw_idxs.size(0) optimizer.zero_grad() # Forward log_p1, log_p2 = model(cw_idxs, cc_idxs, qw_idxs, qc_idxs) y1, y2 = y1.to(device), y2.to(device) loss = F.nll_loss(log_p1, y1) + F.nll_loss(log_p2, y2) loss_val = loss.item() # Backward loss.backward() nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm) optimizer.step() #print(torch.cuda.memory_allocated(device=None)) scheduler.step(step // batch_size) ema(model, step // batch_size) #torch.cuda.empty_cache() # Log info step += batch_size progress_bar.update(batch_size) progress_bar.set_postfix(epoch=epoch, NLL=loss_val) tbx.add_scalar('train/NLL', loss_val, step) tbx.add_scalar('train/LR', optimizer.param_groups[0]['lr'], step) steps_till_eval -= batch_size if steps_till_eval <= 0: steps_till_eval = args.eval_steps # Evaluate and save checkpoint log.info(f'Evaluating at step {step}...') # print('Memory 1: ', torch.cuda.memory_allocated()) ema.assign(model) # print('Memory 2: ', torch.cuda.memory_allocated()) results, pred_dict = evaluate(model, dev_loader, device, args.dev_eval_file, args.max_ans_len, args.use_squad_v2) saver.save(step, model, results[args.metric_name], device) ema.resume(model) # Log to console results_str = ', '.join(f'{k}: {v:05.2f}' for k, v in results.items()) log.info(f'Dev {results_str}') # Log to TensorBoard log.info('Visualizing in TensorBoard...') for k, v in results.items(): tbx.add_scalar(f'dev/{k}', v, step) util.visualize(tbx, pred_dict=pred_dict, eval_path=args.dev_eval_file, step=step, split='dev', num_visuals=args.num_visuals)
def train(config): from models import QANet with open(config.word_emb_file, "r") as fh: word_mat = np.array(json.load(fh), dtype=np.float32) with open(config.char_emb_file, "r") as fh: char_mat = np.array(json.load(fh), dtype=np.float32) with open(config.train_eval_file, "r") as fh: train_eval_file = json.load(fh) with open(config.dev_eval_file, "r") as fh: dev_eval_file = json.load(fh) with open(config.dev_meta, "r") as fh: meta = json.load(fh) train_log = open(config.train_log, "w") dev_total = meta["total"] print("Building model...") train_dataset = SQuADDataset(config.train_record_file, config.num_steps, config.batch_size) dev_dataset = SQuADDataset(config.dev_record_file, config.val_num_batches, config.batch_size) lr = config.learning_rate model = QANet(word_mat, char_mat).to(device) model.train() parameters = filter(lambda param: param.requires_grad, model.parameters()) optimizer = optim.Adam(betas=(0.8, 0.999), eps=1e-7, weight_decay=3e-7, params=parameters) crit = lr / math.log2(1000) scheduler = optim.lr_scheduler.LambdaLR( optimizer, lr_lambda=lambda ee: crit * math.log2(ee + 1) if ee + 1 <= 1000 else lr) for ep in tqdm(range(config.num_steps), total=config.num_steps): model.zero_grad() (Cwid, Ccid, Qwid, Qcid, y1, y2, ids) = train_dataset[ep] Cwid, Ccid, Qwid, Qcid = Cwid.to(device), Ccid.to(device), Qwid.to( device), Qcid.to(device) p1, p2 = model(Cwid, Ccid, Qwid, Qcid) y1, y2 = y1.to(device), y2.to(device) loss1 = F.cross_entropy(p1, y1) loss2 = F.cross_entropy(p2, y2) loss = loss1 + loss2 loss.backward(retain_graph=True) scheduler.step() del loss, p1, p2 if (ep + 1) % config.checkpoint == 0: del Cwid, Ccid, Qwid, Qcid, y1, y2, p1, p2, loss torch.cuda.empty_cache() metric = evaluate_batch(model, dev_eval_file, dev_dataset) log_ = "EPOCH {:8d} loss {:8f} F1 {:8f} EM {:8f}\n".format( ep, metric["loss"], metric["f1"], metric["exact_match"]) train_log.write(log_) train_log.flush() dev_f1 = metric["f1"] dev_em = metric["exact_match"] if dev_f1 < best_f1 and dev_em < best_em: patience += 1 if patience > config.early_stop: break else: patience = 0 best_em = max(best_em, dev_em) best_f1 = max(best_f1, dev_f1) fn = os.path.join(config.save_dir, "model_{}.ckpt".format(ep)) torch.save(model, fn, pickle_protocol=False)
def main(args): # Set up logging and devices args.save_dir = util.get_save_dir(args.save_dir, args.name, training=True) log = util.get_logger(args.save_dir, args.name) tbx = SummaryWriter(args.save_dir) device, args.gpu_ids = util.get_available_devices() log.info('Args: {}'.format(dumps(vars(args), indent=4, sort_keys=True))) args.batch_size *= max(1, len(args.gpu_ids)) # Set random seed log.info('Using random seed {}...'.format(args.seed)) random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) torch.cuda.manual_seed_all(args.seed) # Get embeddings log.info('Loading embeddings...') word_vectors = util.torch_from_json(args.word_emb_file) # setup_args = get_setup_args() with open(args.char2idx_file, "r") as f: char2idx = json_load(f) # Get model log.info('Building model...') model = QANet(word_vectors=word_vectors, char2idx=char2idx) model = nn.DataParallel(model, args.gpu_ids) if args.load_path: log.info('Loading checkpoint from {}...'.format(args.load_path)) model, step = util.load_model(model, args.load_path, args.gpu_ids) else: step = 0 model = model.to(device) model.train() ema = util.EMA(model, args.ema_decay) # Get saver saver = util.CheckpointSaver(args.save_dir, max_checkpoints=args.max_checkpoints, metric_name=args.metric_name, maximize_metric=args.maximize_metric, log=log) # Get optimizer and scheduler optimizer = optim.Adadelta(model.parameters(), args.lr, weight_decay=args.l2_wd) # optimizer = optim.Adam(model.parameters(), lr=0.001, betas=(0.8, 0.999)) scheduler = sched.LambdaLR(optimizer, lambda s: 1.) # Constant LR # Get data loader log.info('Building dataset...') dataset1 = np.load(args.train_record_file) outfile = '/content/dataset/train_light.npz' n_row = 75000 np.savez(outfile, context_idxs=dataset1['context_idxs'][:n_row], context_char_idxs=dataset1['context_char_idxs'][:n_row], ques_idxs=dataset1['ques_idxs'][:n_row], ques_char_idxs=dataset1['ques_char_idxs'][:n_row], y1s=dataset1['y1s'][:n_row], y2s=dataset1['y2s'][:n_row], ids=dataset1['ids'][:n_row]) # train_dataset = SQuAD(args.train_record_file, args.use_squad_v2) train_dataset = SQuAD(outfile, args.use_squad_v2) train_loader = data.DataLoader(train_dataset, batch_size=args.batch_size, shuffle=True, num_workers=args.num_workers, collate_fn=collate_fn) dev_dataset = SQuAD(args.dev_record_file, args.use_squad_v2) dev_loader = data.DataLoader(dev_dataset, batch_size=args.batch_size, shuffle=False, num_workers=args.num_workers, collate_fn=collate_fn) # Train log.info('Training...') steps_till_eval = args.eval_steps epoch = step // len(train_dataset) while epoch != args.num_epochs: epoch += 1 log.info('Starting epoch {}...'.format(epoch)) with torch.enable_grad(), \ tqdm(total=len(train_loader.dataset)) as progress_bar: for cw_idxs, cc_idxs, qw_idxs, qc_idxs, y1, y2, ids in train_loader: # Setup for forward cw_idxs = cw_idxs.to(device) qw_idxs = qw_idxs.to(device) batch_size = cw_idxs.size(0) optimizer.zero_grad() # Forward log_p1, log_p2 = model(cw_idxs, cc_idxs, qw_idxs, qc_idxs) y1, y2 = y1.to(device), y2.to(device) loss = F.nll_loss(log_p1, y1) + F.nll_loss(log_p2, y2) loss_val = loss.item() if step % 10000 == 0: print('loss val: {}'.format(loss_val)) # Backward loss.backward() nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm) optimizer.step() scheduler.step(step // batch_size) ema(model, step // batch_size) # Log info step += batch_size progress_bar.update(batch_size) progress_bar.set_postfix(epoch=epoch, NLL=loss_val) tbx.add_scalar('train/NLL', loss_val, step) tbx.add_scalar('train/LR', optimizer.param_groups[0]['lr'], step) steps_till_eval -= batch_size if steps_till_eval <= 0: steps_till_eval = args.eval_steps # Evaluate and save checkpoint log.info('Evaluating at step {}...'.format(step)) ema.assign(model) results, pred_dict = evaluate(model, dev_loader, device, args.dev_eval_file, args.max_ans_len, args.use_squad_v2) saver.save(step, model, results[args.metric_name], device) ema.resume(model) # Log to console results_str = ', '.join('{}: {:05.2f}'.format(k, v) for k, v in results.items()) log.info('Dev {}'.format(results_str)) # Log to TensorBoard log.info('Visualizing in TensorBoard...') for k, v in results.items(): tbx.add_scalar('dev/{}'.format(k), v, step) util.visualize(tbx, pred_dict=pred_dict, eval_path=args.dev_eval_file, step=step, split='dev', num_visuals=args.num_visuals)
def main(args): # Set up logging and devices args.save_dir = util.get_save_dir(args.save_dir, args.name, training=True) log = util.get_logger(args.save_dir, args.name) tbx = SummaryWriter(args.save_dir) device, args.gpu_ids = util.get_available_devices() log.info('Args: {}'.format(dumps(vars(args), indent=4, sort_keys=True))) args.batch_size *= max(1, len(args.gpu_ids)) # Set random seed log.info('Using random seed {}...'.format(args.seed)) random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) torch.cuda.manual_seed_all(args.seed) # Get word embeddings log.info('Loading word embeddings...') word_vectors = util.torch_from_json(args.word_emb_file) ### start our code: # Get char-embeddings log.info('Loading char-embeddings...') char_vectors = util.torch_from_json(args.char_emb_file) ### end our code # Get model log.info('Building model...') # model = BiDAF(word_vectors=word_vectors, # hidden_size=args.hidden_size, # char_vectors = char_vectors, # drop_prob=args.drop_prob) ### start our code: (QANet) model = QANet(word_vectors=word_vectors, char_vectors=char_vectors, hidden_size=args.hidden_size, kernel_size=7, filters=128, drop_prob=args.drop_prob) ### end our code model = nn.DataParallel(model, args.gpu_ids) if args.load_path: log.info('Loading checkpoint from {}...'.format(args.load_path)) model, step = util.load_model(model, args.load_path, args.gpu_ids) else: step = 0 model = model.to(device) model.train() ema = util.EMA(model, args.ema_decay) # Get saver saver = util.CheckpointSaver(args.save_dir, max_checkpoints=args.max_checkpoints, metric_name=args.metric_name, maximize_metric=args.maximize_metric, log=log) # Get optimizer and scheduler # https://pytorch.org/docs/stable/optim.html # Original: # optimizer = optim.Adadelta(model.parameters(), args.lr, # weight_decay=args.l2_wd) # # default: lr=0.5, rho=0.9, eps=1e-06, weight_decay=0 # scheduler = sched.LambdaLR(optimizer, lambda s: 1.) # Constant LR # A function which computes a multiplicative factor given an integer parameter epoch, # or a list of such functions, one for each group in optimizer.param_groups ### start our code: optimizer = optim.Adam(model.parameters(), lr=1e-3, betas=(0.8, 0.999), eps=1e-7, weight_decay=3e-7, amsgrad=False) scheduler = sched.LambdaLR(optimizer, lambda s: 1.) # Constant LR # Adabound # optimizer = adabound.AdaBound(model.parameters(), lr=1e-3, final_lr=0.1) # scheduler = sched.LambdaLR(optimizer, lambda s: 1.) ### end our code # Get data loader log.info('Building dataset...') train_dataset = SQuAD(args.train_record_file, args.use_squad_v2) train_loader = data.DataLoader(train_dataset, batch_size=args.batch_size, shuffle=True, num_workers=args.num_workers, collate_fn=collate_fn) dev_dataset = SQuAD(args.dev_record_file, args.use_squad_v2) dev_loader = data.DataLoader(dev_dataset, batch_size=args.batch_size, shuffle=False, num_workers=args.num_workers, collate_fn=collate_fn) # Train log.info('Training...') steps_till_eval = args.eval_steps epoch = step // len(train_dataset) while epoch != args.num_epochs: epoch += 1 log.info('Starting epoch {}...'.format(epoch)) with torch.enable_grad(), \ tqdm(total=len(train_loader.dataset)) as progress_bar: for cw_idxs, cc_idxs, qw_idxs, qc_idxs, y1, y2, ids in train_loader: # print("cc_idxs: ", cc_idxs) # print("qc_idxs: ", qc_idxs) # Setup for forward cw_idxs = cw_idxs.to(device) qw_idxs = qw_idxs.to(device) ### start our code: cc_idxs = cc_idxs.to(device) qc_idxs = qc_idxs.to(device) ### end our code batch_size = cw_idxs.size(0) optimizer.zero_grad() # Forward # log_p1, log_p2 = model(cw_idxs, qw_idxs) # original ### start our code: log_p1, log_p2 = model(cw_idxs, qw_idxs, cc_idxs, qc_idxs) ### end our code y1, y2 = y1.to(device), y2.to(device) loss = F.nll_loss(log_p1, y1) + F.nll_loss(log_p2, y2) loss_val = loss.item() # Backward loss.backward() nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm) ### start our code: # optimizer = LR_decay(optimizer, epoch, lr = 0.8) # initial learning rate 0.8 if step < 1000: optimizer = LR_warmup(optimizer, step, lr=0.) ### end our code optimizer.step() scheduler.step(step // batch_size) ema(model, step // batch_size) # Log info step += batch_size progress_bar.update(batch_size) progress_bar.set_postfix(epoch=epoch, NLL=loss_val) tbx.add_scalar('train/NLL', loss_val, step) tbx.add_scalar('train/LR', optimizer.param_groups[0]['lr'], step) steps_till_eval -= batch_size if steps_till_eval <= 0: steps_till_eval = args.eval_steps # Evaluate and save checkpoint log.info('Evaluating at step {}...'.format(step)) ema.assign(model) results, pred_dict = evaluate(model, dev_loader, device, args.dev_eval_file, args.max_ans_len, args.use_squad_v2) saver.save(step, model, results[args.metric_name], device) ema.resume(model) # Log to console results_str = ', '.join('{}: {:05.2f}'.format(k, v) for k, v in results.items()) log.info('Dev {}'.format(results_str)) # Log to TensorBoard log.info('Visualizing in TensorBoard...') for k, v in results.items(): tbx.add_scalar('dev/{}'.format(k), v, step) util.visualize(tbx, pred_dict=pred_dict, eval_path=args.dev_eval_file, step=step, split='dev', num_visuals=args.num_visuals)