def init_training(args, word_vectors, char_vectors, device, config=None): config = config or {} model_name = args.name if model_name == 'baseline': model, optimizer, scheduler = init_baseline_training(args, word_vectors, config) else: if model_name == 'claf': model, optimizer, scheduler = init_claf_training(args, char_vectors, word_vectors, config) elif model_name == 'qanet': model, optimizer, scheduler = init_qanet_training(args, char_vectors, word_vectors, config) elif model_name == 'qanet2': model, optimizer, scheduler = init_qanet2_training(args, char_vectors, word_vectors, config) else: raise Exception("Unknown model") model = nn.DataParallel(model, args.gpu_ids) if args.load_path: model, step = util.load_model(model, args.load_path, args.gpu_ids) else: step = 0 model = model.to(device) model.train() ema_decay = args.ema_decay ema = util.EMA(model, ema_decay) return model, optimizer, scheduler, ema, step
def main(args): # Set up logging and devices args.save_dir = util.get_save_dir(args.save_dir, args.name, training=True) log = util.get_logger(args.save_dir, args.name) tbx = SummaryWriter(args.save_dir) device, args.gpu_ids = util.get_available_devices() log.info('Args: {}'.format(dumps(vars(args), indent=4, sort_keys=True))) args.batch_size *= max(1, len(args.gpu_ids)) # Set random seed log.info('Using random seed {}...'.format(args.seed)) random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) torch.cuda.manual_seed_all(args.seed) # Get word embeddings log.info('Loading word embeddings...') word_vectors = util.torch_from_json(args.word_emb_file) ### start our code: # Get char-embeddings log.info('Loading char-embeddings...') char_vectors = util.torch_from_json(args.char_emb_file) ### end our code # Get model log.info('Building model...') # model = BiDAF(word_vectors=word_vectors, # hidden_size=args.hidden_size, # char_vectors = char_vectors, # drop_prob=args.drop_prob) ### start our code: (QANet) model = QANet(word_vectors=word_vectors, char_vectors=char_vectors, hidden_size=args.hidden_size, kernel_size=7, filters=128, drop_prob=args.drop_prob) ### end our code model = nn.DataParallel(model, args.gpu_ids) if args.load_path: log.info('Loading checkpoint from {}...'.format(args.load_path)) model, step = util.load_model(model, args.load_path, args.gpu_ids) else: step = 0 model = model.to(device) model.train() ema = util.EMA(model, args.ema_decay) # Get saver saver = util.CheckpointSaver(args.save_dir, max_checkpoints=args.max_checkpoints, metric_name=args.metric_name, maximize_metric=args.maximize_metric, log=log) # Get optimizer and scheduler # https://pytorch.org/docs/stable/optim.html # Original: # optimizer = optim.Adadelta(model.parameters(), args.lr, # weight_decay=args.l2_wd) # # default: lr=0.5, rho=0.9, eps=1e-06, weight_decay=0 # scheduler = sched.LambdaLR(optimizer, lambda s: 1.) # Constant LR # A function which computes a multiplicative factor given an integer parameter epoch, # or a list of such functions, one for each group in optimizer.param_groups ### start our code: optimizer = optim.Adam(model.parameters(), lr=1e-3, betas=(0.8, 0.999), eps=1e-7, weight_decay=3e-7, amsgrad=False) scheduler = sched.LambdaLR(optimizer, lambda s: 1.) # Constant LR # Adabound # optimizer = adabound.AdaBound(model.parameters(), lr=1e-3, final_lr=0.1) # scheduler = sched.LambdaLR(optimizer, lambda s: 1.) ### end our code # Get data loader log.info('Building dataset...') train_dataset = SQuAD(args.train_record_file, args.use_squad_v2) train_loader = data.DataLoader(train_dataset, batch_size=args.batch_size, shuffle=True, num_workers=args.num_workers, collate_fn=collate_fn) dev_dataset = SQuAD(args.dev_record_file, args.use_squad_v2) dev_loader = data.DataLoader(dev_dataset, batch_size=args.batch_size, shuffle=False, num_workers=args.num_workers, collate_fn=collate_fn) # Train log.info('Training...') steps_till_eval = args.eval_steps epoch = step // len(train_dataset) while epoch != args.num_epochs: epoch += 1 log.info('Starting epoch {}...'.format(epoch)) with torch.enable_grad(), \ tqdm(total=len(train_loader.dataset)) as progress_bar: for cw_idxs, cc_idxs, qw_idxs, qc_idxs, y1, y2, ids in train_loader: # print("cc_idxs: ", cc_idxs) # print("qc_idxs: ", qc_idxs) # Setup for forward cw_idxs = cw_idxs.to(device) qw_idxs = qw_idxs.to(device) ### start our code: cc_idxs = cc_idxs.to(device) qc_idxs = qc_idxs.to(device) ### end our code batch_size = cw_idxs.size(0) optimizer.zero_grad() # Forward # log_p1, log_p2 = model(cw_idxs, qw_idxs) # original ### start our code: log_p1, log_p2 = model(cw_idxs, qw_idxs, cc_idxs, qc_idxs) ### end our code y1, y2 = y1.to(device), y2.to(device) loss = F.nll_loss(log_p1, y1) + F.nll_loss(log_p2, y2) loss_val = loss.item() # Backward loss.backward() nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm) ### start our code: # optimizer = LR_decay(optimizer, epoch, lr = 0.8) # initial learning rate 0.8 if step < 1000: optimizer = LR_warmup(optimizer, step, lr=0.) ### end our code optimizer.step() scheduler.step(step // batch_size) ema(model, step // batch_size) # Log info step += batch_size progress_bar.update(batch_size) progress_bar.set_postfix(epoch=epoch, NLL=loss_val) tbx.add_scalar('train/NLL', loss_val, step) tbx.add_scalar('train/LR', optimizer.param_groups[0]['lr'], step) steps_till_eval -= batch_size if steps_till_eval <= 0: steps_till_eval = args.eval_steps # Evaluate and save checkpoint log.info('Evaluating at step {}...'.format(step)) ema.assign(model) results, pred_dict = evaluate(model, dev_loader, device, args.dev_eval_file, args.max_ans_len, args.use_squad_v2) saver.save(step, model, results[args.metric_name], device) ema.resume(model) # Log to console results_str = ', '.join('{}: {:05.2f}'.format(k, v) for k, v in results.items()) log.info('Dev {}'.format(results_str)) # Log to TensorBoard log.info('Visualizing in TensorBoard...') for k, v in results.items(): tbx.add_scalar('dev/{}'.format(k), v, step) util.visualize(tbx, pred_dict=pred_dict, eval_path=args.dev_eval_file, step=step, split='dev', num_visuals=args.num_visuals)
def main(args): # Set up logging and devices args.save_dir = util.get_save_dir(args.save_dir, args.name, training=True) log = util.get_logger(args.save_dir, args.name) tbx = SummaryWriter(args.save_dir) device, args.gpu_ids = util.get_available_devices() log.info(f'Args: {dumps(vars(args), indent=4, sort_keys=True)}') args.batch_size *= max(1, len(args.gpu_ids)) # Set random seed log.info(f'Using random seed {args.seed}...') random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) torch.cuda.manual_seed_all(args.seed) # Get model log.info('Building model...') ''' TODO: YOUR MODEL HERE ''' model = None model = nn.DataParallel(model, args.gpu_ids) if args.load_path: log.info(f'Loading checkpoint from {args.load_path}...') model, step = util.load_model(model, args.load_path, args.gpu_ids) else: step = 0 model = model.to(device) model.train() ema = util.EMA(model, args.ema_decay) # Get saver saver = util.CheckpointSaver(args.save_dir, max_checkpoints=args.max_checkpoints, metric_name=args.metric_name, maximize_metric=args.maximize_metric, log=log) # Get optimizer and scheduler optimizer = optim.Adadelta(model.parameters(), args.lr, weight_decay=args.l2_wd) scheduler = sched.LambdaLR(optimizer, lambda s: 1.) # Constant LR # Get data loader log.info('Building dataset...') train_dataset = MyDataset(args.train_record_file) train_loader = data.DataLoader(train_dataset, batch_size=args.batch_size, shuffle=True, num_workers=args.num_workers, collate_fn=collate_fn) dev_dataset = MyDataset(args.dev_record_file, args.use_squad_v2) dev_loader = data.DataLoader(dev_dataset, batch_size=args.batch_size, shuffle=False, num_workers=args.num_workers, collate_fn=collate_fn) # Train log.info('Training...') steps_till_eval = args.eval_steps epoch = step // len(train_dataset) while epoch != args.num_epochs: epoch += 1 log.info(f'Starting epoch {epoch}...') with torch.enable_grad(), \ tqdm(total=len(train_loader.dataset)) as progress_bar: for features, ys, ids in train_loader: # Setup for forward batch_size = 1 # TODO: optimizer.zero_grad() # Forward outputs = model(features) y = y.to(device) loss = loss_fn(outputs, y) # TODO loss_val = loss.item() # Backward loss.backward() nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm) optimizer.step() scheduler.step(step // batch_size) ema(model, step // batch_size) # Log info step += batch_size progress_bar.update(batch_size) progress_bar.set_postfix(epoch=epoch, NLL=loss_val) tbx.add_scalar('train/NLL', loss_val, step) tbx.add_scalar('train/LR', optimizer.param_groups[0]['lr'], step) steps_till_eval -= batch_size if steps_till_eval <= 0: steps_till_eval = args.eval_steps # Evaluate and save checkpoint log.info(f'Evaluating at step {step}...') ema.assign(model) results, pred_dict = evaluate(model, dev_loader, device, args.dev_eval_file) saver.save(step, model, results[args.metric_name], device) ema.resume(model) # Log to console results_str = ', '.join(f'{k}: {v:05.2f}' for k, v in results.items()) log.info(f'Dev {results_str}') # Log to TensorBoard log.info('Visualizing in TensorBoard...') for k, v in results.items(): tbx.add_scalar(f'dev/{k}', v, step) util.visualize(tbx, pred_dict=pred_dict, eval_path=args.dev_eval_file, step=step, split='dev', num_visuals=args.num_visuals)
def train_QaNet(args): device, args.gpu_ids = util.get_available_devices() device = torch.device("cuda" if torch.cuda.is_available() else "cpu") word_mat = util.torch_from_json(args.word_emb_file) char_mat = util.torch_from_json(args.char_emb_file) with open(args.dev_eval_file, 'r') as fh: dev_eval_file = json_load(fh) print("Building model...") train_dataset = SQuAD(args.train_record_file, args.use_squad_v2) train_dataset = data.DataLoader(train_dataset, batch_size=args.batch_size, shuffle=True, num_workers=args.num_workers, collate_fn=collate_fn) dev_dataset = SQuAD(args.dev_record_file, args.use_squad_v2) dev_dataset = data.DataLoader(dev_dataset, batch_size=args.batch_size, shuffle=False, num_workers=args.num_workers, collate_fn=collate_fn) lr = args.lr base_lr = 1 lr_warm_up_num = args.lr_warm_up_num model = QaNet(word_mat, char_mat, args.connector_dim, args.glove_dim, args.char_dim, args.drop_prob, args.dropout_char, args.num_heads, args.c_len, args.q_len).to(device) ema = util.EMA(model, args.ema_decay) parameters = filter(lambda param: param.requires_grad, model.parameters()) optimizer = optim.Adam(lr=base_lr, betas=(0.9, 0.999), eps=1e-7, weight_decay=5e-8, params=parameters) cr = lr / math.log2(lr_warm_up_num) scheduler = optim.lr_scheduler.LambdaLR( optimizer, lr_lambda=lambda ee: cr * math.log2(ee + 1) if ee < lr_warm_up_num else lr) best_f1 = 0 best_em = 0 patience = 0 unused = False for iter in range(args.num_epochs): train(model, optimizer, scheduler, train_dataset, dev_dataset, dev_eval_file, iter, ema, device) ema.assign(model) metrics = test(model, dev_dataset, dev_eval_file, (iter + 1) * len(train_dataset)) dev_f1 = metrics["f1"] dev_em = metrics["exact_match"] if dev_f1 < best_f1 and dev_em < best_em: patience += 1 if patience > args.early_stop: break else: patience = 0 best_f1 = max(best_f1, dev_f1) best_em = max(best_em, dev_em) fn = os.path.join(args.save_dir, "model.pt") torch.save(model, fn) ema.resume(model)
def main(args): # Set up logging and devices args.save_dir = util.get_save_dir(args.save_dir, args.name, training=True) log = util.get_logger(args.save_dir, args.name) tbx = SummaryWriter(args.save_dir) device, args.gpu_ids = util.get_available_devices() log.info('Args: {}'.format(dumps(vars(args), indent=4, sort_keys=True))) args.batch_size *= max(1, len(args.gpu_ids)) # Set random seed log.info('Using random seed {}...'.format(args.seed)) random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) torch.cuda.manual_seed_all(args.seed) # Get embeddings log.info('Loading embeddings...') word_vectors = util.torch_from_json(args.word_emb_file) char_vectors = util.torch_from_json(args.char_emb_file) # ###################################### # tokenizer = BertTokenizer.from_pretrained('bert-large-uncased', do_lower_case=True) # train_examples = None # train_examples = read_squad_examples( # input_file=args.train_file, is_training=True, version_2_with_negative=args.version_2_with_negative) # train_features = convert_examples_to_features( # examples=train_examples, # tokenizer=tokenizer, # max_seq_length=args.max_seq_length, # doc_stride=args.doc_stride, # max_query_length=args.max_query_length, # is_training=True) # if args.local_rank == -1 or torch.distributed.get_rank() == 0: # logger.info(" Saving train features into cached file %s", cached_train_features_file) # with open(cached_train_features_file, "wb") as writer: # pickle.dump(train_features, writer) # all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long) # x = all_input_ids ########################################### # Get model log.info('Building model...') model = BiDAF(word_vectors=word_vectors, char_vectors=char_vectors, hidden_size=args.hidden_size, drop_prob=args.drop_prob) model = nn.DataParallel(model, args.gpu_ids) if args.load_path: log.info('Loading checkpoint from {}...'.format(args.load_path)) model, step = util.load_model(model, args.load_path, args.gpu_ids) else: step = 0 model = model.to(device) model.train() ema = util.EMA(model, args.ema_decay) # Get saver saver = util.CheckpointSaver(args.save_dir, max_checkpoints=args.max_checkpoints, metric_name=args.metric_name, maximize_metric=args.maximize_metric, log=log) # Get optimizer and scheduler optimizer = optim.Adadelta(model.parameters(), args.lr, weight_decay=args.l2_wd) scheduler = sched.LambdaLR(optimizer, lambda s: 1.) # Constant LR # Get data loader log.info('Building dataset...') train_dataset = SQuAD(args.train_record_file, args.use_squad_v2) train_loader = data.DataLoader(train_dataset, batch_size=args.batch_size, shuffle=True, num_workers=args.num_workers, collate_fn=collate_fn) dev_dataset = SQuAD(args.dev_record_file, args.use_squad_v2) dev_loader = data.DataLoader(dev_dataset, batch_size=args.batch_size, shuffle=False, num_workers=args.num_workers, collate_fn=collate_fn) # Train log.info('Training...') steps_till_eval = args.eval_steps epoch = step // len(train_dataset) while epoch != args.num_epochs: epoch += 1 log.info('Starting epoch {}...'.format(epoch)) with torch.enable_grad(), \ tqdm(total=len(train_loader.dataset)) as progress_bar: for cw_idxs, cc_idxs, qw_idxs, qc_idxs, y1, y2, ids in train_loader: # Setup for forward cw_idxs = cw_idxs.to(device) qw_idxs = qw_idxs.to(device) batch_size = cw_idxs.size(0) # added_flag cc_idxs = cc_idxs.to(device) qc_idxs = qc_idxs.to(device) optimizer.zero_grad() # Forward # log_p1, log_p2 = model(cw_idxs, qw_idxs) log_p1, log_p2 = model(cw_idxs, qw_idxs, cc_idxs, qc_idxs) y1, y2 = y1.to(device), y2.to(device) loss = F.nll_loss(log_p1, y1) + F.nll_loss(log_p2, y2) loss_val = loss.item() # Backward loss.backward() nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm) optimizer.step() scheduler.step(step // batch_size) ema(model, step // batch_size) # Log info step += batch_size progress_bar.update(batch_size) progress_bar.set_postfix(epoch=epoch, NLL=loss_val) tbx.add_scalar('train/NLL', loss_val, step) tbx.add_scalar('train/LR', optimizer.param_groups[0]['lr'], step) steps_till_eval -= batch_size if steps_till_eval <= 0: steps_till_eval = args.eval_steps # Evaluate and save checkpoint log.info('Evaluating at step {}...'.format(step)) ema.assign(model) results, pred_dict = evaluate(model, dev_loader, device, args.dev_eval_file, args.max_ans_len, args.use_squad_v2) saver.save(step, model, results[args.metric_name], device) ema.resume(model) # Log to console results_str = ', '.join('{}: {:05.2f}'.format(k, v) for k, v in results.items()) log.info('Dev {}'.format(results_str)) # Log to TensorBoard log.info('Visualizing in TensorBoard...') for k, v in results.items(): tbx.add_scalar('dev/{}'.format(k), v, step) util.visualize(tbx, pred_dict=pred_dict, eval_path=args.dev_eval_file, step=step, split='dev', num_visuals=args.num_visuals)
def main(args): # Set up logging and devices args.save_dir = util.get_save_dir(args.save_dir, args.name, training=True) log = util.get_logger(args.save_dir, args.name) tbx = SummaryWriter(args.save_dir) device, args.gpu_ids = util.get_available_devices() log.info('Args: {}'.format(dumps(vars(args), indent=4, sort_keys=True))) # Comment out to only use 1 GPU on nv12 args.batch_size *= max(1, len(args.gpu_ids)) # Set random seed log.info('Using random seed {}...'.format(args.seed)) random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) torch.cuda.manual_seed_all(args.seed) # Get embeddings log.info('Loading embeddings...') word_vectors = util.torch_from_json(args.word_emb_file) # Get model log.info('Building model...') model = None max_context_len, max_question_len = args.para_limit, args.ques_limit if (args.model_type == "bidaf" or args.model_type == "bert-bidaf"): model = BiDAF(word_vectors=word_vectors, hidden_size=args.hidden_size, drop_prob=args.drop_prob) elif (args.model_type == "dcn" or args.model_type == "bert-dcn"): model = DCN(word_vectors=word_vectors, hidden_size=args.hidden_size, max_context_len=max_context_len, max_question_len=max_question_len, drop_prob=args.drop_prob) elif (args.model_type == "bert-basic"): model = BERT(word_vectors=word_vectors, hidden_size=args.hidden_size, drop_prob=args.drop_prob) if model is None: raise ValueError('Model is unassigned. Please ensure --model_type \ chooses between {bidaf, bert-bidaf, dcn, bert-dcn, bert-basic} ') model = nn.DataParallel(model, args.gpu_ids) if args.load_path: log.info('Loading checkpoint from {}...'.format(args.load_path)) model, step = util.load_model(model, args.load_path, args.gpu_ids) else: step = 0 model = model.to(device) model.train() ema = util.EMA(model, args.ema_decay) # Get saver saver = util.CheckpointSaver(args.save_dir, max_checkpoints=args.max_checkpoints, metric_name=args.metric_name, maximize_metric=args.maximize_metric, log=log) # Get optimizer and scheduler optimizer = optim.Adadelta(model.parameters(), args.lr, weight_decay=args.l2_wd) scheduler = sched.LambdaLR(optimizer, lambda s: 1.) # Constant LR # Get data loader log.info('Building dataset...') train_dataset = SQuAD(args.train_record_file, args.use_squad_v2) train_loader = data.DataLoader(train_dataset, batch_size=args.batch_size, shuffle=True, num_workers=args.num_workers, collate_fn=collate_fn) dev_dataset = SQuAD(args.dev_record_file, args.use_squad_v2) dev_loader = data.DataLoader(dev_dataset, batch_size=args.batch_size, shuffle=False, num_workers=args.num_workers, collate_fn=collate_fn) # Train log.info('Training...') steps_till_eval = args.eval_steps epoch = step // len(train_dataset) count_skip = 0 while epoch != args.num_epochs: epoch += 1 log.info('Starting epoch {}...'.format(epoch)) with torch.enable_grad(), \ tqdm(total=len(train_loader.dataset)) as progress_bar: for cw_idxs, cc_idxs, qw_idxs, qc_idxs, y1, y2, ids in train_loader: batch_size = cw_idxs.size(0) count_skip += 1 if (args.skip_examples == True and (count_skip % 5 == 1 or count_skip % 5 == 2 or count_skip % 5 == 3 or count_skip % 5 == 4)): step += batch_size progress_bar.update(batch_size) steps_till_eval -= batch_size continue # Setup for forward cw_idxs = cw_idxs.to(device) qw_idxs = qw_idxs.to(device) batch_size = cw_idxs.size(0) optimizer.zero_grad() ## Additions for BERT ## max_context_len, max_question_len = args.para_limit, args.ques_limit if "bert" in args.model_type: bert_train_embeddings = get_embeddings( "train", ids, args.para_limit, args.ques_limit) else: bert_train_embeddings = None # Forward log_p1, log_p2 = model(cw_idxs, qw_idxs, bert_train_embeddings, \ max_context_len, max_question_len, device) y1, y2 = y1.to(device), y2.to(device) loss = F.nll_loss(log_p1, y1) + F.nll_loss(log_p2, y2) loss_val = loss.item() # Backward loss.backward() nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm) optimizer.step() scheduler.step(step // batch_size) ema(model, step // batch_size) # Log info step += batch_size progress_bar.update(batch_size) progress_bar.set_postfix(epoch=epoch, NLL=loss_val) tbx.add_scalar('train/NLL', loss_val, step) tbx.add_scalar('train/LR', optimizer.param_groups[0]['lr'], step) steps_till_eval -= batch_size if steps_till_eval <= 0: steps_till_eval = args.eval_steps # Evaluate and save checkpoint log.info('Evaluating at step {}...'.format(step)) ema.assign(model) results, pred_dict = evaluate(model, dev_loader, device, args.dev_eval_file, args.max_ans_len, args.use_squad_v2, args) saver.save(step, model, results[args.metric_name], device) ema.resume(model) # Log to console results_str = ', '.join('{}: {:05.2f}'.format(k, v) for k, v in results.items()) log.info('Dev {}'.format(results_str)) # Log to TensorBoard log.info('Visualizing in TensorBoard...') for k, v in results.items(): tbx.add_scalar('dev/{}'.format(k), v, step) util.visualize(tbx, pred_dict=pred_dict, eval_path=args.dev_eval_file, step=step, split='dev', num_visuals=args.num_visuals)
def main(course_dir, text_embedding_size, audio_embedding_size, image_embedding_size, hidden_size, drop_prob, max_text_length, out_heatmaps_dir, args, batch_size=3, num_epochs=100): # Set up logging and devices args.save_dir = util.get_save_dir(args.save_dir, args.name, training=True) log = util.get_logger(args.save_dir, args.name) tbx = SummaryWriter(args.save_dir) device, args.gpu_ids = util.get_available_devices() log.info(f'Args: {dumps(vars(args), indent=4, sort_keys=True)}') args.batch_size *= max(1, len(args.gpu_ids)) # Set random seed log.info(f'Using random seed {args.seed}...') random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) torch.cuda.manual_seed_all(args.seed) # Create Dataset objects text_dataset = TextDataset(course_dir, max_text_length) audio_dataset = AudioDataset(course_dir) target_dataset = TargetDataset(course_dir) # Preprocess the image in prescribed format normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) transform = transforms.Compose([ transforms.RandomResizedCrop(256), transforms.RandomHorizontalFlip(), transforms.ToTensor(), normalize, ]) image_dataset = ImageDataset(course_dir, transform) assert len(text_dataset) == len(audio_dataset) and len( audio_dataset) == len(image_dataset) and len(image_dataset) == len( target_dataset), "Unequal dataset lengths" # Creating data indices for training and validation splits: train_indices, val_indices = gen_train_val_indices(text_dataset) # Creating PT data samplers and loaders: train_sampler = torch.utils.data.SequentialSampler(train_indices) val_sampler = torch.utils.data.SequentialSampler(val_indices) # Get sentence embeddings train_text_loader = torch.utils.data.DataLoader(text_dataset, batch_size=batch_size, shuffle=False, num_workers=2, collate_fn=collator, sampler=train_sampler) val_text_loader = torch.utils.data.DataLoader(text_dataset, batch_size=batch_size, shuffle=False, num_workers=2, collate_fn=collator, sampler=val_sampler) # Get Audio embeddings train_audio_loader = torch.utils.data.DataLoader(audio_dataset, batch_size=batch_size, shuffle=False, num_workers=2, collate_fn=collator, sampler=train_sampler) val_audio_loader = torch.utils.data.DataLoader(audio_dataset, batch_size=batch_size, shuffle=False, num_workers=2, collate_fn=collator, sampler=val_sampler) # Get images train_image_loader = torch.utils.data.DataLoader(image_dataset, batch_size=batch_size, shuffle=False, num_workers=2, collate_fn=collator, sampler=train_sampler) val_image_loader = torch.utils.data.DataLoader(image_dataset, batch_size=batch_size, shuffle=False, num_workers=2, collate_fn=collator, sampler=val_sampler) # Load Target text train_target_loader = torch.utils.data.DataLoader( target_dataset, batch_size=batch_size, shuffle=False, num_workers=2, collate_fn=target_collator, sampler=train_sampler) val_target_loader = torch.utils.data.DataLoader(target_dataset, batch_size=batch_size, shuffle=False, num_workers=2, collate_fn=target_collator, sampler=val_sampler) # print("lens - train_text_loader {}, val_text_loader {}".format(len(train_text_loader), len(val_text_loader))) # print("lens - train_audio_loader {}, val_audio_loader {}".format(len(train_audio_loader), len(val_audio_loader))) # print("lens - train_image_loader {}, val_image_loader {}".format(len(train_image_loader), len(val_image_loader))) # print("lens - train_target_loader {}, val_target_loader {}".format(len(train_target_loader), len(val_target_loader))) # Create model model = MMBiDAF(hidden_size, text_embedding_size, audio_embedding_size, image_embedding_size, device, drop_prob, max_text_length) model = nn.DataParallel(model, args.gpu_ids) if args.load_path: log.info(f'Loading checkpoint from {args.load_path}...') model, step = util.load_model(model, args.load_path, args.gpu_ids) else: step = 0 model = model.to(device) model.train() ema = util.EMA(model, args.ema_decay) # For exponential moving average # Get saver saver = util.CheckpointSaver(args.save_dir, max_checkpoints=args.max_checkpoints, metric_name=args.metric_name, maximize_metric=args.maximize_metric, log=log) # Need to change the metric name # Get optimizer and scheduler optimizer = optim.Adadelta(model.parameters(), args.lr, weight_decay=args.l2_wd) scheduler = sched.LambdaLR(optimizer, lambda s: 1.) # Constant LR # Let's do this! loss = 0 eps = 1e-8 log.info("Training...") steps_till_eval = args.eval_steps epoch = step // len(TextDataset(course_dir, max_text_length)) while epoch != args.num_epochs: epoch += 1 log.info("Starting epoch {epoch}...") count_item = 0 loss_epoch = 0 with torch.enable_grad(), tqdm( total=len(train_text_loader.dataset)) as progress_bar: for (batch_text, original_text_lengths), ( batch_audio, original_audio_lengths), ( batch_images, original_img_lengths), (batch_target_indices, batch_source_paths, batch_target_paths, original_target_len) in zip( train_text_loader, train_audio_loader, train_image_loader, train_target_loader): loss = 0 max_dec_len = torch.max( original_target_len ) # TODO check error : max decoder timesteps for each batch # Transfer tensors to GPU batch_text = batch_text.to(device) log.info("Loaded batch text") batch_audio = batch_audio.to(device) log.info("Loaded batch audio") batch_images = batch_images.to(device) log.info("Loaded batch image") batch_target_indices = batch_target_indices.to(device) log.info("Loaded batch targets") # Setup for forward batch_size = batch_text.size(0) optimizer.zero_grad() log.info("Starting forward pass") # Forward batch_out_distributions, loss = model( batch_text, original_text_lengths, batch_audio, original_audio_lengths, batch_images, original_img_lengths, batch_target_indices, original_target_len, max_dec_len) loss_val = loss.item() # numerical value of loss loss_epoch = loss_epoch + loss_val log.info("Starting backward") # Backward loss.backward() nn.utils.clip_grad_norm_( model.parameters(), args.max_grad_norm) # To tackle exploding gradients optimizer.step() scheduler.step(step // batch_size) ema(model, step // batch_size) # Log info step += batch_size progress_bar.update(batch_size) progress_bar.set_postfix(epoch=epoch, NLL=loss_val) tbx.add_scalar('train/NL', loss_val, step) tbx.add_scalar('train/LR', optimizer.param_groups[0]['lr'], step) steps_till_eval -= batch_size if steps_till_eval <= 0: steps_till_eval = args.eval_steps # Evaluate and save checkpoint log.info(f'Evaluating at step {step}...') ema.assign(model) # TODO # scores, results = evaluate(model, dev_loader, device, # args.dev_eval_file, # args.max_ans_len, # args.use_squad_v2) saver.save(step, model, device) ema.resume(model) # Generate summary print('Generated summary for iteration {}: '.format(epoch)) summaries = get_generated_summaries(batch_out_distributions, original_text_lengths, batch_source_paths) print(summaries) # Evaluation # rouge = Rouge() # rouge_scores = rouge.get_scores(batch_source_paths, batch_target_paths, avg=True) # print('Rouge score at iteration {} is {}: '.format(epoch, rouge_scores)) # Generate Output Heatmaps # sns.set() # for idx in range(len(out_distributions)): # out_distributions[idx] = out_distributions[idx].squeeze(0).detach().numpy() # Converting each timestep distribution to numpy array # out_distributions = np.asarray(out_distributions) # Converting the timestep list to array # ax = sns.heatmap(out_distributions) # fig = ax.get_figure() # fig.savefig(out_heatmaps_dir + str(epoch) + '.png') print("Epoch loss is : {}".format(loss_epoch / count_item))
def main(args): # Set up logging and devices args.save_dir = util.get_save_dir(args.save_dir, args.name, training=True) log = util.get_logger(args.save_dir, args.name) tbx = SummaryWriter(args.save_dir) device, args.gpu_ids = util.get_available_devices() log.info('Args: {}'.format(dumps(vars(args), indent=4, sort_keys=True))) args.batch_size *= max(1, len(args.gpu_ids)) # Set random seed log.info('Using random seed {}...'.format(args.seed)) random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) torch.cuda.manual_seed_all(args.seed) # Get embeddings log.info('Loading embeddings...') word_vectors = util.torch_from_json(args.word_emb_file) char_vectors = util.torch_from_json(args.char_emb_file) # Get model log.info('Building model...') model = BiDAF(vectors=(word_vectors, char_vectors), hidden_size=args.hidden_size, drop_prob=args.drop_prob, p_sdd=args.p_sdd, char_limit=args.char_limit, use_transformer=args.use_transformer, inter_size=args.inter_size, heads=args.heads, c2w_size=args.c2w_size, enc_blocks=args.enc_blocks, enc_convs=args.enc_convs, mod_blocks=args.mod_blocks, mod_convs=args.mod_convs, use_GRU=args.use_GRU) model = nn.DataParallel(model, args.gpu_ids) if args.load_path: log.info('Loading checkpoint from {}...'.format(args.load_path)) model, step = util.load_model(model, args.load_path, args.gpu_ids) # uses the saved step num else: step = 0 model = model.to(device) model.train() ema = util.EMA(model, args.ema_decay) # Get saver saver = util.CheckpointSaver(args.save_dir, max_checkpoints=args.max_checkpoints, metric_name=args.metric_name, maximize_metric=args.maximize_metric, log=log) # Get optimizer and scheduler # optimizer = optim.Adadelta(model.parameters(), args.lr, # weight_decay=args.l2_wd) # The scheduler MULTIPLIES the base LR, NOT replaces optimizer = optim.Adam(model.parameters(), 1., betas=(.9, .98), eps=1e-9, weight_decay=args.l2_wd) scheduler = sched.LambdaLR( optimizer, lambda s: 0.001 * math.log(s + 1) / math.log(1000 - 1) if s < 1000 else 0.001) # Chute (must use math.log, else TypeError) # scheduler = sched.LambdaLR(optimizer, lambda s: (args.hidden_size**(-.5)) * # min((s+1e-9)**(-.5), s*(4000**(-1.5))) # ) # From Vaswani et. al 2017 # Get data loader log.info('Building dataset...') train_dataset = SQuAD(args.train_record_file, args.use_squad_v2) train_loader = data.DataLoader(train_dataset, batch_size=args.batch_size, shuffle=True, num_workers=args.num_workers, collate_fn=collate_fn) dev_dataset = SQuAD(args.dev_record_file, args.use_squad_v2) dev_loader = data.DataLoader(dev_dataset, batch_size=args.batch_size, shuffle=False, num_workers=args.num_workers, collate_fn=collate_fn) # Train log.info('Training...') steps_till_eval = args.eval_steps epoch = step // len(train_dataset) while epoch != args.num_epochs: epoch += 1 log.info('Starting epoch {}...'.format(epoch)) with torch.enable_grad(), \ tqdm(total=len(train_loader.dataset)) as progress_bar: for cw_idxs, cc_idxs, qw_idxs, qc_idxs, y1, y2, ids in train_loader: # Setup for forward optimizer.zero_grad() batch_size = cw_idxs.size(0) cc_idxs = cc_idxs.to(device) # (batch, c_limit, char_limit) qc_idxs = qc_idxs.to(device) cw_idxs = cw_idxs.to(device) # (batch, c_limit) qw_idxs = qw_idxs.to(device) c_idxs, q_idxs = (cw_idxs, cc_idxs), (qw_idxs, qc_idxs) # Forward log_p1, log_p2 = model(c_idxs, q_idxs) y1, y2 = y1.to(device), y2.to(device) loss = F.nll_loss(log_p1, y1) + F.nll_loss(log_p2, y2) loss_val = loss.item() # Backward loss.backward() nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm) optimizer.step() scheduler.step( step // batch_size ) # By default, schedules per epoch; pass in step # as "epoch" ema(model, step // batch_size) # Log info step += batch_size # Number of examples. Step is usually the number of (mini)-batches progress_bar.update(batch_size) progress_bar.set_postfix(epoch=epoch, NLL=loss_val) tbx.add_scalar('train/NLL', loss_val, step) tbx.add_scalar('train/LR', optimizer.param_groups[0]['lr'], step) steps_till_eval -= batch_size if steps_till_eval <= 0: steps_till_eval = args.eval_steps # Evaluate and save checkpoint log.info('Evaluating at step {}...'.format(step)) ema.assign(model) results, pred_dict = evaluate(model, dev_loader, device, args.dev_eval_file, args.max_ans_len, args.use_squad_v2) saver.save(step, model, results[args.metric_name], device) ema.resume(model) # Log to console results_str = ', '.join('{}: {:05.2f}'.format(k, v) for k, v in results.items()) log.info('Dev {}'.format(results_str)) # Log to TensorBoard log.info('Visualizing in TensorBoard...') for k, v in results.items(): tbx.add_scalar('dev/{}'.format(k), v, step) util.visualize(tbx, pred_dict=pred_dict, eval_path=args.dev_eval_file, step=step, split='dev', num_visuals=args.num_visuals)
def main(args): # Set up logging and devices args.save_dir = util.get_save_dir(args.save_dir, args.name, type="train") log = util.get_logger(args.save_dir, args.name) tbx = SummaryWriter(args.save_dir) device, args.gpu_ids = util.get_available_devices() log.info(f'Args: {dumps(vars(args), indent=4, sort_keys=True)}') args.batch_size *= max(1, len(args.gpu_ids)) # Set random seed log.info(f'Using random seed {args.seed}...') random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) torch.cuda.manual_seed_all(args.seed) # Get your model log.info('Building model...') model, step = get_model(log, args) model = model.to(device) model.train() #Exponential moving average ema = util.EMA(model, args.ema_decay) # Get saver saver = util.CheckpointSaver(args.save_dir, max_checkpoints=args.max_checkpoints, metric_name=args.metric_name, maximize_metric=args.maximize_metric, log=log) # Get optimizer and scheduler optimizer = optim.Adam(model.parameters(), lr=args.lr, betas=[0.8, 0.999], eps=1e-7, weight_decay=args.l2_wd) scheduler = sched.LambdaLR(optimizer, lambda step: 1) #get loss computer cri = FocalLoss(alpha=torch.tensor([args.alpha, 1]).to(device), gamma=args.gamma) # Get data loader log.info('Building dataset...') dev_dataset = util.load_dataset(args.dev_file, args.PPI_dir, args.PPI_gene_feature_dir, args.PPI_gene_query_dict_dir, args.max_nodes, train=False) dev_loader = data.DataLoader(dev_dataset, batch_size=args.batch_size, shuffle=False, num_workers=args.num_workers, collate_fn=util.collate_fn) train_dataset = util.load_dataset(args.train_file, args.PPI_dir, args.PPI_gene_feature_dir, args.PPI_gene_query_dict_dir, args.max_nodes) train_loader = data.DataLoader(train_dataset, batch_size=args.batch_size, shuffle=True, num_workers=args.num_workers, collate_fn=util.collate_fn) # Train log.info('Training...') steps_till_eval = args.eval_steps epoch = 0 while epoch != args.num_epochs: epoch += 1 log.info(f'Starting epoch {epoch}...') with torch.enable_grad(), \ tqdm(total=len(train_loader.dataset)) as progress_bar: for batch_a, batch_bio_a, batch_A, batch_b, batch_bio_b, batch_B, batch_y in train_loader: # Setup for forward batch_a = batch_a.to(device) batch_bio_a = batch_bio_a.to(device) batch_A = batch_A.to(device) batch_bio_b = batch_bio_b.to(device) batch_b = batch_b.to(device) batch_B = batch_B.to(device) batch_y = batch_y.to(device) batch_y = batch_y.long() batch_size = batch_bio_a.size(0) optimizer.zero_grad() # Forward output = model(batch_a, batch_bio_a, batch_A, batch_b, batch_bio_b, batch_B) loss = cri(output, batch_y) #loss = F.nll_loss(output, batch_y) loss_val = loss.item() # Backward loss.backward() nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm) optimizer.step() scheduler.step() ema(model, step // batch_size) # Log info step += batch_size progress_bar.update(batch_size) progress_bar.set_postfix(epoch=epoch, NLL=loss_val) tbx.add_scalar('train/Loss', loss_val, step) tbx.add_scalar('train/LR', optimizer.param_groups[0]['lr'], step) steps_till_eval -= batch_size if steps_till_eval <= 0: steps_till_eval = args.eval_steps # Evaluate and save checkpoint log.info(f'Evaluating at step {step}...') ema.assign(model) results = evaluate(model, dev_loader, cri, device) saver.save(step, model, results[args.metric_name], device) ema.resume(model) # Log to console results_str = ', '.join(f'{k}: {v:05.5f}' for k, v in results.items()) log.info(f'Dev {results_str}') log.info('Visualizing in TensorBoard...') for k, v in results.items(): tbx.add_scalar(f'dev/{k}', v, step)
def main(args): # Set up logging and devices args.save_dir = util.get_save_dir(args.save_dir, args.name, training=True) log = util.get_logger(args.save_dir, args.name) tbx = SummaryWriter(args.save_dir) device, args.gpu_ids = util.get_available_devices() log.info(f'Args: {dumps(vars(args), indent=4, sort_keys=True)}') args.batch_size *= max(1, len(args.gpu_ids)) # Set random seed log.info(f'Using random seed {args.seed}...') random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) torch.cuda.manual_seed_all(args.seed) # Get embeddings log.info('Loading embeddings...') word_vectors = util.torch_from_json(args.word_emb_file) char_vectors = util.torch_from_json(args.char_emb_file) # Get model log.info('Building model...') """ model = BiDAF(word_vectors=word_vectors, hidden_size=args.hidden_size, drop_prob=args.drop_prob) """ ''' model = charBiDAF(word_vectors=word_vectors, char_vectors=char_vectors, emb_size=char_vectors.size(1), hidden_size=args.hidden_size, drop_prob=args.drop_prob) ''' model = QANet(word_vectors=word_vectors, char_vectors=char_vectors, emb_size=char_vectors.size(1), hidden_size=args.hidden_size, drop_prob=args.drop_prob) print( "YOU ARE ENTERING THE QA NET MODEL TRAINING_____________________________________________________________________" ) model = nn.DataParallel(model, args.gpu_ids) if args.load_path: log.info(f'Loading checkpoint from {args.load_path}...') model, step = util.load_model(model, args.load_path, args.gpu_ids) else: step = 0 model = model.to(device) model.train() #print("Model status - ", cuda_or_cpu(model)) ema = util.EMA(model, args.ema_decay) # Get saver saver = util.CheckpointSaver(args.save_dir, max_checkpoints=args.max_checkpoints, metric_name=args.metric_name, maximize_metric=args.maximize_metric, log=log) # Get optimizer and scheduler optimizer = optim.Adadelta(model.parameters(), args.lr, weight_decay=args.l2_wd) scheduler = sched.LambdaLR(optimizer, lambda s: 1.) # Constant LR # Get data loader log.info('Building dataset...') train_dataset = SQuAD(args.train_record_file, args.use_squad_v2) #torch.utils.data._utils.MP_STATUS_CHECK_INTERVAL = 300 train_loader = data.DataLoader(train_dataset, batch_size=args.batch_size, shuffle=True, num_workers=args.num_workers, collate_fn=collate_fn) dev_dataset = SQuAD(args.dev_record_file, args.use_squad_v2) dev_loader = data.DataLoader(dev_dataset, batch_size=args.batch_size, shuffle=False, num_workers=0, collate_fn=collate_fn) # Train log.info('Training...') steps_till_eval = args.eval_steps epoch = step // len(train_dataset) while epoch != args.num_epochs: epoch += 1 log.info(f'Starting epoch {epoch}...') with torch.enable_grad(), \ tqdm(total=len(train_loader.dataset)) as progress_bar: for cw_idxs, cc_idxs, qw_idxs, qc_idxs, y1, y2, ids in train_loader: # Setup for forward cw_idxs = cw_idxs.to(device) cc_idxs = cc_idxs.to(device) qw_idxs = qw_idxs.to(device) qc_idxs = qc_idxs.to(device) batch_size = cw_idxs.size(0) optimizer.zero_grad() # Forward log_p1, log_p2 = model(cw_idxs, cc_idxs, qw_idxs, qc_idxs) y1, y2 = y1.to(device), y2.to(device) loss = F.nll_loss(log_p1, y1) + F.nll_loss(log_p2, y2) loss_val = loss.item() # Backward loss.backward() nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm) optimizer.step() #print(torch.cuda.memory_allocated(device=None)) scheduler.step(step // batch_size) ema(model, step // batch_size) #torch.cuda.empty_cache() # Log info step += batch_size progress_bar.update(batch_size) progress_bar.set_postfix(epoch=epoch, NLL=loss_val) tbx.add_scalar('train/NLL', loss_val, step) tbx.add_scalar('train/LR', optimizer.param_groups[0]['lr'], step) steps_till_eval -= batch_size if steps_till_eval <= 0: steps_till_eval = args.eval_steps # Evaluate and save checkpoint log.info(f'Evaluating at step {step}...') # print('Memory 1: ', torch.cuda.memory_allocated()) ema.assign(model) # print('Memory 2: ', torch.cuda.memory_allocated()) results, pred_dict = evaluate(model, dev_loader, device, args.dev_eval_file, args.max_ans_len, args.use_squad_v2) saver.save(step, model, results[args.metric_name], device) ema.resume(model) # Log to console results_str = ', '.join(f'{k}: {v:05.2f}' for k, v in results.items()) log.info(f'Dev {results_str}') # Log to TensorBoard log.info('Visualizing in TensorBoard...') for k, v in results.items(): tbx.add_scalar(f'dev/{k}', v, step) util.visualize(tbx, pred_dict=pred_dict, eval_path=args.dev_eval_file, step=step, split='dev', num_visuals=args.num_visuals)
def main(args): # Set up logging and devices args.save_dir = util.get_save_dir(args.save_dir, args.name, training=True) log = util.get_logger(args.save_dir, args.name) tbx = SummaryWriter(args.save_dir) device, args.gpu_ids = util.get_available_devices() log.info('Args: {}'.format(dumps(vars(args), indent=4, sort_keys=True))) args.batch_size *= max(1, len(args.gpu_ids)) # Set random seed log.info('Using random seed {}...'.format(args.seed)) random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) torch.cuda.manual_seed_all(args.seed) # Get embeddings log.info('Loading embeddings...') word_vectors = util.torch_from_json(args.word_emb_file) char_vectors = util.torch_from_json(args.char_emb_file) # NEW : load the tag embeddings pos_vectors = util.torch_from_json(args.pos_emb_file) ner_vectors = util.torch_from_json(args.ner_emb_file) # Add loss if 'loss' in args.name: distance_criterion = DistanceFromAnswerLoss(coefficient=.5, device=device, normalization=True, penalization_type='quadratic', reduction='mean') # Choose model log.info('Building model {}...'.format(args.name)) if 'baseline' in args.name: model = BiDAF(word_vectors=word_vectors, hidden_size=args.hidden_size, drop_prob=args.drop_prob) elif args.name == 'BiDAF_char': model = BiDAF_char(word_vectors=word_vectors, char_vectors=char_vectors, hidden_size=args.hidden_size, drop_prob=args.drop_prob) elif (args.name == 'BiDAF_tag') or (args.name == 'BiDAF_tag_loss'): model = BiDAF_tag(word_vectors=word_vectors, char_vectors=char_vectors, pos_vectors=pos_vectors, ner_vectors=ner_vectors, hidden_size=args.hidden_size, drop_prob=args.drop_prob) elif (args.name == 'BiDAF_tag_unfrozen') or (args.name == 'BiDAF_tag_unfrozen_loss'): model = BiDAF_tag(word_vectors=word_vectors, char_vectors=char_vectors, pos_vectors=pos_vectors, ner_vectors=ner_vectors, hidden_size=args.hidden_size, drop_prob=args.drop_prob, freeze_tag=False) elif args.name == 'BiDAF_tag_ext': model = BiDAF_tag_ext(word_vectors=word_vectors, char_vectors=char_vectors, pos_vectors=pos_vectors, ner_vectors=ner_vectors, hidden_size=args.hidden_size, drop_prob=args.drop_prob) elif args.name == 'BiDAF_tag_ext_unfrozen': model = BiDAF_tag_ext(word_vectors=word_vectors, char_vectors=char_vectors, pos_vectors=pos_vectors, ner_vectors=ner_vectors, hidden_size=args.hidden_size, drop_prob=args.drop_prob, freeze_tag=False) elif args.name == 'coattn': model = CoattentionModel(hidden_dim=args.hidden_size, embedding_matrix=word_vectors, train_word_embeddings=False, dropout=0.35, pooling_size=16, number_of_iters=4, number_of_layers=2, device=device) else: raise NameError('No model named ' + args.name) model = nn.DataParallel(model, args.gpu_ids) if args.load_path: log.info('Loading checkpoint from {}...'.format(args.load_path)) model, step = util.load_model(model, args.load_path, args.gpu_ids) else: step = 0 model = model.to(device) model.train() ema = util.EMA(model, args.ema_decay) # Get saver saver = util.CheckpointSaver(args.save_dir, max_checkpoints=args.max_checkpoints, metric_name=args.metric_name, maximize_metric=args.maximize_metric, log=log) # Get optimizer and scheduler optimizer = optim.Adadelta(model.parameters(), args.lr, weight_decay=args.l2_wd) scheduler = sched.LambdaLR(optimizer, lambda s: 1.) # Constant LR # Get data loader log.info('Building dataset...') train_dataset = SQuAD(args.train_record_file, args.use_squad_v2) train_loader = data.DataLoader(train_dataset, batch_size=args.batch_size, shuffle=True, num_workers=args.num_workers, collate_fn=collate_fn, drop_last=True) dev_dataset = SQuAD(args.dev_record_file, args.use_squad_v2) dev_loader = data.DataLoader(dev_dataset, batch_size=args.batch_size, shuffle=False, num_workers=args.num_workers, collate_fn=collate_fn, drop_last=True) # Train log.info('Training...') steps_till_eval = args.eval_steps epoch = step // len(train_dataset) while epoch != args.num_epochs: epoch += 1 log.info('Starting epoch {}...'.format(epoch)) with torch.enable_grad(), \ tqdm(total=len(train_loader.dataset)) as progress_bar: for cw_idxs, cc_idxs, cpos_idxs, cner_idxs, cw_ems, cw_tfs, qw_idxs, qc_idxs, qpos_idxs, qner_idxs, qw_ems, qw_tfs, y1, y2, ids in train_loader: # NEW # Setup for forward cw_idxs = cw_idxs.to(device) qw_idxs = qw_idxs.to(device) batch_size = cw_idxs.size(0) optimizer.zero_grad() # Forward if 'baseline' in args.name: log_p1, log_p2 = model(cw_idxs, qw_idxs) elif args.name == 'BiDAF_char': # Additional setup for forward cc_idxs = cc_idxs.to(device) qc_idxs = qc_idxs.to(device) log_p1, log_p2 = model(cw_idxs, qw_idxs, cc_idxs, qc_idxs) elif (args.name == 'BiDAF_tag') or (args.name == 'BiDAF_tag_unfrozen') or (args.name == 'BiDAF_tag_loss') or (args.name == 'BiDAF_tag_unfrozen_loss'): # Additional setup for forward cc_idxs = cc_idxs.to(device) cpos_idxs = cpos_idxs.to(device) cner_idxs = cner_idxs.to(device) qc_idxs = qc_idxs.to(device) qpos_idxs = qpos_idxs.to(device) qner_idxs = qner_idxs.to(device) log_p1, log_p2 = model(cw_idxs, qw_idxs, cc_idxs, qc_idxs, cpos_idxs, qpos_idxs, cner_idxs, qner_idxs) elif (args.name == 'BiDAF_tag_ext') or (args.name == 'BiDAF_tag_ext_unfrozen'): # Additional setup for forward cc_idxs = cc_idxs.to(device) cpos_idxs = cpos_idxs.to(device) cner_idxs = cner_idxs.to(device) cw_ems = cw_ems.to(device) cw_tfs = cw_tfs.to(device) qc_idxs = qc_idxs.to(device) qpos_idxs = qpos_idxs.to(device) qner_idxs = qner_idxs.to(device) qw_ems = qw_ems.to(device) qw_tfs = qw_tfs.to(device) log_p1, log_p2 = model(cw_idxs, qw_idxs, cc_idxs, qc_idxs, cpos_idxs, qpos_idxs, cner_idxs, qner_idxs, cw_ems, qw_ems, cw_tfs, qw_tfs) elif args.name == 'coattn': max_c_len = cw_idxs.size(1) max_q_len = qw_idxs.size(1) c_len = [] q_len = [] for i in range(cw_idxs.size(0)): if len((cw_idxs[i] == 0).nonzero()) != 0: c_len_i = (cw_idxs[i] == 0).nonzero()[0].item() else: c_len_i = cw_idxs.size(1) if len((qw_idxs[i] == 0).nonzero()) != 0: q_len_i = (qw_idxs[i] == 0).nonzero()[0].item() else: q_len_i = qw_idxs.size(1) c_len.append(int(c_len_i)) q_len.append(int(q_len_i)) c_len = torch.Tensor(c_len).int() q_len = torch.Tensor(q_len).int() num_examples = int(cw_idxs.size(0) / len(args.gpu_ids)) log_p1, log_p2 = model(max_c_len, max_q_len, cw_idxs, qw_idxs, c_len, q_len, num_examples, True, True) else: raise NameError('No model named ' + args.name) y1, y2 = y1.to(device), y2.to(device) loss = F.nll_loss(log_p1, y1) + F.nll_loss(log_p2, y2) # Add distance penalization if 'loss' in args.name: loss += distance_criterion(log_p1, y1) + distance_criterion(log_p2, y2) loss_val = loss.item() # Backward loss.backward() nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm) optimizer.step() scheduler.step(step // batch_size) ema(model, step // batch_size) # Log info step += batch_size progress_bar.update(batch_size) progress_bar.set_postfix(epoch=epoch, NLL=loss_val) tbx.add_scalar('train/NLL', loss_val, step) tbx.add_scalar('train/LR', optimizer.param_groups[0]['lr'], step) steps_till_eval -= batch_size if steps_till_eval <= 0: steps_till_eval = args.eval_steps # Evaluate and save checkpoint log.info('Evaluating at step {}...'.format(step)) ema.assign(model) results, pred_dict = evaluate(model, dev_loader, device, args.dev_eval_file, args.max_ans_len, args.use_squad_v2, args.name, args.gpu_ids) saver.save(step, model, results[args.metric_name], device) ema.resume(model) # Log to console results_str = ', '.join('{}: {:05.2f}'.format(k, v) for k, v in results.items()) log.info('Dev {}'.format(results_str)) # Log to TensorBoard log.info('Visualizing in TensorBoard...') for k, v in results.items(): tbx.add_scalar('dev/{}'.format(k), v, step) util.visualize(tbx, pred_dict=pred_dict, eval_path=args.dev_eval_file, step=step, split='dev', num_visuals=args.num_visuals)
def main(args): # Set up logging and devices args.save_dir = util.get_save_dir(args.save_dir, args.name, training=True) log = util.get_logger(args.save_dir, args.name) tbx = SummaryWriter(args.save_dir) device, args.gpu_ids = util.get_available_devices() log.info(f'Args: {dumps(vars(args), indent=4, sort_keys=True)}') args.batch_size *= max(1, len(args.gpu_ids)) # Set random seed log.info(f'Using random seed {args.seed}...') random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) torch.cuda.manual_seed_all(args.seed) # Get embeddings log.info('Loading embeddings...') word_vectors = util.torch_from_json(args.word_emb_file) char_vectors = util.torch_from_json(args.char_emb_file) # Get model log.info('Building model...') model, step = get_model(word_vectors, char_vectors, log, args) model = model.to(device) model.train() ema = util.EMA(model, args.ema_decay) # Get saver saver = util.CheckpointSaver(args.save_dir, max_checkpoints=args.max_checkpoints, metric_name=args.metric_name, maximize_metric=args.maximize_metric, log=log) # Get optimizer and scheduler #optimizer = get_opt(model,args) #optimizer = optim.Adam(model.parameters(),lr=1,betas=[0.8,0.999],eps=1e-7,weight_decay=args.l2_wd) #warmup=args.warmup # cooldown=args.hidden_size*100 #scheduler=sched.LambdaLR(optimizer,lambda step: min(0.001,0.001*(step/warmup)**0.5) if step<=cooldown else max(1e-4 ,1e-3*(1-min(1,(step-cooldown)/cooldown))**0.5)) #scheduler=sched.LambdaLR(optimizer,lambda step: min(0.001,0.001*(step/warmup)**0.5)) ''' cr = 1.0 / math.log(warmup) scheduler = sched.LambdaLR( optimizer, lr_lambda=lambda ee: cr * math.log(ee + 1) if ee < warmup else 1) ''' optimizer = optim.Adadelta(model.parameters(), args.lr, weight_decay=args.l2_wd) scheduler = sched.LambdaLR(optimizer, lambda step: 1) # Get data loader log.info('Building dataset...') if (args.model_name == "KnowGQA"): train_dataset = SQUADwithGraph(args.train_record_file_graph, args.use_squad_v2) dev_dataset = SQUADwithGraph(args.dev_record_file_graph, args.use_squad_v2) train_loader = data.DataLoader(train_dataset, batch_size=args.batch_size, shuffle=True, num_workers=args.num_workers, collate_fn=collate_fn_graph) dev_loader = data.DataLoader(dev_dataset, batch_size=4, shuffle=False, num_workers=args.num_workers, collate_fn=collate_fn_graph) else: train_dataset = SQuAD(args.train_record_file, args.use_squad_v2) dev_dataset = SQuAD(args.dev_record_file, args.use_squad_v2) train_loader = data.DataLoader(train_dataset, batch_size=args.batch_size, shuffle=True, num_workers=args.num_workers, collate_fn=collate_fn) dev_loader = data.DataLoader(dev_dataset, batch_size=args.batch_size, shuffle=False, num_workers=args.num_workers, collate_fn=collate_fn) # Train log.info('Training...') steps_till_eval = args.eval_steps epoch = step // len(train_dataset) while epoch != args.num_epochs: epoch += 1 log.info(f'Starting epoch {epoch}...') with torch.enable_grad(), \ tqdm(total=len(train_loader.dataset)) as progress_bar: if (args.model_name == "KnowGQA"): for cw_idxs, cc_idxs, qw_idxs, qc_idxs, co_idxs, adjs, y1, y2, ids in train_loader: # Setup for forward cw_idxs = cw_idxs.to(device) cc_idxs = cc_idxs.to(device) qw_idxs = qw_idxs.to(device) qc_idxs = qc_idxs.to(device) co_idxs = co_idxs.to(device) adjs = adjs.to(device) batch_size = cw_idxs.size(0) optimizer.zero_grad() # Forward log_p1, log_p2 = model(cw_idxs, cc_idxs, qw_idxs, qc_idxs, co_idxs, adjs) y1, y2 = y1.to(device), y2.to(device) loss = F.nll_loss(log_p1, y1) + F.nll_loss(log_p2, y2) loss_val = loss.item() # Backward loss.backward() nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm) optimizer.step() scheduler.step() ema(model, step // batch_size) # Log info step += batch_size progress_bar.update(batch_size) progress_bar.set_postfix(epoch=epoch, NLL=loss_val) tbx.add_scalar('train/NLL', loss_val, step) tbx.add_scalar('train/LR', optimizer.param_groups[0]['lr'], step) steps_till_eval -= batch_size if steps_till_eval <= 0: steps_till_eval = args.eval_steps # Evaluate and save checkpoint log.info(f'Evaluating at step {step}...') ema.assign(model) results, pred_dict = evaluate(model, "KnowGQA", dev_loader, device, args.dev_eval_file, args.max_ans_len, args.use_squad_v2) saver.save(step, model, results[args.metric_name], device) ema.resume(model) # Log to console results_str = ', '.join(f'{k}: {v:05.2f}' for k, v in results.items()) log.info(f'Dev {results_str}') # Log to TensorBoard log.info('Visualizing in TensorBoard...') for k, v in results.items(): tbx.add_scalar(f'dev/{k}', v, step) util.visualize(tbx, pred_dict=pred_dict, eval_path=args.dev_eval_file, step=step, split='dev', num_visuals=args.num_visuals) elif args.model_name == "BiDAF_nochar": for cw_idxs, cc_idxs, qw_idxs, qc_idxs, y1, y2, ids in train_loader: # Setup for forward cw_idxs = cw_idxs.to(device) qw_idxs = qw_idxs.to(device) batch_size = cw_idxs.size(0) optimizer.zero_grad() # Forward log_p1, log_p2 = model(cw_idxs, qw_idxs) y1, y2 = y1.to(device), y2.to(device) loss = F.nll_loss(log_p1, y1) + F.nll_loss(log_p2, y2) loss_val = loss.item() # Backward loss.backward() nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm) optimizer.step() scheduler.step() ema(model, step // batch_size) # Log info step += batch_size progress_bar.update(batch_size) progress_bar.set_postfix(epoch=epoch, NLL=loss_val) tbx.add_scalar('train/NLL', loss_val, step) tbx.add_scalar('train/LR', optimizer.param_groups[0]['lr'], step) steps_till_eval -= batch_size if steps_till_eval <= 0: steps_till_eval = args.eval_steps # Evaluate and save checkpoint log.info(f'Evaluating at step {step}...') ema.assign(model) results, pred_dict = evaluate(model, "BiDAF_nochar", dev_loader, device, args.dev_eval_file, args.max_ans_len, args.use_squad_v2) saver.save(step, model, results[args.metric_name], device) ema.resume(model) # Log to console results_str = ', '.join(f'{k}: {v:05.2f}' for k, v in results.items()) log.info(f'Dev {results_str}') # Log to TensorBoard log.info('Visualizing in TensorBoard...') for k, v in results.items(): tbx.add_scalar(f'dev/{k}', v, step) util.visualize(tbx, pred_dict=pred_dict, eval_path=args.dev_eval_file, step=step, split='dev', num_visuals=args.num_visuals) else: for cw_idxs, cc_idxs, qw_idxs, qc_idxs, y1, y2, ids in train_loader: # Setup for forward cw_idxs = cw_idxs.to(device) cc_idxs = cc_idxs.to(device) qw_idxs = qw_idxs.to(device) qc_idxs = qc_idxs.to(device) batch_size = cw_idxs.size(0) optimizer.zero_grad() # Forward log_p1, log_p2 = model(cw_idxs, cc_idxs, qw_idxs, qc_idxs) y1, y2 = y1.to(device), y2.to(device) loss = F.nll_loss(log_p1, y1) + F.nll_loss(log_p2, y2) loss_val = loss.item() # Backward loss.backward() nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm) optimizer.step() scheduler.step() ema(model, step // batch_size) # Log info step += batch_size progress_bar.update(batch_size) progress_bar.set_postfix(epoch=epoch, NLL=loss_val) tbx.add_scalar('train/NLL', loss_val, step) tbx.add_scalar('train/LR', optimizer.param_groups[0]['lr'], step) steps_till_eval -= batch_size if steps_till_eval <= 0: steps_till_eval = args.eval_steps # Evaluate and save checkpoint log.info(f'Evaluating at step {step}...') ema.assign(model) results, pred_dict = evaluate(model, "BiDAF", dev_loader, device, args.dev_eval_file, args.max_ans_len, args.use_squad_v2) saver.save(step, model, results[args.metric_name], device) ema.resume(model) # Log to console results_str = ', '.join(f'{k}: {v:05.2f}' for k, v in results.items()) log.info(f'Dev {results_str}') # Log to TensorBoard log.info('Visualizing in TensorBoard...') for k, v in results.items(): tbx.add_scalar(f'dev/{k}', v, step) util.visualize(tbx, pred_dict=pred_dict, eval_path=args.dev_eval_file, step=step, split='dev', num_visuals=args.num_visuals)
def main(args): # =========================================================== torch.cuda.empty_cache() # =========================================================== # Set up logging and devices args.save_dir = util.get_save_dir(args.save_dir, args.name, training=True) log = util.get_logger(args.save_dir, args.name) tbx = SummaryWriter(args.save_dir) device, args.gpu_ids = util.get_available_devices() log.info(f'Args: {dumps(vars(args), indent=4, sort_keys=True)}') args.batch_size *= max(1, len(args.gpu_ids)) # Set random seed log.info(f'Using random seed {args.seed}...') random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) torch.cuda.manual_seed_all(args.seed) # Get embeddings log.info('Loading embeddings...') word_vectors = util.torch_from_json(args.word_emb_file) # Get model log.info('Building model...') # ============================================================================== # char_vectors = util.torch_from_json(args.char_emb_file) # char_vocab_size, _ = tuple(char_vectors.size()) char_vocab_size = 1376 # 节省内存 model = QANet( word_vectors=word_vectors, char_vocab_size=char_vocab_size, char_dim=args.char_dim, d_model=args.d_model, drop_prob=args.drop_prob, num_heads=args.num_heads, num_mod_blocks=args. num_mod_blocks, # 节省内存============================= maximum_context_length=args.maximum_context_length) # ============================================================================== model = nn.DataParallel(model, args.gpu_ids) if args.load_path: log.info(f'Loading checkpoint from {args.load_path}...') model, step = util.load_model(model, args.load_path, args.gpu_ids) else: step = 0 model = model.to(device) model.train() ema = util.EMA(model, args.ema_decay) # Get saver saver = util.CheckpointSaver(args.save_dir, max_checkpoints=args.max_checkpoints, metric_name=args.metric_name, maximize_metric=args.maximize_metric, log=log) # Get optimizer and scheduler # optimizer = optim.Adadelta(model.parameters(), # args.lr, # eps=1e-6, # weight_decay=args.l2_wd) # scheduler = sched.LambdaLR(optimizer, lambda s: 1.) # Constant LR # QANet-02================================================================================================================================ # optimizer = optim.Adam(lr=1, # betas=(args.beta1, args.beta2), # eps=args.adam_eps, # weight_decay=args.l2_wd, # params=model.parameters()) # cr = args.lr / math.log2(args.warm_up) # scheduler = optim.lr_scheduler.LambdaLR( # optimizer, # lr_lambda=lambda ee: cr * math.log2(ee + 1) # if ee < args.warm_up else args.lr) # ======================================================================================================================================== # QANet-03-05============================================================================================================================= # optimizer = optim.Adam(lr=1, # betas=(args.beta1, args.beta2), # eps=args.adam_eps, # weight_decay=args.l2_wd, # params=model.parameters()) # cr = args.lr / math.log2(args.warm_up) # scheduler = optim.lr_scheduler.LambdaLR( # optimizer, # lr_lambda=lambda ee: cr * math.log2(ee + 1) # if ee < args.warm_up else args.lr) # ======================================================================================================================================== # QANet-06 =============================================================================================================================== # Get optimizer and scheduler optimizer = optim.Adadelta(model.parameters(), args.lr, eps=1e-6, weight_decay=args.l2_wd) scheduler = sched.LambdaLR(optimizer, lambda s: 1.) # Constant LR # ======================================================================================================================================== # Get data loader log.info('Building dataset...') train_dataset = SQuAD(args.train_record_file, args.use_squad_v2) train_loader = data.DataLoader(train_dataset, batch_size=args.batch_size, shuffle=True, num_workers=args.num_workers, collate_fn=collate_fn) dev_dataset = SQuAD(args.dev_record_file, args.use_squad_v2) dev_loader = data.DataLoader(dev_dataset, batch_size=args.batch_size, shuffle=False, num_workers=args.num_workers, collate_fn=collate_fn) # Train log.info('Training...') steps_till_eval = args.eval_steps epoch = step // len(train_dataset) while epoch != args.num_epochs: epoch += 1 log.info(f'Starting epoch {epoch}...') with torch.enable_grad(), \ tqdm(total=len(train_loader.dataset)) as progress_bar: for cw_idxs, cc_idxs, qw_idxs, qc_idxs, y1, y2, ids in train_loader: # 08/09 可见作者已经贴心地预留了cc_idxs... # Setup for forward cw_idxs = cw_idxs.to(device) cc_idxs = cc_idxs.to(device) qw_idxs = qw_idxs.to(device) qc_idxs = qc_idxs.to(device) batch_size = cw_idxs.size(0) optimizer.zero_grad() # Forward # 8/12 当seq_len > max_context_length = 400 时raise ValueError 并跳过 try: log_p1, log_p2 = model( cw_idxs, cc_idxs, qw_idxs, qc_idxs) # 08/09 增加cc_idxs 和 qc_idxs except ValueError: continue y1, y2 = y1.to(device), y2.to(device) loss = F.nll_loss(log_p1, y1) + F.nll_loss(log_p2, y2) loss_val = loss.item() # Backward loss.backward() nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm) optimizer.step() scheduler.step(step // batch_size) ema(model, step // batch_size) # Log info step += batch_size progress_bar.update(batch_size) progress_bar.set_postfix(epoch=epoch, NLL=loss_val) tbx.add_scalar('train/NLL', loss_val, step) tbx.add_scalar('train/LR', optimizer.param_groups[0]['lr'], step) steps_till_eval -= batch_size if steps_till_eval <= 0: steps_till_eval = args.eval_steps # Evaluate and save checkpoint log.info(f'Evaluating at step {step}...') ema.assign(model) results, pred_dict = evaluate(model, dev_loader, device, args.dev_eval_file, args.max_ans_len, args.use_squad_v2) saver.save(step, model, results[args.metric_name], device) ema.resume(model) # Log to console results_str = ', '.join(f'{k}: {v:05.2f}' for k, v in results.items()) log.info(f'Dev {results_str}') # Log to TensorBoard log.info('Visualizing in TensorBoard...') for k, v in results.items(): tbx.add_scalar(f'dev/{k}', v, step) util.visualize(tbx, pred_dict=pred_dict, eval_path=args.dev_eval_file, step=step, split='dev', num_visuals=args.num_visuals)
def main(args): # Set up logging and devices startime = datetime.now() args.save_dir = util.get_save_dir(args.save_dir, args.name, training=True) log = util.get_logger(args.save_dir, args.name) time_log = args.log_time if time_log > 0: log.info(f'Start training at: {startime.strftime("%H:%M:%S")}') tbx = SummaryWriter(args.save_dir) device, args.gpu_ids = util.get_available_devices() log.info(f'Args: {dumps(vars(args), indent=4, sort_keys=True)}') args.batch_size *= max(1, len(args.gpu_ids)) model_type = args.model # Set random seed log.info(f'Using random seed {args.seed}...') random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) torch.cuda.manual_seed_all(args.seed) # check this #useCharEmbeddings = args.model == 'BiDAFplus' # Get embeddings log.info('Loading embeddings...') print(f'{args.word_emb_file}') word_vectors = util.torch_from_json(args.word_emb_file) char_vectors = util.torch_from_json(args.char_emb_file) if time_log > 0: log.info(f'Loaded embeddings: {(datetime.now()-startime).seconds}') # load_char_vectors # Get model log.info('Building model...') if model_type == 'BiDAFplus': # model = BiDAFplus(word_vectors=word_vectors, char_vectors=char_vectors, hidden_size=args.hidden_size, params=get_params(model_type, args.params)) elif model_type == 'BiDAFbase': model = BiDAFbase(word_vectors=word_vectors, hidden_size=args.hidden_size, drop_prob=args.drop_prob) elif model_type == "Transformer": model = TransformerModel(word_vectors=word_vectors, char_vectors=char_vectors, params=get_params(model_type, args.params)) elif model_type == 'BiDAF': model = BiDAF(word_vectors=word_vectors, char_vectors=char_vectors, hidden_size=args.hidden_size, params=get_params(model_type, args.params)) model = nn.DataParallel(model, args.gpu_ids) if time_log > 0: log.info(f'Built model: {(datetime.now()-startime).seconds}') if args.load_path: log.info(f'Loading checkpoint from {args.load_path}...') model, step = util.load_model(model, args.load_path, args.gpu_ids) else: step = 0 model = model.to(device) model.train() ema = util.EMA(model, args.ema_decay) # Get saver saver = util.CheckpointSaver(args.save_dir, max_checkpoints=args.max_checkpoints, metric_name=args.metric_name, maximize_metric=args.maximize_metric, log=log) # Get optimizer and scheduler optimizer = optim.Adadelta(model.parameters(), args.lr, weight_decay=args.l2_wd) scheduler = sched.LambdaLR(optimizer, lambda s: 1.) # Constant LR # Get data loader log.info('Building dataset...') if args.mode != 'quick_eval': train_dataset = SQuAD(args.train_record_file, args.use_squad_v2) train_loader = data.DataLoader(train_dataset, batch_size=args.batch_size, shuffle=True, num_workers=args.num_workers, collate_fn=collate_fn) dev_dataset = SQuAD(args.dev_record_file, args.use_squad_v2) dev_loader = data.DataLoader(dev_dataset, batch_size=args.batch_size, shuffle=False, num_workers=args.num_workers, collate_fn=collate_fn) else: loaded_data = quick_eval_data_loader() train_loader = [loaded_data for _ in range(5)] dev_loader = [quick_eval_data_loader(dev=True)] train_dataset = train_loader dev_dataset = dev_loader log.info('Built dataset: {}:{}'.format(*divmod((datetime.now() - startime).seconds, 60))) # Train log.info('Training...') steps_till_eval = args.eval_steps epoch = step // len(train_dataset) if time_log > 0: traintime = datetime.now() total_iterations = 0 while epoch != args.num_epochs: epoch += 1 log.info(f'Starting epoch {epoch}...') if time_log > 0: epochtime = datetime.now() if args.mode != 'quick_eval': progress_len = len(train_loader.dataset) else: progress_len = len(train_loader) with torch.enable_grad(), \ tqdm(total=progress_len) as progress_bar: for cw_idxs, cc_idxs, qw_idxs, qc_idxs, y1, y2, ids in train_loader: #quick_eval_data_saver(cw_idxs, cc_idxs, qw_idxs, qc_idxs, y1, y2, ids) ######### if time_log > 0: itertime = datetime.now() # Setup for forward cw_idxs = cw_idxs.to(device) qw_idxs = qw_idxs.to(device) batch_size = cw_idxs.size(0) optimizer.zero_grad() if model_type == 'BiDAF' or model_type == "Transformer": cc_idxs = cc_idxs.to(device) qc_idxs = qc_idxs.to(device) log_p1, log_p2 = model(cc_idxs, qc_idxs, cw_idxs, qw_idxs) # Forward elif model_type == 'BiDAFbase': log_p1, log_p2 = model(cw_idxs, qw_idxs) y1, y2 = y1.to(device), y2.to(device) loss = F.nll_loss(log_p1, y1) + F.nll_loss(log_p2, y2) loss_val = loss.item() if time_log > 2: forwardtime = datetime.now() log.info('Forward time {}:{}'.format( *divmod((forwardtime - itertime).seconds, 60))) # Backward loss.backward() nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm) optimizer.step() scheduler.step(step // batch_size) ema(model, step // batch_size) if time_log > 2: backwardtime = datetime.now() log.info('Backward time {}:{}'.format( *divmod((backwardtime - forwardtime).seconds, 60))) # Log info step += batch_size progress_bar.update(batch_size) progress_bar.set_postfix(epoch=epoch, NLL=loss_val) tbx.add_scalar('train/NLL', loss_val, step) tbx.add_scalar('train/LR', optimizer.param_groups[0]['lr'], step) if time_log > 0: enditertime = datetime.now() #log.info('Iteration {} {}:{}'.format(total_iterations, # *divmod((enditertime-itertime).seconds, 60))) steps_till_eval -= batch_size if steps_till_eval <= 0 or args.mode == 'quick_eval': steps_till_eval = args.eval_steps # Evaluate and save checkpoint log.info(f'Evaluating at step {step}...') ema.assign(model) results, pred_dict = evaluate( model, dev_loader, device, args.dev_eval_file, args.max_ans_len, args.use_squad_v2, model_type, quick_eval=args.mode == 'quick_eval') saver.save(step, model, results[args.metric_name], device) ema.resume(model) # Log to console if time_log > 1: log.info('Eval time {}:{}'.format( *divmod((datetime.now() - enditertime).seconds, 60))) results_str = ', '.join(f'{k}: {v:05.2f}' for k, v in results.items()) log.info(f'Dev {results_str}') # Log to TensorBoard log.info('Visualizing in TensorBoard...') for k, v in results.items(): tbx.add_scalar(f'dev/{k}', v, step) util.visualize(tbx, pred_dict=pred_dict, eval_path=args.dev_eval_file, step=step, split='dev', num_visuals=args.num_visuals) total_iterations += 1 if ((time_log == 2) and (total_iterations % 10 == 0)) or ( (time_log == 1) and (total_iterations % 100 == 0)): log.info('Mean iteration time {}:{}'.format( *divmod((enditertime - traintime).seconds / total_iterations, 60))) if time_log > 0: endepochtime = datetime.now() log.info('Epoch time {}:{}'.format( *divmod((endepochtime - epochtime).seconds, 60)))
def main(args): # Set up logging and devices args.save_dir = util.get_save_dir(args.save_dir, args.name, training=True) log = util.get_logger(args.save_dir, args.name) tbx = SummaryWriter(args.save_dir) device, args.gpu_ids = util.get_available_devices() log.info(f'Args: {dumps(vars(args), indent=4, sort_keys=True)}') args.batch_size *= max(1, len(args.gpu_ids)) # Set random seed log.info(f'Using random seed {args.seed}...') random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) torch.cuda.manual_seed_all(args.seed) # Get embeddings log.info('Loading embeddings...') word_vectors = util.torch_from_json(args.word_emb_file) char_vectors = util.torch_from_json(args.char_emb_file) # Get model log.info('Building model...') if args.model_name == 'sketchy': model = SketchyReader(word_vectors=word_vectors, char_vectors=char_vectors, hidden_size=args.hidden_size, char_embed_drop_prob=args.char_embed_drop_prob, num_heads=args.num_heads, drop_prob=args.drop_prob) # SKETCHY elif args.model_name == 'intensive': model = IntensiveReader(word_vectors=word_vectors, char_vectors=char_vectors, num_heads=args.num_heads, char_embed_drop_prob=args.char_embed_drop_prob, hidden_size=args.hidden_size, drop_prob=args.drop_prob) # INTENSIVE elif args.model_name == 'retro': model = RetroQANet(word_vectors=word_vectors, char_vectors=char_vectors, hidden_size=args.hidden_size, num_heads=args.num_heads, char_embed_drop_prob=args.char_embed_drop_prob, intensive_path=args.load_path_i, sketchy_path=args.load_path_s, gpu_ids=args.gpu_ids, drop_prob=args.drop_prob) # Outer model = nn.DataParallel(model, args.gpu_ids) if args.load_path: log.info(f'Loading checkpoint from {args.load_path}...') model, step = util.load_model(model, args.load_path, args.gpu_ids) else: step = 0 model = model.to(device) model.train() ema = util.EMA(model, args.ema_decay) # setup losses bceLoss = nn.BCELoss() # Get saver saver = util.CheckpointSaver(args.save_dir, max_checkpoints=args.max_checkpoints, metric_name=args.metric_name, maximize_metric=args.maximize_metric, log=log) # Get optimizer and scheduler optimizer = optim.Adadelta(model.parameters(), args.lr, weight_decay=args.l2_wd) if args.optim == "adam": optimizer = optim.Adam( model.parameters(), 0.001, betas=(0.8, 0.999), eps=1e-7, weight_decay=3e-7) scheduler = sched.LambdaLR(optimizer, lambda s: 1.) # Constant LR # Get data loader log.info('Building dataset...') train_dataset = SQuAD(args.train_record_file, args.use_squad_v2) train_loader = data.DataLoader(train_dataset, batch_size=args.batch_size, shuffle=True, num_workers=args.num_workers, collate_fn=collate_fn) dev_dataset = SQuAD(args.dev_record_file, args.use_squad_v2) dev_loader = data.DataLoader(dev_dataset, batch_size=args.batch_size, shuffle=False, num_workers=args.num_workers, collate_fn=collate_fn) # Train log.info('Training...') steps_till_eval = args.eval_steps epoch = step // len(train_dataset) while epoch != args.num_epochs: counter = 0 epoch += 1 log.info(f'Starting epoch {epoch}...') with torch.enable_grad(), \ tqdm(total=len(train_loader.dataset)) as progress_bar: for cw_idxs, cc_idxs, qw_idxs, qc_idxs, y1, y2, ids in train_loader: counter += 1 # Setup for forward cw_idxs = cw_idxs.to(device) qw_idxs = qw_idxs.to(device) cc_idxs = cc_idxs.to(device) qc_idxs = qc_idxs.to(device) batch_size = cw_idxs.size(0) optimizer.zero_grad() # Forward y1, y2 = y1.to(device), y2.to(device) if args.model_name == 'sketchy': yi = model(cw_idxs, qw_idxs, cc_idxs, qc_idxs) loss = bceLoss(yi, torch.where( y1 == 0, 0, 1).type(torch.FloatTensor)) elif args.model_name == 'intensive': yi, log_p1, log_p2 = model( cw_idxs, qw_idxs, cc_idxs, qc_idxs) # if counter % 100 == 0: #print(torch.max(log_p1.exp(), dim=1)[0]) # $print(torch.max(log_p2.exp(), dim=1)[0]) #weights = torch.ones(log_p1.shape[1]) #weights[0] = 2/(log_p1.shape[1]) #nll_loss = nn.NLLLoss(weight=weights.to(device='cuda:0')) # gt_0 = torch.zeros(yi.shape[0]).to(device) # gt_1 = torch.ones(yi.shape[0]).to(device) loss = args.alpha_1 * bceLoss(yi, torch.where(y1 == 0, 0, 1).type( torch.FloatTensor)) + args.alpha_2 * (F.nll_loss(log_p1, y1) + F.nll_loss(log_p2, y2)) #loss = F.nll_loss(log_p1, y1) + F.nll_loss(log_p2, y2) elif args.model_name == 'retro': log_p1, log_p2 = model(cw_idxs, qw_idxs, cc_idxs, qc_idxs) loss = F.nll_loss(log_p1, y1) + F.nll_loss(log_p2, y2) else: raise ValueError( 'invalid --model_name, sketchy or intensive required') loss_val = loss.item() # Backward loss.backward() nn.utils.clip_grad_norm_( model.parameters(), args.max_grad_norm) optimizer.step() scheduler.step(step // batch_size) ema(model, step // batch_size) # Log info step += batch_size progress_bar.update(batch_size) progress_bar.set_postfix(epoch=epoch, NLL=loss_val) tbx.add_scalar('train/' + args.model_name, loss_val, step) tbx.add_scalar('train/LR', optimizer.param_groups[0]['lr'], step) steps_till_eval -= batch_size if steps_till_eval <= 0: steps_till_eval = args.eval_steps # Evaluate and save checkpoint log.info(f'Evaluating at step {step}...') ema.assign(model) results, pred_dict = evaluate(model, dev_loader, device, args.dev_eval_file, args.max_ans_len, args.use_squad_v2, model_name=args.model_name, a1=args.alpha_1, a2=args.alpha_2) saver.save( step, model, results[args.metric_name], device, model_name=args.model_name) ema.resume(model) # Log to console results_str = ', '.join( f'{k}: {v:05.2f}' for k, v in results.items()) log.info(f'Dev {results_str}') # Log to TensorBoard log.info('Visualizing in TensorBoard...') for k, v in results.items(): tbx.add_scalar(f'dev/{k}', v, step) util.visualize(tbx, pred_dict=pred_dict, eval_path=args.dev_eval_file, step=step, split='dev', num_visuals=args.num_visuals)
def main(args): # Set up logging and devices args.save_dir = util.get_save_dir(args.save_dir, args.name, training=True) log = util.get_logger(args.save_dir, args.name) tbx = SummaryWriter(args.save_dir) device, args.gpu_ids = util.get_available_devices() log.info(f'Args: {dumps(vars(args), indent=4, sort_keys=True)}') args.batch_size *= max(1, len(args.gpu_ids)) # Set random seed log.info(f'Using random seed {args.seed}...') random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) torch.cuda.manual_seed_all(args.seed) # Get embeddings log.info('Loading embeddings...') word_vectors = util.torch_from_json(args.word_emb_file) # Get size of char vocab with open(args.char2idx_file, 'r') as fh: char_vocab_size = len(json_load(fh)) # Get model log.info('Building model...') model = QANet(word_vectors=word_vectors, hidden_size=args.hidden_size, char_vocab_size = char_vocab_size, char_emb_size = args.char_emb_size, word_char_emb_size = args.word_char_emb_size, drop_prob=args.drop_prob, num_blocks_embd = args.num_blocks_embd, num_conv_embd = args.num_conv_embd, kernel_size = args.kernel_size, num_heads = args.num_heads, num_blocks_model = args.num_blocks_model, num_conv_model = args.num_conv_model, dropout_char = args.dropout_char, dropout_word = args.dropout_word, survival_prob = args.survival_prob) model = nn.DataParallel(model, args.gpu_ids) if args.load_path: log.info(f'Loading checkpoint from {args.load_path}...') model, step = util.load_model(model, args.load_path, args.gpu_ids) else: step = 0 model = model.to(device) model.train() ema = util.EMA(model, args.ema_decay) # Get saver saver = util.CheckpointSaver(args.save_dir, max_checkpoints=args.max_checkpoints, metric_name=args.metric_name, maximize_metric=args.maximize_metric, log=log) # Get optimizer and scheduler #params = filter(lambda param: param.requires_grad, model.parameters()) optimizer = optim.Adam(lr=1, betas=(args.beta1, args.beta2), eps=args.adam_eps, weight_decay=args.l2_wd, params=model.parameters()) cr = args.lr / math.log2(args.warm_up) scheduler = optim.lr_scheduler.LambdaLR(optimizer, lr_lambda=lambda ee: cr * math.log2(ee + 1) if ee < args.warm_up else args.lr) # Get data loader log.info('Building dataset...') train_dataset = SQuAD(args.train_record_file, args.use_squad_v2) train_loader = data.DataLoader(train_dataset, batch_size=args.batch_size, shuffle=True, num_workers=args.num_workers, collate_fn=collate_fn) dev_dataset = SQuAD(args.dev_record_file, args.use_squad_v2) dev_loader = data.DataLoader(dev_dataset, batch_size=args.batch_size, shuffle=False, num_workers=args.num_workers, collate_fn=collate_fn) # Train log.info('Training...') steps_till_eval = args.eval_steps epoch = step // len(train_dataset) torch.autograd.set_detect_anomaly(True) while epoch != args.num_epochs: epoch += 1 log.info(f'Starting epoch {epoch}...') with torch.enable_grad(), \ tqdm(total=len(train_loader.dataset)) as progress_bar: for cw_idxs, cc_idxs, qw_idxs, qc_idxs, y1, y2, ids in train_loader: # Setup for forward cw_idxs = cw_idxs.to(device) qw_idxs = qw_idxs.to(device) cc_idxs=cc_idxs.to(device) qc_idxs=qc_idxs.to(device) batch_size = cw_idxs.size(0) optimizer.zero_grad() # Forward log_p1, log_p2 = model(cw_idxs, qw_idxs, cc_idxs, qc_idxs) y1, y2 = y1.to(device), y2.to(device) loss = F.nll_loss(log_p1, y1) + F.nll_loss(log_p2, y2) loss_val = loss.item() # Backward loss.backward() nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm) optimizer.step() scheduler.step(step // batch_size) ema(model, step // batch_size) # Log info step += batch_size progress_bar.update(batch_size) progress_bar.set_postfix(epoch=epoch, NLL=loss_val) tbx.add_scalar('train/NLL', loss_val, step) tbx.add_scalar('train/LR', optimizer.param_groups[0]['lr'], step) steps_till_eval -= batch_size if steps_till_eval <= 0: steps_till_eval = args.eval_steps # Evaluate and save checkpoint log.info(f'Evaluating at step {step}...') ema.assign(model) results, pred_dict = evaluate(model, dev_loader, device, args.dev_eval_file, args.max_ans_len, args.use_squad_v2) saver.save(step, model, results[args.metric_name], device) ema.resume(model) # Log to console results_str = ', '.join(f'{k}: {v:05.2f}' for k, v in results.items()) log.info(f'Dev {results_str}') # Log to TensorBoard log.info('Visualizing in TensorBoard...') for k, v in results.items(): tbx.add_scalar(f'dev/{k}', v, step) util.visualize(tbx, pred_dict=pred_dict, eval_path=args.dev_eval_file, step=step, split='dev', num_visuals=args.num_visuals)
def main(args): # Set up logging and devices args.save_dir = util.get_save_dir(args.save_dir, args.name, training=True) log = util.get_logger(args.save_dir, args.name) tbx = SummaryWriter(args.save_dir) device, args.gpu_ids = util.get_available_devices() log.info('Args: {}'.format(dumps(vars(args), indent=4, sort_keys=True))) args.batch_size *= max(1, len(args.gpu_ids)) # Set random seed log.info('Using random seed {}...'.format(args.seed)) random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) torch.cuda.manual_seed_all(args.seed) # Get embeddings # Args: word_vectors: word vector tensor of dimension [vocab_size * wemb_dim] log.info('Loading embeddings...') word_vectors = util.torch_from_json(args.word_emb_file) char_vectors = util.torch_from_json(args.char_emb_file) # Get Model log.info('Building Model...') model = QANet(word_vectors, char_vectors, args.para_limit, args.ques_limit, args.f_model, num_head=args.num_head, train_cemb = (not args.pretrained_char)) model = nn.DataParallel(model, args.gpu_ids) if args.load_path: log.info('Loading checkpoint from {}...'.format(args.load_path)) model, step = util.load_model(model, args.load_path, args.gpu_ids) else: step = 0 model = model.to(device) model.train() ema = util.EMA(model, args.ema_decay) # Get saver saver = util.CheckpointSaver(args.save_dir, max_checkpoints=args.max_checkpoints, metric_name=args.metric_name, maximize_metric=args.maximize_metric, log=log) # Get optimizer and scheduler parameters = filter(lambda p: p.requires_grad, model.parameters()) optimizer = optim.Adam( params=parameters, lr=args.lr, betas=(args.beta1, args.beta2), eps=1e-8, weight_decay=3e-7) cr = 1.0 / math.log(args.lr_warm_up_num) scheduler = optim.lr_scheduler.LambdaLR( optimizer, lr_lambda=lambda ee: cr * math.log(ee + 1) if ee < args.lr_warm_up_num else 1) loss_f = torch.nn.CrossEntropyLoss() # Get data loader log.info('Building dataset...') train_dataset = SQuAD(args.train_record_file, args.use_squad_v2) train_loader = data.DataLoader(train_dataset, batch_size=args.batch_size, shuffle=True, num_workers=args.num_workers, collate_fn=collate_fn) dev_dataset = SQuAD(args.dev_record_file, args.use_squad_v2) dev_loader = data.DataLoader(dev_dataset, batch_size=args.batch_size, shuffle=False, num_workers=args.num_workers, collate_fn=collate_fn) # Train log.info('Training...') steps_till_eval = args.eval_steps epoch = step // len(train_dataset) while epoch != args.num_epochs: epoch += 1 log.info('Starting epoch {}...'.format(epoch)) with torch.enable_grad(), \ tqdm(total=len(train_loader.dataset)) as progress_bar: for cw_idxs, cc_idxs, qw_idxs, qc_idxs, y1, y2, ids in train_loader: # Setup for forward cw_idxs = cw_idxs.to(device) qw_idxs = qw_idxs.to(device) cc_idxs = cc_idxs.to(device) qc_idxs = qc_idxs.to(device) batch_size = cw_idxs.size(0) optimizer.zero_grad() # Forward log_p1, log_p2 = model(cw_idxs, cc_idxs, qw_idxs, qc_idxs) y1, y2 = y1.to(device), y2.to(device) loss = torch.mean(loss_f(log_p1, y1) + loss_f(log_p2, y2)) loss_val = loss.item() # Backward loss.backward() nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm) optimizer.step() scheduler.step(step // batch_size) ema(model, step // batch_size) # Log info step += batch_size progress_bar.update(batch_size) progress_bar.set_postfix(epoch=epoch, NLL=loss_val) tbx.add_scalar('train/NLL', loss_val, step) tbx.add_scalar('train/LR', optimizer.param_groups[0]['lr'], step) steps_till_eval -= batch_size if steps_till_eval <= 0: steps_till_eval = args.eval_steps # Evaluate and save checkpoint log.info('Evaluating at step {}...'.format(step)) ema.assign(model) results, pred_dict = evaluate(model, dev_loader, device, args.dev_eval_file, args.max_ans_len, args.use_squad_v2) saver.save(step, model, results[args.metric_name], device) ema.resume(model) # Log to console results_str = ', '.join('{}: {:05.2f}'.format(k, v) for k, v in results.items()) log.info('Dev {}'.format(results_str)) # Log to TensorBoard log.info('Visualizing in TensorBoard...') for k, v in results.items(): tbx.add_scalar('dev/{}'.format(k), v, step) util.visualize(tbx, pred_dict=pred_dict, eval_path=args.dev_eval_file, step=step, split='dev', num_visuals=args.num_visuals)
def main(args): # Set up logging and devices args.save_dir = util.get_save_dir(args.save_dir, args.name, training=True) log = util.get_logger(args.save_dir, args.name) tbx = SummaryWriter(args.save_dir) log.info(f'Args: {dumps(vars(args), indent=4, sort_keys=True)}') # set device if args.use_gpu and torch.cuda.is_available(): device = torch.device("cuda:{}".format(args.gpu_ids[0])) args.batch_size *= max(1, len(args.gpu_ids)) print(f"device is cuda: gpu_ids = {args.gpu_ids}") else: device = torch.device("cpu") print("device is cpu") # Set random seed log.info(f'Using random seed {args.seed}...') random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) torch.cuda.manual_seed_all(args.seed) # Get embeddings log.info('Loading embeddings...') word_vectors = util.torch_from_json(args.word_emb_file) char_vectors = util.torch_from_json(args.char_emb_file) # Get model log.info('Building model...') model = NAQANet( device, word_vectors, char_vectors, c_max_len=args.context_limit, q_max_len=args.question_limit, answering_abilities=['passage_span_extraction', 'counting'], max_count=args.max_count ) # doesn't large max_count lead to meaningless probability? if args.load_path: log.info(f'Loading checkpoint from {args.load_path}...') model, step = util.load_model(model, args.load_path, args.gpu_ids) else: step = 0 model = model.to(device) model.train() ema = util.EMA(model, args.decay) # Get saver saver = util.CheckpointSaver(args.save_dir, max_checkpoints=args.max_checkpoints, metric_name=args.metric_name, maximize_metric=args.maximize_metric, log=log) # Get optimizer and scheduler lr = args.lr base_lr = 1.0 warm_up = args.lr_warm_up_num params = filter(lambda param: param.requires_grad, model.parameters()) optimizer = torch.optim.Adam(lr=base_lr, betas=(args.beta1, args.beta2), eps=1e-7, weight_decay=3e-7, params=params) cr = lr / math.log(warm_up) scheduler = torch.optim.lr_scheduler.LambdaLR( optimizer, lr_lambda=lambda ee: cr * math.log(ee + 1) if ee < warm_up else lr) # Get data loader log.info('Building dataset...') train_dataset = DROP(args.train_record_file) train_loader = data.DataLoader(train_dataset, batch_size=args.batch_size, shuffle=True, num_workers=args.num_workers, collate_fn=collate_fn) dev_dataset = DROP(args.dev_record_file) dev_loader = data.DataLoader(dev_dataset, batch_size=args.batch_size, shuffle=False, num_workers=args.num_workers, collate_fn=collate_fn) # Train log.info('Training...') steps_till_eval = args.eval_steps epoch = step // len(train_dataset) while epoch != args.epochs: epoch += 1 log.info(f'Starting epoch {epoch}...') with torch.enable_grad(), \ tqdm(total=len(train_loader.dataset)) as progress_bar: for cw_idxs, cc_idxs, \ qw_idxs, qc_idxs, \ start_idxs, end_idxs, \ counts, ids in train_loader: # Setup for forward cw_idxs = cw_idxs.to(device) cc_idxs = cc_idxs.to(device) qw_idxs = qw_idxs.to(device) qc_idxs = qc_idxs.to(device) start_idxs = start_idxs.to(device) end_idxs = end_idxs.to(device) counts = counts.to(device) ids = ids.to(device) batch_size = cw_idxs.size(0) optimizer.zero_grad() # Forward output_dict = model(cw_idxs, cc_idxs, qw_idxs, qc_idxs, ids, start_idxs, end_idxs, counts) loss = output_dict["loss"] loss = torch.sum(loss, dim=0) / len(args.gpu_ids) loss_val = loss.item() # Backward loss.backward() nn.utils.clip_grad_norm_(model.parameters(), args.grad_clip) optimizer.step() scheduler.step() ema(model, step // batch_size) # Log info step += batch_size progress_bar.update(batch_size) progress_bar.set_postfix(epoch=epoch, NLL=loss_val) tbx.add_scalar('train/NLL', loss_val, step) tbx.add_scalar('train/LR', optimizer.param_groups[0]['lr'], step) steps_till_eval -= batch_size if steps_till_eval <= 0: steps_till_eval = args.eval_steps # Evaluate and save checkpoint log.info(f'Evaluating at step {step}...') ema.assign(model) results, pred_dict = evaluate(model, dev_loader, device, args.dev_eval_file) saver.save(step, model, results[args.metric_name], device) ema.resume(model) # Log to console results_str = ', '.join(f'{k}: {v:05.2f}' for k, v in results.items()) log.info(f'Dev {results_str}') # Save the model print("Saving the model ...") torch.save(model.state_dict(), args.model_dir) print("Done!")
def main(data, flags): # Set up logging and devices log_dir = data.logging_dir log = util.get_logger(log_dir, "toy") tbx = SummaryWriter(data.logging_dir) device, data.gpu_ids = util.get_available_devices() log.info('Config: {}'.format(dumps(vars(data), indent=4, sort_keys=True))) data.batch_size *= max(1, len(data.gpu_ids)) # Set random seed log.info('Using random seed {}...'.format(data.random_seed)) random.seed(data.random_seed) np.random.seed(data.random_seed) torch.manual_seed(data.random_seed) torch.cuda.manual_seed_all(data.random_seed) if flags[1] == "toy": word_emb_file = data.toy_word_emb_file training_data = data.toy_record_file_exp3 test_data = data.dev_record_file_exp3 eval_file = data.toy_eval_exp3 elif flags[1] == "train": word_emb_file = data.word_emb_file training_data = data.train_record_file_exp3 test_data = data.dev_record_file_exp3 eval_file = data.train_eval_exp3 elif flags[1] == "dev": word_emb_file = data.word_emb_file training_data = data.dev_record_file_exp3 test_data = data.toy_record_file_exp3 eval_file = data.dev_eval_exp3 # Get embeddings log.info('Loading embeddings...') word_vectors = util.torch_from_json(word_emb_file) # Get model log.info('Building model...') model = BiDAF(word_vectors=word_vectors, hidden_size=data.hidden_size, drop_prob=data.drop_prob) model = nn.DataParallel(model, data.gpu_ids) if data.load_path: log.info('Loading checkpoint from {}...'.format(data.load_path)) model, step = util.load_model(model, data.load_path, data.gpu_ids) else: step = 0 model = model.to(device) model.train() ema = util.EMA(model, data.ema_decay) # Get saver saver = util.CheckpointSaver(data.logging_dir, max_checkpoints=10, metric_name=data.metric_name, maximize_metric=data.maximize_metric, log=log) # Get optimizer and scheduler optimizer = optim.Adadelta(model.parameters(), data.learning_rate, weight_decay=data.learning_weight_decay) scheduler = sched.LambdaLR(optimizer, lambda s: 1.) # Constant LR # Get data loader log.info('Building dataset...') # np.load(data.toy_record_file_exp3) train_dataset = SQuAD3(training_data, use_v2=True) train_loader = torchdata.DataLoader(train_dataset, batch_size=data.batch_size, shuffle=True, num_workers=data.num_workers, collate_fn=collate_fn) test_dataset = SQuAD3(test_data, use_v2=True) test_loader = torchdata.DataLoader(test_dataset, batch_size=data.batch_size, shuffle=False, num_workers=data.num_workers, collate_fn=collate_fn) # Train log.info('Training...') steps_till_eval = data.eval_steps epoch = step // len(test_dataset) while epoch != data.num_epochs: epoch += 1 log.info('Starting epoch {}...'.format(epoch)) with torch.enable_grad(), \ tqdm(total=len(train_loader.dataset)) as progress_bar: for cw_idxs, cc_idxs, qw_idxs, qc_idxs, y1, y2, ids in train_loader: # Setup for forward cw_idxs = cw_idxs.to(device) qw_idxs = qw_idxs.to(device) batch_size = cw_idxs.size(0) optimizer.zero_grad() # Forward log.info("cw_idxs length: {}".format(str(len(cw_idxs)))) log.info("qw_idxs length: {}".format(str(len(qw_idxs)))) log.info("cw_idxs size: {}".format(str( sys.getsizeof(cw_idxs)))) log.info("qw_idxs size: {}".format(str( sys.getsizeof(qw_idxs)))) log.info("cw_idxs shape: {}".format(str(cw_idxs.shape))) log.info("qw_idxs shape: {}".format(str(qw_idxs.shape))) log_p1, log_p2 = model(cw_idxs, qw_idxs) y1, y2 = y1.to(device), y2.to(device) loss = F.nll_loss(log_p1, y1) + F.nll_loss(log_p2, y2) loss_val = loss.item() # Backward loss.backward() nn.utils.clip_grad_norm_(model.parameters(), data.max_grad_norm) optimizer.step() scheduler.step(step // batch_size) ema(model, step // batch_size) # Log info step += batch_size progress_bar.update(batch_size) progress_bar.set_postfix(epoch=epoch, NLL=loss_val) tbx.add_scalar('toy/NLL', loss_val, step) tbx.add_scalar('toy/LR', optimizer.param_groups[0]['lr'], step) steps_till_eval -= batch_size if steps_till_eval <= 0: steps_till_eval = data.eval_steps # Evaluate and save checkpoint log.info('Evaluating at step {}...'.format(step)) ema.assign(model) results, pred_dict = evaluate(model, test_loader, device, eval_path=eval_file, max_len=sys.maxsize, use_squad_v2=True) saver.save(step, model, results[data.metric_name], device) ema.resume(model) # Log to console results_str = ', '.join('{}: {:05.2f}'.format(k, v) for k, v in results.items()) log.info('Dev {}'.format(results_str)) # Log to TensorBoard log.info('Visualizing in TensorBoard...') for k, v in results.items(): tbx.add_scalar('dev/{}'.format(k), v, step) util.visualize(tbx, pred_dict=pred_dict, eval_path=eval_file, step=step, split='dev', num_visuals=data.num_visuals)
def main(args): # Set up faulthandler faulthandler.enable() # Set up logging and devices args.save_dir = util.get_save_dir(args.save_dir, args.name, training=True) log = util.get_logger(args.save_dir, args.name) tbx = SummaryWriter(args.save_dir) device, args.gpu_ids = util.get_available_devices() log.info(f'Args: {dumps(vars(args), indent=4, sort_keys=True)}') args.batch_size *= max(1, len(args.gpu_ids)) # Set random seed log.info(f'Using random seed {args.seed}...') random.seed(args.seed) torch.manual_seed(args.seed) torch.cuda.manual_seed_all(args.seed) # Get embeddings log.info('Loading embeddings...') word_vectors = util.torch_from_json(args.word_emb_file) char_vectors = util.torch_from_json(args.char_emb_file) # Get model log.info('Building model...') model_params = { 'word_vectors': word_vectors, 'char_vectors': char_vectors, 'args': args } model = get_model(args.model, model_params) print('Model size: {:f} MB'.format( sum(p.nelement() * p.element_size() for p in model.parameters()) / (1024 * 1024))) # model = nn.DataParallel(model, args.gpu_ids) if args.load_path: log.info(f'Loading checkpoint from {args.load_path}...') model, step = util.load_model(model, args.load_path, args.gpu_ids) else: step = 0 model = model.to(device) model.train() ema = util.EMA(model, args.ema_decay) # Get saver saver = util.CheckpointSaver(args.save_dir, max_checkpoints=args.max_checkpoints, metric_name=args.metric_name, maximize_metric=args.maximize_metric, log=log) # Get optimizer and scheduler optimizer = optim.Adadelta(model.parameters(), args.lr, weight_decay=args.l2_wd) scheduler = sched.LambdaLR(optimizer, lambda s: 1.) # Constant LR # Get data loader log.info('Building dataset...') train_dataset = SQuAD(args.train_record_file, args.use_squad_v2) train_loader = data.DataLoader(train_dataset, batch_size=args.batch_size, shuffle=True, num_workers=args.num_workers, collate_fn=collate_fn) dev_dataset = SQuAD(args.dev_record_file, args.use_squad_v2) dev_loader = data.DataLoader(dev_dataset, batch_size=args.batch_size, shuffle=False, num_workers=args.num_workers, collate_fn=collate_fn) # Train log.info('Training...') steps_till_eval = args.eval_steps epoch = step // len(train_dataset) while epoch != args.num_epochs: epoch += 1 log.info(f'Starting epoch {epoch}...') with torch.enable_grad(), \ tqdm(total=len(train_loader.dataset)) as progress_bar: for cw_idxs, cc_idxs, qw_idxs, qc_idxs, y1, y2, ids in train_loader: progress_bar.set_description( 'Batch data_loading finished'.ljust(30)) progress_bar.refresh() # Setup for forward cw_idxs = cw_idxs.to(device) qw_idxs = qw_idxs.to(device) cc_idxs = cc_idxs.to(device) qc_idxs = qc_idxs.to(device) batch_size = cw_idxs.size(0) optimizer.zero_grad() progress_bar.set_description( 'Batch initialization finished'.ljust(30)) progress_bar.refresh() # Forward faulthandler.dump_traceback_later(timeout=3) log_p1, log_p2 = model(cw_idxs, cc_idxs, qw_idxs, qc_idxs) faulthandler.cancel_dump_traceback_later() progress_bar.set_description( 'Batch forward finished'.ljust(30)) progress_bar.refresh() y1, y2 = y1.to(device), y2.to(device) loss = F.nll_loss(log_p1, y1) + F.nll_loss(log_p2, y2) loss_val = loss.item() # Backward faulthandler.dump_traceback_later(timeout=3) loss.backward() faulthandler.cancel_dump_traceback_later() progress_bar.set_description( 'Batch backward finished'.ljust(30)) progress_bar.refresh() nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm) optimizer.step() progress_bar.set_description('Optimization finished'.ljust(30)) progress_bar.refresh() scheduler.step() ema(model, step // batch_size) # Log info step += batch_size progress_bar.update(batch_size) progress_bar.set_postfix(epoch=epoch, NLL=loss_val) tbx.add_scalar('train/NLL', loss_val, step) tbx.add_scalar('train/LR', optimizer.param_groups[0]['lr'], step) steps_till_eval -= batch_size if steps_till_eval <= 0: steps_till_eval = args.eval_steps # Evaluate and save checkpoint log.info(f'Evaluating at step {step}...') ema.assign(model) results, pred_dict = evaluate(model, dev_loader, device, args.dev_eval_file, args.max_ans_len, args.use_squad_v2) progress_bar.set_description( 'Evaluation finished'.ljust(30)) progress_bar.refresh() saver.save(step, model, results[args.metric_name], device) ema.resume(model) # Log to console results_str = ', '.join(f'{k}: {v:05.2f}' for k, v in results.items()) log.info(f'Dev {results_str}') # Log to TensorBoard log.info('Visualizing in TensorBoard...') for k, v in results.items(): tbx.add_scalar(f'dev/{k}', v, step) util.visualize(tbx, pred_dict=pred_dict, eval_path=args.dev_eval_file, step=step, split='dev', num_visuals=args.num_visuals)
def main(args): if args.large: args.train_record_file += '_large' args.dev_eval_file += '_large' model_name = "albert-xlarge-v2" else: model_name = "albert-base-v2" if args.xxlarge: args.train_record_file += '_xxlarge' args.dev_eval_file += '_xxlarge' model_name = "albert-xxlarge-v2" # Set up logging and devices args.save_dir = util.get_save_dir(args.save_dir, args.name, training=True) log = util.get_logger(args.save_dir, args.name) tbx = SummaryWriter(args.save_dir) device, args.gpu_ids = util.get_available_devices() log.info(f'Args: {dumps(vars(args), indent=4, sort_keys=True)}') args.batch_size *= max(1, len(args.gpu_ids)) # Set random seed log.info(f'Using random seed {args.seed}...') random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) torch.cuda.manual_seed_all(args.seed) # Get model log.info('Building model...') if args.bidaf: char_vectors = util.torch_from_json(args.char_emb_file) if args.model_name == 'albert_highway': model = models.albert_highway(model_name) elif args.model_name == 'albert_lstm_highway': model = models.LSTM_highway(model_name, hidden_size=args.hidden_size) elif args.model_name == 'albert_bidaf': model = models.BiDAF(char_vectors=char_vectors, hidden_size=args.hidden_size, drop_prob=args.drop_prob) elif args.model_name == 'albert_bidaf2': model = models.BiDAF2(model_name=model_name, char_vectors=char_vectors, hidden_size=args.hidden_size, drop_prob=args.drop_prob) else: model = AlbertForQuestionAnswering.from_pretrained(args.model_name) model = nn.DataParallel(model, args.gpu_ids) if args.load_path: log.info(f'Loading checkpoint from {args.load_path}...') model, step = util.load_model(model, args.load_path, args.gpu_ids) else: step = 0 model = model.to(device) model.train() ema = util.EMA(model, args.ema_decay) # Get saver saver = util.CheckpointSaver(args.save_dir, max_checkpoints=args.max_checkpoints, metric_name=args.metric_name, maximize_metric=args.maximize_metric, log=log) # Get optimizer and scheduler optimizer = AdamW(model.parameters(), lr=args.lr, weight_decay=args.l2_wd) scheduler = sched.LambdaLR(optimizer, lambda s: 1.) # Constant LR # Get data loader log.info('Building dataset...') train_dataset = SQuAD(args.train_record_file, args.use_squad_v2, args.bidaf) train_loader = data.DataLoader(train_dataset, batch_size=args.batch_size, shuffle=True, num_workers=args.num_workers) dev_dataset = SQuAD(args.dev_eval_file, args.use_squad_v2, args.bidaf) dev_loader = data.DataLoader(dev_dataset, batch_size=args.batch_size, shuffle=False, num_workers=args.num_workers) with open(args.dev_gold_file) as f: gold_dict = json.load(f) tokenizer = AlbertTokenizer.from_pretrained(model_name) # Train log.info('Training...') steps_till_eval = args.eval_steps epoch = step // len(train_dataset) while epoch != args.num_epochs: epoch += 1 log.info(f'Starting epoch {epoch}...') with torch.enable_grad(), \ tqdm(total=len(train_loader.dataset)) as progress_bar: for batch in train_loader: batch = tuple(t.to(device) for t in batch) inputs = { "input_ids": batch[0], "attention_mask": batch[1], "token_type_ids": batch[2], 'start_positions': batch[3], 'end_positions': batch[4], } if args.bidaf: inputs['char_ids'] = batch[6] y1 = batch[3] y2 = batch[4] # Setup for forward batch_size = inputs["input_ids"].size(0) optimizer.zero_grad() # Forward # log_p1, log_p2 = model(**inputs) y1, y2 = y1.to(device), y2.to(device) outputs = model(**inputs) loss = outputs[0] loss = loss.mean() # loss_fct = nn.CrossEntropyLoss() # loss = loss_fct(log_p1, y1) + loss_fct(log_p2, y2) # loss = F.nll_loss(log_p1, y1) + F.nll_loss(log_p2, y2) loss_val = loss.item() # Backward loss.backward() nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm) optimizer.step() scheduler.step(step // batch_size) ema(model, step // batch_size) # Log info step += batch_size progress_bar.update(batch_size) progress_bar.set_postfix(epoch=epoch, NLL=loss_val) tbx.add_scalar('train/NLL', loss_val, step) tbx.add_scalar('train/LR', optimizer.param_groups[0]['lr'], step) steps_till_eval -= batch_size if steps_till_eval <= 0: steps_till_eval = args.eval_steps # Evaluate and save checkpoint log.info(f'Evaluating at step {step}...') ema.assign(model) results, pred_dict = evaluate(args, model, dev_dataset, dev_loader, gold_dict, tokenizer, device, args.max_ans_len, args.use_squad_v2) saver.save(step, model, results[args.metric_name], device) ema.resume(model) # Log to console results_str = ', '.join(f'{k}: {v:05.2f}' for k, v in results.items()) log.info(f'Dev {results_str}') # Log to TensorBoard log.info('Visualizing in TensorBoard...') for k, v in results.items(): tbx.add_scalar(f'dev/{k}', v, step)
def main(args): # Set up logging and devices args.save_dir = util.get_save_dir(args.save_dir, args.name, training=True) log = util.get_logger(args.save_dir, args.name) tbx = SummaryWriter(args.save_dir) device, args.gpu_ids = util.get_available_devices() log.info(f'Args: {dumps(vars(args), indent=4, sort_keys=True)}') args.batch_size *= max(1, len(args.gpu_ids)) # Set random seed # To make the data generation of every experiment same log.info(f'Using random seed {args.seed}...') random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) torch.cuda.manual_seed_all(args.seed) # Get embeddings log.info('Loading embeddings...') word_vectors = util.torch_from_json(args.word_emb_file) char_vectors = util.torch_from_json(args.char_emb_file) print(word_vectors.size()) print(char_vectors.size()) # Get model log.info('Building model...') model = BiDAF(word_vectors=word_vectors, char_vectors=char_vectors, hidden_size=args.hidden_size, drop_prob=args.drop_prob) model = nn.DataParallel(model, args.gpu_ids) if args.load_path: log.info(f'Loading checkpoint from {args.load_path}...') model, step = util.load_model(model, args.load_path, args.gpu_ids) else: step = 0 model = model.to(device) model.train() ema = util.EMA(model, args.ema_decay) # Get saver saver = util.CheckpointSaver(args.save_dir, max_checkpoints=args.max_checkpoints, metric_name=args.metric_name, maximize_metric=args.maximize_metric, log=log) # Get optimizer and scheduler optimizer = optim.Adadelta(model.parameters(), args.lr, weight_decay=args.l2_wd) scheduler = sched.LambdaLR(optimizer, lambda s: 1.) # Constant LR # Get data loader # (context_idxs(context_len,): Indices of the words in the context., # context_char_idx(context_len, max_word_len): Indices of the characters in the context, # question_idxs(question_len,): Indices of the words in the question, # question_char_idx(question_len, max_word_len): Indices of the characters in the question, # y1:start, -1 if no answer: answer start index, # y2:start, -1 if no answer: answer end index, # id ID of the example) log.info('Building dataset...') train_dataset = SQuAD(args.train_record_file, args.use_squad_v2) train_loader = data.DataLoader(train_dataset, batch_size=args.batch_size, shuffle=True, num_workers=args.num_workers, collate_fn=collate_fn) dev_dataset = SQuAD(args.dev_record_file, args.use_squad_v2) dev_loader = data.DataLoader(dev_dataset, batch_size=args.batch_size, shuffle=False, num_workers=args.num_workers, collate_fn=collate_fn) # Train log.info('Training...') steps_till_eval = args.eval_steps epoch = step // len(train_dataset) while epoch != args.num_epochs: epoch += 1 log.info(f'Starting epoch {epoch}...') with torch.enable_grad(), \ tqdm(total=len(train_loader.dataset)) as progress_bar: for cw_idxs, cc_idxs, qw_idxs, qc_idxs, y1, y2, ids in train_loader: # Setup for forward # cw_idxs: Indices of the words in the context # cc_idxs: Indices of the characters in the context # qw_idxs: Indices of the words in the query # qc_idxs: Indices of the characters in teh query cw_idxs = cw_idxs.to(device) qw_idxs = qw_idxs.to(device) # cw_idx with shape(context_len, ) batch_size = cw_idxs.size(0) optimizer.zero_grad() # Forward log_p1, log_p2 = model(cw_idxs, cc_idxs, qw_idxs, qc_idxs) y1, y2 = y1.to(device), y2.to(device) # L(theta) = - 1/N * sum(log(P1_yi_1) + log(P2_yi_2)) loss = F.nll_loss(log_p1, y1) + F.nll_loss(log_p2, y2) loss_val = loss.item() # Backward loss.backward() nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm) optimizer.step() scheduler.step(step // batch_size) ema(model, step // batch_size) # Log info step += batch_size progress_bar.update(batch_size) progress_bar.set_postfix(epoch=epoch, NLL=loss_val) tbx.add_scalar('train/NLL', loss_val, step) tbx.add_scalar('train/LR', optimizer.param_groups[0]['lr'], step) steps_till_eval -= batch_size if steps_till_eval <= 0: steps_till_eval = args.eval_steps # Evaluate and save checkpoint log.info(f'Evaluating at step {step}...') ema.assign(model) results, pred_dict = evaluate(model, dev_loader, device, args.dev_eval_file, args.max_ans_len, args.use_squad_v2) saver.save(step, model, results[args.metric_name], device) ema.resume(model) # Log to console results_str = ', '.join(f'{k}: {v:05.2f}' for k, v in results.items()) log.info(f'Dev {results_str}') # Log to TensorBoard log.info('Visualizing in TensorBoard...') for k, v in results.items(): tbx.add_scalar(f'dev/{k}', v, step) util.visualize(tbx, pred_dict=pred_dict, eval_path=args.dev_eval_file, step=step, split='dev', num_visuals=args.num_visuals)
def main(args): # Set up logging and devices name = "train_exp2" args.save_dir = util.get_save_dir(args.logging_dir, name, training=True) log = get_logger(args.save_dir, name) tbx = SummaryWriter(args.save_dir) device, gpu_ids = util.get_available_devices() log.info(f"Args: {dumps(vars(args), indent=4, sort_keys=True)}") args.batch_size *= max(1, len(gpu_ids)) # Set random seed log.info(f"Using random seed {args.random_seed}...") random.seed(args.random_seed) np.random.seed(args.random_seed) torch.manual_seed(args.random_seed) torch.cuda.manual_seed_all(args.random_seed) # Get embeddings log.info(f"Loading embeddings from {args.word_emb_file}...") word_vectors = util.torch_from_json(args.word_emb_file) # Get model log.info("Building model...") model = BiDAF(word_vectors=word_vectors, hidden_size=args.hidden_size, drop_prob=args.drop_prob) model = nn.DataParallel(model, gpu_ids) if args.load_path: log.info(f"Loading checkpoint from {args.load_path}...") model, step = util.load_model(model, args.load_path, gpu_ids) else: step = 0 model = model.to(device) model.train() ema = util.EMA(model, args.ema_decay) # Get saver saver = util.CheckpointSaver(args.save_dir, max_checkpoints=args.max_checkpoints, metric_name=args.metric_name, maximize_metric=args.maximize_metric, log=log) # Get optimizer and scheduler optimizer = optim.Adadelta(model.parameters(), args.learning_rate, weight_decay=args.learning_rate_decay) # scheduler = sched.LambdaLR(optimizer, lambda s: 1.) # Constant LR scheduler = sched.ReduceLROnPlateau(optimizer=optimizer, mode="min", factor=0.1, patience=2, verbose=True, cooldown=0 min_lr=0.0005) for epoch in range(args.num_epochs): log.info(f"Starting epoch {epoch}...") for i in range(args.num_train_chunks): # Get data loader train_rec_file = f"{args.train_record_file_exp2}_{i}.npz" log.info(f'Building dataset from {train_rec_file} ...') train_dataset = SQuAD(train_rec_file, args.exp2_train_topic_contexts, use_v2=True) train_loader = data.DataLoader(train_dataset, batch_size=args.batch_size, shuffle=True, num_workers=args.num_workers, collate_fn=collate_fn) # Train log.info('Training...') steps_till_eval = args.eval_steps epoch = 0 # torch.set_num_threads(7) with torch.enable_grad(), tqdm(total=len(train_loader.dataset)) as progress_bar: for cw_idxs, cc_idxs, qw_idxs, qc_idxs, y1, y2, ids in train_loader: # Setup for forward cw_idxs = cw_idxs.to(device) qw_idxs = qw_idxs.to(device) batch_size = qw_idxs.size(0) optimizer.zero_grad() # Forward log_p1, log_p2 = model(cw_idxs, qw_idxs) y1, y2 = y1.to(device), y2.to(device) loss = F.nll_loss(log_p1, y1) + F.nll_loss(log_p2, y2) loss_val = loss.item() # Backward loss.backward() nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm) optimizer.step() scheduler.step(step // batch_size) ema(model, step // batch_size) # Log info step += batch_size progress_bar.update(batch_size) progress_bar.set_postfix(epoch=epoch, NLL=loss_val) tbx.add_scalar('train/NLL', loss_val, step) tbx.add_scalar('train/LR', optimizer.param_groups[0]['lr'], step) steps_till_eval -= batch_size if steps_till_eval <= 0: steps_till_eval = args.eval_steps # Evaluate and save checkpoint log.info(f"Evaluating at step {step}...") ema.assign(model) for i in range(args.num_dev_chunks): # Get data loader all_pred_dicts = {} all_results = OrderedDict() dev_rec_file = f"{args.dev_record_file_exp2}_{i}.npz" log.info(f'Building evaluating dataset from {dev_rec_file} ...') dev_dataset = SQuAD(dev_rec_file, args.exp2_dev_topic_contexts, use_v2=True) dev_loader = data.DataLoader(dev_dataset, batch_size=args.batch_size, shuffle=True, num_workers=args.num_workers, collate_fn=collate_fn) results, pred_dict = evaluate(model, dev_loader, device, args.dev_eval_file, args.max_ans_len, use_squad_v2=True) all_results.update(results) all_pred_dicts.update(pred_dict) del dev_dataset del dev_loader del results del pred_dict torch.cuda.empty_cache() saver.save(step, model, all_results[args.metric_name], device) ema.resume(model) # Log to console results_str = ', '.join(f'{k}: {v:05.2f}' for k, v in all_results.items()) log.info(f"Dev {results_str}") # Log to TensorBoard log.info('Visualizing in TensorBoard...') for k, v in all_results.items(): tbx.add_scalar(f"dev/{k}", v, step) util.visualize(tbx, pred_dict=all_pred_dicts, eval_path=args.dev_eval_file, step=step, split='dev', num_visuals=args.num_visuals) torch.cuda.empty_cache() del train_dataset del train_loader torch.cuda.empty_cache()
def main(args): # Set up logging and devices args.save_dir = util.get_save_dir(args.save_dir, args.name, training=True) log = util.get_logger(args.save_dir, args.name) tbx = SummaryWriter(args.save_dir) device, args.gpu_ids = util.get_available_devices() log.info(f'Args: {dumps(vars(args), indent=4, sort_keys=True)}') args.batch_size *= max(1, len(args.gpu_ids)) max_len = 10 # Set random seed log.info(f'Using random seed {args.seed}...') random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) torch.cuda.manual_seed_all(args.seed) # Get embeddings log.info('Loading embeddings...') word_vectors = util.torch_from_json(args.word_emb_file) ch_vectors = util.torch_from_json(args.char_emb_file) # Get model log.info('Building model...') model = BiDAF(word_vectors=word_vectors, ch_vectors=ch_vectors, hidden_size=args.hidden_size, drop_prob=args.drop_prob) model = nn.DataParallel(model, args.gpu_ids) if args.load_path: log.info(f'Loading checkpoint from {args.load_path}...') model, step = util.load_model(model, args.load_path, args.gpu_ids) else: step = 0 model = model.to(device) model.train() ema = util.EMA(model, args.ema_decay) # Get saver saver = util.CheckpointSaver(args.save_dir, max_checkpoints=args.max_checkpoints, metric_name=args.metric_name, maximize_metric=args.maximize_metric, log=log) # Get optimizer and scheduler optimizer = optim.Adadelta(model.parameters(), args.lr, weight_decay=args.l2_wd) scheduler = sched.LambdaLR(optimizer, lambda s: 1.) # Constant LR # Get data loader log.info('Building dataset...') train_dataset = SQuAD(args.train_record_file, args.use_squad_v2) train_loader = data.DataLoader(train_dataset, batch_size=args.batch_size, shuffle=True, num_workers=args.num_workers, collate_fn=collate_fn) dev_dataset = SQuAD(args.dev_record_file, args.use_squad_v2) dev_loader = data.DataLoader(dev_dataset, batch_size=args.batch_size, shuffle=False, num_workers=args.num_workers, collate_fn=collate_fn) # Train log.info('Training...') steps_till_eval = args.eval_steps epoch = step // len(train_dataset) while epoch != args.num_epochs: epoch += 1 log.info(f'Starting epoch {epoch}...') with torch.enable_grad(), \ tqdm(total=len(train_loader.dataset)) as progress_bar: for cw_idxs, cc_idxs, qw_idxs, qc_idxs, y1, y2, ids in train_loader: # Setup for forward cw_idxs = cw_idxs.to(device) qw_idxs = qw_idxs.to(device) cc_idxs = cc_idxs.to(device) qc_idxs = qc_idxs.to(device) batch_size = cw_idxs.size(0) optimizer.zero_grad() # Forward log_p = model(cw_idxs, qw_idxs, cc_idxs, qc_idxs) y1, y2 = y1.to(device), y2.to(device) #print("ckpt 1") ans_lens = y2 - y1 loss = 0 for i in range(max_len): mask = ((torch.ones_like(y1) * i) == ans_lens).type( torch.cuda.LongTensor) y = y1 * mask loss += F.nll_loss(log_p[:, :, i], y) #print("ckpt 2") loss_val = loss.item() #print("ckpt 3") # Backward loss.backward() #print("ckpt 4") nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm) #print("ckpt 5") optimizer.step() #print("ckpt 6") scheduler.step(step // batch_size) ema(model, step // batch_size) #print("ckpt 7") # Log info step += batch_size progress_bar.update(batch_size) progress_bar.set_postfix(epoch=epoch, NLL=loss_val) if step % (50 * batch_size) == 0: print(loss_val) tbx.add_scalar('train/NLL', loss_val, step) tbx.add_scalar('train/LR', optimizer.param_groups[0]['lr'], step) steps_till_eval -= batch_size #print("ckpt 8") if steps_till_eval <= 0: steps_till_eval = args.eval_steps # Evaluate and save checkpoint log.info(f'Evaluating at step {step}...') ema.assign(model) results, pred_dict = evaluate(model, dev_loader, device, args.dev_eval_file, args.max_ans_len, args.use_squad_v2) saver.save(step, model, results[args.metric_name], device) ema.resume(model) # Log to console results_str = ', '.join(f'{k}: {v:05.2f}' for k, v in results.items()) log.info(f'Dev {results_str}') # Log to TensorBoard log.info('Visualizing in TensorBoard...') for k, v in results.items(): tbx.add_scalar(f'dev/{k}', v, step) util.visualize(tbx, pred_dict=pred_dict, eval_path=args.dev_eval_file, step=step, split='dev', num_visuals=args.num_visuals)
def main(args): # Set up logging and devices args.save_dir = util.get_save_dir(args.save_dir, args.name, training=True) log = util.get_logger(args.save_dir, args.name) tbx = SummaryWriter(args.save_dir) # Writes entries directly to event files in the logdir to be consumed by TensorBoard. device, args.gpu_ids = util.get_available_devices() log.info(f'Args: {dumps(vars(args), indent=4, sort_keys=True)}') args.batch_size *= max(1, len(args.gpu_ids)) # 一个gpu: batch_size=64 看实际情况 # Set random seed log.info(f'Using random seed {args.seed}...') random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) torch.cuda.manual_seed_all(args.seed) # Get embeddings log.info('Loading embeddings...') word_vectors = util.torch_from_json(args.word_emb_file) # Get model log.info('Building model...') model = BiDAF(word_vectors=word_vectors, hidden_size=args.hidden_size, drop_prob=args.drop_prob) model = nn.DataParallel(model, args.gpu_ids) if args.load_path: # default=None log.info(f'Loading checkpoint from {args.load_path}...') model, step = util.load_model(model, args.load_path, args.gpu_ids) else: step = 0 model = model.to(device) model.train() ema = util.EMA(model, args.ema_decay) # ema_decay = 0.999 # ema core => new_average = (1.0 - decay) * param.data + decay * self.shadow[name] # Get saver # metric_name: NLL or EM F1 saver = util.CheckpointSaver(args.save_dir, max_checkpoints=args.max_checkpoints, # max_checkpoints = 5 metric_name=args.metric_name, maximize_metric=args.maximize_metric, log=log) # Get optimizer and scheduler optimizer = optim.Adadelta(model.parameters(), args.lr, weight_decay=args.l2_wd) # lr : default=0.5 l2_wd : default=0 scheduler = sched.LambdaLR(optimizer, lambda s: 1.) # Constant LR # Get data loader log.info('Building dataset...') train_dataset = SQuAD(args.train_record_file, args.use_squad_v2) # train_record_file = './data/train.npz' train_loader = data.DataLoader(train_dataset, batch_size=args.batch_size, # 64 shuffle=True, # sampler = RandomSampler(dataset) batch_sampler = BatchSampler(sampler, batch_size, drop_last) num_workers=args.num_workers, # 4 collate_fn=collate_fn) # merges a list of samples to form a mini-batch. dev_dataset = SQuAD(args.dev_record_file, args.use_squad_v2) # dev_record_file = './data/dev.npz' dev_loader = data.DataLoader(dev_dataset, batch_size=args.batch_size, shuffle=False, num_workers=args.num_workers, collate_fn=collate_fn)# Merge examples of different length by padding all examples to the maximum length in the batch. # Train log.info('Training...') steps_till_eval = args.eval_steps # 50000 epoch = step // len(train_dataset) # len(train_dataset)= 7 epoch=0 while epoch != args.num_epochs: # 30 epoch += 1 log.info(f'Starting epoch {epoch}...') with torch.enable_grad(), \ tqdm(total=len(train_loader.dataset)) as progress_bar: for cw_idxs, cc_idxs, qw_idxs, qc_idxs, y1, y2, ids in train_loader: # Setup for forward cw_idxs = cw_idxs.to(device) qw_idxs = qw_idxs.to(device) batch_size = cw_idxs.size(0) # 64 optimizer.zero_grad() # Forward log_p1, log_p2 = model(cw_idxs, qw_idxs) y1, y2 = y1.to(device), y2.to(device) loss = F.nll_loss(log_p1, y1) + F.nll_loss(log_p2, y2) loss_val = loss.item() # Backward loss.backward() nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm) # max_grad_norm : default=5.0 optimizer.step() # 进行1次optimize scheduler.step(step // batch_size)# train : step=0 ema(model, step // batch_size) # def __call__(self, model, num_updates): # Log info step += batch_size #step: 0 batch_size=64 progress_bar.update(batch_size) progress_bar.set_postfix(epoch=epoch, NLL=loss_val) tbx.add_scalar('train/NLL', loss_val, step) # Add scalar data to summary. tbx.add_scalar('train/LR', optimizer.param_groups[0]['lr'], step) steps_till_eval -= batch_size if steps_till_eval <= 0: steps_till_eval = args.eval_steps # 50000 # Evaluate and save checkpoint log.info(f'Evaluating at step {step}...') ema.assign(model) ## results, pred_dict = evaluate(model, dev_loader, device, args.dev_eval_file, # './data/dev_eval.json' args.max_ans_len, # 15 args.use_squad_v2) saver.save(step, model, results[args.metric_name], device) ema.resume(model) # Log to console results_str = ', '.join(f'{k}: {v:05.2f}' for k, v in results.items()) log.info(f'Dev {results_str}') # Log to TensorBoard log.info('Visualizing in TensorBoard...') for k, v in results.items(): tbx.add_scalar(f'dev/{k}', v, step) util.visualize(tbx, pred_dict=pred_dict, eval_path=args.dev_eval_file, step=step, split='dev', num_visuals=args.num_visuals)
def main(tryc): try: os.mkdir("./output/" + sys.argv[2]) except: print("mkdir error") try: os.mkdir("./output/" + sys.argv[2] + "/" + str(tryc)) except: print("mkdir error") operations = [] for i in range(npos): layer = L.Convolution2D(1, ncand, ksize=7, stride=2) #,nobias=True) layer.to_gpu() layer.disable_update() #layer.W.data += 0.9 operations.append(layer) second = [] for i in range(npos): layer1 = L.Convolution2D(1, 1, ksize=4, stride=1, nobias=True) #layer0 = L.BatchNormalization(1) #layer0.to_gpu() #layer0.disable_update() #layer1 = L.Linear(16,1) layer1.to_gpu() layer1.disable_update() second.append([layer1]) a = chainer.Parameter( xp.random.rand(npos * ncand).reshape((npos, ncand)).astype(xp.float32)) teacher = F.softmax(a / ZERO, axis=1) f = open("./output/" + sys.argv[2] + "/" + str(tryc) + "/teacher", "wb") pickle.dump(teacher, f) pickle.dump(operations, f) pickle.dump(second, f) f.close() for q in range(len(modes)): mode = modes[q] mode2 = modes2[q] start = None losses = [] timer = [] orders = [] if BINOMIAL: options = {'zero_pos': None, 'prior': 'unimodal'} else: options = {'zero_pos': None, 'prior': 'uniform'} param = util.Param(mode=mode, shape=(npos, ncand), max_iter=niter, options=options, mode2=mode2) ############## """ # set prior if mode == "BI": ps = [] for k in range(ncand): n = ncand-1 s = float(F.sigmoid(param.param[0]).data) p = (math.factorial(n)/(math.factorial(k)*math.factorial(n-k))) p *= s**k p *= (1-s) ** (n-k) ps.append(p) else: param.param.data = set_binomial_prior(param.param.data) ps = F.softmax(param.param, axis=1).data[0] ps = cuda.to_cpu(ps).tolist() #ps.extend([0,0,0,0,0,0,0,0,0,0]) print(len(ps)) plt.bar(np.arange(ncand), ps, alpha=0.5) plt.show() exit() """ baseline = util.EMA(coef=0.05) #acct = EMA(coef=0.2) #acct = MA(steps=10) for i in range(niter): if i % 10 == 0: #print(param.param) if start is None: timer.append(0.) else: timer.append(time.time() - start) x = batch_sample(100) t = forward(x, teacher, operations, second, mode) a = param.deter() #order = xp.argmax(a.data) #print(order) #orders.append(order) y = forward(x, a, operations, second, mode) #loss = float(F.mean_squared_error(y,t).data) * scale loss = float(toy_lossfunc(y, t).data) losses.append(loss) start = time.time() print(tryc, mode, i, losses[-1], timer[-1]) #print(param.param) lrcoef = (math.cos(i * math.pi / niter) + 1.0) / 2.0 param.optimizer.alpha = 0.001 # * lrcoef param.optimizer.beta1 = 0.9 x = batch_sample(batch_size) t = forward(x, teacher, operations, second, mode) a = param.draw() y = forward(x, a, operations, second, mode) #loss = F.mean_squared_error(y,t) * scale loss = toy_lossfunc(y, t) loss.backward(retain_grad=True) baseline(float(loss.data)) #hist.append(baseline.get()) if mode == "R": param.update([float(loss.data) - baseline.get()]) elif mode == "PA": param.update([float(loss.data)]) else: param.update([float(loss.data) - baseline.get()]) f = open( "./output/" + sys.argv[2] + "/" + str(tryc) + '/' + mode + "_" + mode2, 'wb') pickle.dump(losses, f) pickle.dump(timer, f) pickle.dump(orders, f) pickle.dump(param, f) f.close()
def main(args): # Set up logging and devices args.save_dir = util.get_save_dir(args.save_dir, args.name, training=True) log = util.get_logger(args.save_dir, args.name) tbx = SummaryWriter(args.save_dir) device, args.gpu_ids = util.get_available_devices() log.info(f'Args: {dumps(vars(args), indent=4, sort_keys=True)}') args.batch_size *= max(1, len(args.gpu_ids)) # Set random seed log.info(f'Using random seed {args.seed}...') random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) torch.cuda.manual_seed_all(args.seed) # Get embeddings log.info('Loading embeddings...') word_vectors = util.torch_from_json(args.word_emb_file) # Get model log.info('Building model...') model = BiDAF(word_vectors=word_vectors, hidden_size=args.hidden_size, drop_prob=args.drop_prob) model = nn.DataParallel(model, args.gpu_ids) if args.load_path: log.info(f'Loading checkpoint from {args.load_path}...') model, step = util.load_model(model, args.load_path, args.gpu_ids) else: step = 0 model = model.to(device) model.train() ema = util.EMA(model, args.ema_decay) # Get saver saver = util.CheckpointSaver(args.save_dir, max_checkpoints=args.max_checkpoints, metric_name=args.metric_name, maximize_metric=args.maximize_metric, log=log) # Get optimizer and scheduler optimizer = optim.Adadelta(model.parameters(), args.lr, weight_decay=args.l2_wd) scheduler = sched.LambdaLR(optimizer, lambda s: 1.) # Constant LR # Get data loader log.info('Building dataset...') train_dataset = SQuAD(args.train_record_file, args.use_squad_v2) train_loader = data.DataLoader(train_dataset, batch_size=args.batch_size, shuffle=True, num_workers=args.num_workers, collate_fn=collate_fn) dev_dataset = SQuAD(args.dev_record_file, args.use_squad_v2) dev_loader = data.DataLoader(dev_dataset, batch_size=args.batch_size, shuffle=False, num_workers=args.num_workers, collate_fn=collate_fn) # Train log.info('Training...') steps_till_eval = args.eval_steps epoch = step // len(train_dataset) while epoch != args.num_epochs: epoch += 1 log.info(f'Starting epoch {epoch}...') for cw_idxs, cc_idxs, qw_idxs, qc_idxs, y1, y2, ids in train_loader: # Setup for forward cw_idxs = cw_idxs.to(device) qw_idxs = qw_idxs.to(device) batch_size = cw_idxs.size(0) optimizer.zero_grad() # Forward log_p1, log_p2 = model(cw_idxs, qw_idxs) y1, y2 = y1.to(device), y2.to(device) loss = F.nll_loss(log_p1, y1) + F.nll_loss(log_p2, y2) loss_val = loss.item() # Backward loss.backward() nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm) optimizer.step() scheduler.step(step // batch_size) ema(model, step // batch_size) # Log info step += batch_size if step % 1000 == 0 and step > 0: log.info(f'Step {step}: training loss {loss_val}...') steps_till_eval -= batch_size if steps_till_eval <= 0: steps_till_eval = args.eval_steps # Evaluate and save checkpoint log.info(f'Evaluating at step {step}...') ema.assign(model) results, pred_dict = evaluate(model, dev_loader, device, args.dev_eval_file, args.max_ans_len, args.use_squad_v2) saver.save(step, model, results[args.metric_name], device) ema.resume(model) # Log to console results_str = ', '.join(f'{k}: {v:05.2f}' for k, v in results.items()) log.info(f'Dev {results_str}') # Log to TensorBoard log.info('Visualizing in TensorBoard...') for k, v in results.items(): tbx.add_scalar(f'dev/{k}', v, step) util.visualize(tbx, pred_dict=pred_dict, eval_path=args.dev_eval_file, step=step, split='dev', num_visuals=args.num_visuals)
def main(args): # Set up logging and devices args.save_dir = util.get_save_dir(args.save_dir, args.name, training=True) log = util.get_logger(args.save_dir, args.name) tbx = SummaryWriter(args.save_dir) device, args.gpu_ids = util.get_available_devices() log.info(f'Args: {dumps(vars(args), indent=4, sort_keys=True)}') args.batch_size *= max(1, len(args.gpu_ids)) # Set random seed log.info(f'Using random seed {args.seed}...') random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) torch.cuda.manual_seed_all(args.seed) # Get embeddings log.info('Loading embeddings...') word_vectors = util.torch_from_json(args.word_emb_file) char_vec = util.torch_from_json(args.char_emb_file) # Get model log.info('Building model...') if args.name == 'baseline': model = BiDAF(word_vectors=word_vectors, hidden_size=args.hidden_size, drop_prob=args.drop_prob) elif args.name == 'charembeddings': model = BiDAFChar(word_vectors=word_vectors, char_vec=char_vec, word_len=16, hidden_size=args.hidden_size, drop_prob=args.drop_prob) elif args.name == 'charembeddings2': model = BiDAFChar2(word_vectors=word_vectors, char_vec=char_vec, word_len=16, hidden_size=args.hidden_size, drop_prob=args.drop_prob) elif args.name == 'qanet': model = QANet(word_vectors=word_vectors, char_vec=char_vec, word_len=16, emb_size=args.hidden_size, drop_prob=args.drop_prob, enc_size=args.enc_size, n_head=args.n_head, LN_train=args.ln_train, DP_residual=args.dp_res, mask_pos=args.mask_pos, two_pos=args.two_pos, total_prob=args.total_drop, final_prob=args.final_prob) elif args.name == 'qanet2': model = QANet2(word_vectors=word_vectors, char_vec=char_vec, word_len=16, emb_size=args.hidden_size, drop_prob=args.drop_prob, enc_size=args.enc_size, n_head=args.n_head, LN_train=args.ln_train, DP_residual=args.dp_res, mask_pos=args.mask_pos, two_pos=args.two_pos, rel=args.rel_att, total_prob=args.total_drop, final_prob=args.final_prob, freeze=args.freeze_emb) elif args.name == 'qanet3': model = QANet3(word_vectors=word_vectors, char_vec=char_vec, word_len=16, emb_size=args.hidden_size, drop_prob=args.drop_prob, enc_size=args.enc_size, n_head=args.n_head, LN_train=args.ln_train, DP_residual=args.dp_res, mask_pos=args.mask_pos, two_pos=args.two_pos, rel=args.rel_att, total_prob=args.total_drop, final_prob=args.final_prob, freeze=args.freeze_emb) elif args.name == 'qanet4': model = QANet4(word_vectors=word_vectors, char_vec=char_vec, word_len=16, emb_size=args.hidden_size, drop_prob=args.drop_prob, enc_size=args.enc_size, n_head=args.n_head, LN_train=args.ln_train, DP_residual=args.dp_res, mask_pos=args.mask_pos, two_pos=args.two_pos, rel=args.rel_att, total_prob=args.total_drop, final_prob=args.final_prob, freeze=args.freeze_emb) else: raise ValueError('Wrong model name') model = nn.DataParallel(model, args.gpu_ids) if args.load_path: log.info(f'Loading checkpoint from {args.load_path}...') model, step = util.load_model(model, args.load_path, args.gpu_ids) else: step = 0 model = model.to(device) model.train() ema = util.EMA(model, args.ema_decay) # Get saver saver = util.CheckpointSaver(args.save_dir, max_checkpoints=args.max_checkpoints, metric_name=args.metric_name, maximize_metric=args.maximize_metric, log=log) # Get optimizer and scheduler if args.name == 'qanet': optimizer = optim.Adam(model.parameters(), args.lr, betas=(0.8, 0.999), weight_decay=3 * 1e-7, eps=1e-7) scheduler = warmup(optimizer, 1, 2000) elif args.opt == 'adam': if args.grad_cent: optimizer = AdamWGC(model.parameters(), args.lr, betas=(0.9, 0.999), weight_decay=3 * 1e-7, eps=1e-7, use_gc=True) else: optimizer = AdamW(model.parameters(), args.lr, betas=(0.8, 0.999), weight_decay=3 * 1e-7, eps=1e-7) scheduler = warmup(optimizer, 1, 2000) elif args.opt == 'adadelta': optimizer = optim.Adadelta(model.parameters(), args.lr, weight_decay=3 * 1e-7) scheduler = sched.LambdaLR(optimizer, lambda s: 1.) # Constant LR elif args.opt == 'sgd': optimizer = optim.SGD(model.parameters(), args.lr, weight_decay=3 * 1e-7) scheduler = sched.LambdaLR(optimizer, lambda s: 1.) # Constant LR # Get data loader log.info('Building dataset...') train_dataset = SQuAD(args.train_record_file, args.use_squad_v2) train_loader = data.DataLoader(train_dataset, batch_size=args.batch_size, shuffle=True, num_workers=args.num_workers, collate_fn=collate_fn) dev_dataset = SQuAD(args.dev_record_file, args.use_squad_v2) dev_loader = data.DataLoader(dev_dataset, batch_size=args.batch_size, shuffle=False, num_workers=args.num_workers, collate_fn=collate_fn) # Train log.info('Training...') steps_till_eval = args.eval_steps epoch = step // len(train_dataset) i = 0 while epoch != args.num_epochs: epoch += 1 log.info(f'Starting epoch {epoch}...') with torch.enable_grad(), \ tqdm(total=len(train_loader.dataset)) as progress_bar: for cw_idxs, cc_idxs, qw_idxs, qc_idxs, y1, y2, ids in train_loader: # Setup for forward cw_idxs = cw_idxs.to(device) qw_idxs = qw_idxs.to(device) batch_size = cw_idxs.size(0) # Forward log_p1, log_p2 = model(cw_idxs, cc_idxs, qw_idxs, qc_idxs) y1, y2 = y1.to(device), y2.to(device) loss = F.nll_loss(log_p1, y1) + F.nll_loss(log_p2, y2) loss_val = loss.item() i += 1 loss /= args.acc_step # Backward loss.backward() if i % args.acc_step == 0: nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm) optimizer.step() scheduler.step(i // (args.acc_step)) ema(model, i // (args.acc_step)) optimizer.zero_grad() # Log info step += batch_size progress_bar.update(batch_size) progress_bar.set_postfix(epoch=epoch, NLL=loss_val) tbx.add_scalar('train/NLL', loss_val, step) tbx.add_scalar('train/LR', optimizer.param_groups[0]['lr'], step) steps_till_eval -= batch_size if steps_till_eval <= 0 and i % args.acc_step == 0: steps_till_eval = args.eval_steps # Evaluate and save checkpoint log.info(f'Evaluating at step {step}...') ema.assign(model) results, pred_dict = evaluate(model, dev_loader, device, args.dev_eval_file, args.max_ans_len, args.use_squad_v2) saver.save(step, model, results[args.metric_name], device) ema.resume(model) # Log to console results_str = ', '.join(f'{k}: {v:05.2f}' for k, v in results.items()) log.info(f'Dev {results_str}') # Log to TensorBoard log.info('Visualizing in TensorBoard...') for k, v in results.items(): tbx.add_scalar(f'dev/{k}', v, step) util.visualize(tbx, pred_dict=pred_dict, eval_path=args.dev_eval_file, step=step, split='dev', num_visuals=args.num_visuals)
def main(args): print("in main") print("args: ", args) if True: args.save_dir = util.get_save_dir(args.save_dir, args.name, training=True) log = util.get_logger(args.save_dir, args.name) tbx = SummaryWriter(args.save_dir) device, args.gpu_ids = util.get_available_devices() log.info('Args: {}'.format(dumps(vars(args), indent=4, sort_keys=True))) args.batch_size *= max(1, len(args.gpu_ids)) # Set random seed log.info('Using random seed {}...'.format(args.seed)) random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) torch.cuda.manual_seed_all(args.seed) # Get embeddings log.info('Loading embeddings...') # CHECK IF WE NEED TO USE ALL OF THESE???? word_vectors = util.torch_from_json(args.word_emb_file) # Get model log.info('Building model...') model = BiDAF(word_vectors=word_vectors, hidden_size=args.hidden_size, drop_prob=args.drop_prob) model = nn.DataParallel(model, args.gpu_ids) if args.load_path: log.info('Loading checkpoint from {}...'.format(args.load_path)) model, step = util.load_model(model, args.load_path, args.gpu_ids) else: step = 0 model = model.to(device) model.train() ema = util.EMA(model, args.ema_decay) # Get saver saver = util.CheckpointSaver(args.save_dir, max_checkpoints=args.max_checkpoints, metric_name=args.metric_name, maximize_metric=args.maximize_metric, log=log) # Get optimizer and scheduler optimizer = optim.Adadelta(model.parameters(), args.lr, weight_decay=args.l2_wd) scheduler = sched.LambdaLR(optimizer, lambda s: 1.) # Constant LR # Get data loader log.info('Building dataset...') train_dataset = SQuAD(args.train_record_file, args.use_squad_v2) print("train dataset!: ", train_dataset) train_loader = data.DataLoader(train_dataset, batch_size=args.batch_size, shuffle=True, num_workers=args.num_workers, collate_fn=collate_fn) dev_dataset = SQuAD(args.dev_record_file, args.use_squad_v2) dev_loader = data.DataLoader(dev_dataset, batch_size=args.batch_size, shuffle=False, num_workers=args.num_workers, collate_fn=collate_fn) # Train log.info('Training...') steps_till_eval = args.eval_steps epoch = step // len(train_dataset) while epoch != args.num_epochs: epoch += 1 log.info('Starting epoch {}...'.format(epoch)) with torch.enable_grad(), \ tqdm(total=len(train_loader.dataset)) as progress_bar: for cw_idxs, cc_idxs, qw_idxs, qc_idxs, y1, y2, ids in train_loader: # Setup for forward cw_idxs = cw_idxs.to(device) qw_idxs = qw_idxs.to(device) batch_size = cw_idxs.size(0) optimizer.zero_grad() # Forward log_p1, log_p2 = model(cw_idxs, qw_idxs) y1, y2 = y1.to(device), y2.to(device) loss = F.nll_loss(log_p1, y1) + F.nll_loss(log_p2, y2) loss_val = loss.item() # Backward loss.backward() nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm) optimizer.step() scheduler.step(step // batch_size) ema(model, step // batch_size) # Log info step += batch_size progress_bar.update(batch_size) progress_bar.set_postfix(epoch=epoch, NLL=loss_val) tbx.add_scalar('train/NLL', loss_val, step) tbx.add_scalar('train/LR', optimizer.param_groups[0]['lr'], step) steps_till_eval -= batch_size if steps_till_eval <= 0: steps_till_eval = args.eval_steps # Evaluate and save checkpoint log.info('Evaluating at step {}...'.format(step)) ema.assign(model) results, pred_dict = evaluate(model, dev_loader, device, args.dev_eval_file, args.max_ans_len, args.use_squad_v2) saver.save(step, model, results[args.metric_name], device) ema.resume(model) # Log to console results_str = ', '.join('{}: {:05.2f}'.format(k, v) for k, v in results.items()) log.info('Dev {}'.format(results_str)) # Log to TensorBoard log.info('Visualizing in TensorBoard...') for k, v in results.items(): tbx.add_scalar('dev/{}'.format(k), v, step) util.visualize(tbx, pred_dict=pred_dict, eval_path=args.dev_eval_file, step=step, split='dev', num_visuals=args.num_visuals)
def main(args): # Set up logging and devices args.save_dir = util.get_save_dir(args.save_dir, args.name, training=True) log = util.get_logger(args.save_dir, args.name) tbx = SummaryWriter(args.save_dir) device, args.gpu_ids = util.get_available_devices() log.info(f'Args: {dumps(vars(args), indent=4, sort_keys=True)}') args.batch_size *= max(1, len(args.gpu_ids)) # Set random seed log.info(f'Using random seed {args.seed}...') random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) torch.cuda.manual_seed_all(args.seed) # Get embeddings log.info('Loading embeddings...') word_vectors = util.torch_from_json(args.word_emb_file) if args.char_emb: char_vectors = util.torch_from_json(args.char_emb_file) else: char_vectors = None # Get model log.info('Building model...') # model = BiDAF(word_vectors=word_vectors, # hidden_size=args.hidden_size, # drop_prob=args.drop_prob) if args.model == 'attentive_reader': model = AttentiveReaderModel(word_vectors=word_vectors, hidden_size=args.hidden_size, drop_prob=args.drop_prob, rnn_layers=args.rnn_layers, use_gru=args.use_gru) elif args.model == 'bidaf': model = BiDAF(word_vectors=word_vectors, char_vectors=char_vectors, hidden_size=args.hidden_size, drop_prob=args.drop_prob, use_gru=args.use_gru, share_rnn=args.share_rnn, highway=args.highway, share_proj=args.share_proj) elif args.model == 'bidaf_stanford': model = BiDAFStanford(word_vectors=word_vectors, hidden_size=args.hidden_size, drop_prob=args.drop_prob) model = nn.DataParallel(model, args.gpu_ids) if args.load_path: log.info(f'Loading checkpoint from {args.load_path}...') model, step = util.load_model(model, args.load_path, args.gpu_ids) else: step = 0 model = model.to(device) model.train() ema = util.EMA(model, args.ema_decay) # Get saver saver = util.CheckpointSaver(args.save_dir, max_checkpoints=args.max_checkpoints, metric_name=args.metric_name, maximize_metric=args.maximize_metric, log=log) # Get optimizer and scheduler if args.optim == 'adam': optimizer = optim.Adam(model.parameters(), args.lr, weight_decay=args.l2_wd) elif args.optim == 'adamw': optimizer = optim.AdamW(model.parameters(), args.lr, weight_decay=args.l2_wd) elif args.optim == 'adadelta': optimizer = optim.Adadelta(model.parameters(), args.lr, weight_decay=args.l2_wd) if args.lr_sched == 'plateau': scheduler = sched.LambdaLR(optimizer, lambda s: 1.) # Constant LR else: # if args.lr_sched == 'const': scheduler = sched.ReduceLROnPlateau(optimizer) # Constant LR # Get data loader log.info('Building dataset...') train_dataset = SQuAD(args.train_record_file, args.use_squad_v2) log.info(' Loaded raw SQuAD train dataset...') train_loader = data.DataLoader(train_dataset, batch_size=args.batch_size, shuffle=True, num_workers=args.num_workers, collate_fn=collate_fn) log.info(' Built train data loader...') dev_dataset = SQuAD(args.dev_record_file, args.use_squad_v2) log.info(' Loaded raw SQuAD dev dataset...') dev_loader = data.DataLoader(dev_dataset, batch_size=args.batch_size, shuffle=False, num_workers=args.num_workers, collate_fn=collate_fn) log.info(' Built dev data loader...') # Train log.info('Training...') steps_till_eval = args.eval_steps epoch = step // len(train_dataset) while epoch != args.num_epochs: epoch += 1 log.info(f'Starting epoch {epoch}...') with torch.enable_grad(), \ tqdm(total=len(train_loader.dataset)) as progress_bar: for cw_idxs, cc_idxs, qw_idxs, qc_idxs, y1, y2, ids in train_loader: # Setup for forward cw_idxs = cw_idxs.to(device) qw_idxs = qw_idxs.to(device) if args.char_emb: cc_idxs = cc_idxs.to(device) qc_idxs = qc_idxs.to(device) batch_size = cw_idxs.size(0) optimizer.zero_grad() # Forward if args.char_emb: log_p1, log_p2 = model(cw_idxs, qw_idxs, cc_idxs, qc_idxs) else: log_p1, log_p2 = model(cw_idxs, qw_idxs) y1, y2 = y1.to(device), y2.to(device) loss = F.nll_loss(log_p1, y1) + F.nll_loss(log_p2, y2) loss_val = loss.item() # Backward loss.backward() nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm) optimizer.step() scheduler.step(step // batch_size) ema(model, step // batch_size) # Log info step += batch_size progress_bar.update(batch_size) progress_bar.set_postfix(epoch=epoch, NLL=loss_val) tbx.add_scalar('train/NLL', loss_val, step) tbx.add_scalar('train/LR', optimizer.param_groups[0]['lr'], step) steps_till_eval -= batch_size if steps_till_eval <= 0: steps_till_eval = args.eval_steps # Evaluate and save checkpoint log.info(f'Evaluating at step {step}...') ema.assign(model) results, pred_dict = evaluate(model, dev_loader, device, args.dev_eval_file, args.max_ans_len, args.use_squad_v2, args.char_emb) saver.save(step, model, results[args.metric_name], device) ema.resume(model) # Log to console results_str = ', '.join(f'{k}: {v:05.2f}' for k, v in results.items()) log.info(f'Dev {results_str}') # Log to TensorBoard log.info('Visualizing in TensorBoard...') for k, v in results.items(): tbx.add_scalar(f'dev/{k}', v, step) util.visualize(tbx, pred_dict=pred_dict, eval_path=args.dev_eval_file, step=step, split='dev', num_visuals=args.num_visuals)