def main():
    # load settings for training
    parser = argparse.ArgumentParser(
        description='predict.py',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    config.preprocess_opts(parser)
    config.model_opts(parser)
    config.train_opts(parser)
    config.predict_opts(parser)
    config.transformer_opts(parser)
    opt = parser.parse_args()

    if opt.seed > 0:
        torch.manual_seed(opt.seed)

    # print(opt.gpuid)
    if torch.cuda.is_available() and not opt.gpuid:
        opt.gpuid = 0

    opt.exp = 'predict.' + opt.exp
    if hasattr(opt, 'copy_model') and opt.copy_model:
        opt.exp += '.copy'
    if hasattr(opt, 'bidirectional'):
        if opt.bidirectional:
            opt.exp += '.bi-directional'
        else:
            opt.exp += '.uni-directional'

    # fill time into the name
    if opt.exp_path.find('%s') > 0:
        opt.exp_path = opt.exp_path % (opt.exp, opt.timemark)
        opt.pred_path = opt.pred_path % (opt.exp, opt.timemark)

    if not os.path.exists(opt.exp_path):
        os.makedirs(opt.exp_path)
    if not os.path.exists(opt.pred_path):
        os.makedirs(opt.pred_path)

    logging = config.init_logging(logger_name=None,
                                  log_file=opt.exp_path + '/output.log',
                                  stdout=True)

    try:
        opt.train_from = 'model/kp20k.ml.copy.bi-directional.20180908-054257/kp20k.ml.copy.bi-directional.epoch=9.batch=2932.model'
        test_data_loader, word2id, id2word, vocab = load_data_vocab(opt, load_train=False)
        model = init_model(opt)
        generator = SequenceGenerator(model, opt,
                                      eos_id=opt.word2id[pykp.io.EOS_WORD],
                                      beam_size=opt.beam_size,
                                      max_sequence_length=opt.max_sent_length)
        evaluate_beam_search(generator, test_data_loader, opt,
                             title='predict',
                             save_path=opt.pred_path + '/[epoch=%d,batch=%d,total_batch=%d]test_result.csv' % (0, 0, 0))
    except Exception as e:
        logging.exception("message")
def load(self):
    # load the pre-built vocabulary (a (word2id, id2word, vocab) tuple)
    word2id, id2word, vocab = torch.load(open(self.model_opts.vocab, 'rb'))
    self.model_opts.word2id = word2id
    self.model_opts.id2word = id2word
    self.model_opts.vocab = vocab
    self.model = init_model(self.model_opts)
    self.generator = SequenceGenerator(self.model,
                                       eos_id=self.model_opts.word2id[pykp.io.EOS_WORD],
                                       beam_size=self.model_opts.beam_size,
                                       max_sequence_length=self.model_opts.max_sent_length)
def evaluate_per_epoch(model, eval_dataloader, opt, epoch):
    generator = SequenceGenerator(model, opt,
                                  eos_id=opt.word2id[pykp.io.EOS_WORD],
                                  beam_size=opt.beam_size,
                                  max_sequence_length=opt.max_sent_length)
    evaluate_beam_search(generator, eval_dataloader, opt,
                         title='predict',
                         save_path=opt.pred_path + '/epoch=%s' % (epoch))
def main():
    opt = config.init_opt(description='predict_keyphrase.py')
    logger = config.init_logging('predict_keyphrase', opt.exp_path + '/output.log', redirect_to_stdout=False)

    logger.info('EXP_PATH : ' + opt.exp_path)
    logger.info('Parameters:')
    [logger.info('%s : %s' % (k, str(v))) for k, v in opt.__dict__.items()]

    logger.info('====================== Checking GPU Availability =========================')
    if torch.cuda.is_available():
        if isinstance(opt.device_ids, int):
            opt.device_ids = [opt.device_ids]
        logger.info('Running on %s! devices=%s'
                    % ('MULTIPLE GPUs' if len(opt.device_ids) > 1 else '1 GPU', str(opt.device_ids)))
    else:
        logger.info('Running on CPU!')

    try:
        one2one, one2many = generate_dataset()
        test_data_loaders, word2id, id2word, vocab = load_vocab_and_testsets(opt, one2one, one2many)
        model = init_model(opt)
        generator = SequenceGenerator(model,
                                      eos_id=opt.word2id[pykp.io.EOS_WORD],
                                      beam_size=opt.beam_size,
                                      max_sequence_length=opt.max_sent_length)

        for testset_name, test_data_loader in zip(['kp20k'], test_data_loaders):
            logger.info('Evaluating %s' % testset_name)
            output = predict_beam_search(generator, test_data_loader, opt,
                                         title='test_%s' % testset_name,
                                         predict_save_path=None)  # opt.pred_path + '/%s_test_result/' % (testset_name)
            print(output)
    except Exception as e:
        logger.error(e, exc_info=True)
def evaluate_per_epoch(model, eval_dataloader, opt):
    generator = SequenceGenerator(model, opt,
                                  eos_id=opt.word2id[pykp.io.EOS_WORD],
                                  beam_size=opt.beam_size,
                                  max_sequence_length=opt.max_sent_length)
    # recover epoch/batch/total_batch from the checkpoint file name
    model_path = opt.train_from.split('/')[-1]
    _, epoch, batch, total_batch = re.findall(r'\d+', model_path)
    evaluate_beam_search(generator, eval_dataloader, opt,
                         title='predict',
                         save_path=opt.pred_path + '/epoch=%s,batch=%s,total_batch=%s' % (epoch, batch, total_batch))
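# Illustration only (not called by the code above): a minimal sketch of what the
# filename parse in evaluate_per_epoch yields, using a checkpoint name in this
# repo's own save format ('%s.epoch=%d.batch=%d.total_batch=%d' + '.model').
# The first match comes from the '20' in 'kp20k', which is why it is discarded.
import re

model_path = 'kp20k.ml.copy.epoch=2.batch=15495.total_batch=38000.model'
_, epoch, batch, total_batch = re.findall(r'\d+', model_path)
print(epoch, batch, total_batch)  # -> 2 15495 38000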
def train_model(model, optimizer_ml, optimizer_rl, criterion, train_data_loader, valid_data_loader, test_data_loader, opt):
    generator = SequenceGenerator(model,
                                  eos_id=opt.word2id[pykp.io.EOS_WORD],
                                  beam_size=opt.beam_size,
                                  max_sequence_length=opt.max_sent_length)

    logging.info('====================== Checking GPU Availability =========================')
    if torch.cuda.is_available():
        if isinstance(opt.gpuid, int):
            opt.gpuid = [opt.gpuid]
        logging.info('Running on GPU! devices=%s' % str(opt.gpuid))
        # model = nn.DataParallel(model, device_ids=opt.gpuid)
    else:
        logging.info('Running on CPU!')

    logging.info('====================== Start Training =========================')

    checkpoint_names = []
    train_ml_history_losses = []
    train_rl_history_losses = []
    valid_history_losses = []
    test_history_losses = []
    # best_loss = sys.float_info.max  # for normal training/testing loss (likelihood)
    best_loss = 0.0  # for f-score
    stop_increasing = 0

    train_ml_losses = []
    train_rl_losses = []
    total_batch = -1
    early_stop_flag = False

    if opt.train_rl:
        reward_cache = RewardCache(2000)

    if False:  # opt.train_from:
        state_path = opt.train_from.replace('.model', '.state')
        logging.info('Loading training state from: %s' % state_path)
        if os.path.exists(state_path):
            (epoch, total_batch, best_loss, stop_increasing, checkpoint_names,
             train_ml_history_losses, train_rl_history_losses,
             valid_history_losses, test_history_losses) = torch.load(open(state_path, 'rb'))
            opt.start_epoch = epoch

    for epoch in range(opt.start_epoch, opt.epochs):
        if early_stop_flag:
            break

        progbar = Progbar(logger=logging, title='Training',
                          target=len(train_data_loader),
                          batch_size=train_data_loader.batch_size,
                          total_examples=len(train_data_loader.dataset.examples))

        for batch_i, batch in enumerate(train_data_loader):
            model.train()
            total_batch += 1
            one2many_batch, one2one_batch = batch
            report_loss = []

            # Training
            if opt.train_ml:
                loss_ml, decoder_log_probs = train_ml(one2one_batch, model, optimizer_ml, criterion, opt)
                train_ml_losses.append(loss_ml)
                report_loss.append(('train_ml_loss', loss_ml))
                report_loss.append(('PPL', loss_ml))

                # Brief report
                if batch_i % opt.report_every == 0:
                    brief_report(epoch, batch_i, one2one_batch, loss_ml, decoder_log_probs, opt)

            # do not apply RL in the 0th epoch; we need a reasonable model before that.
            if opt.train_rl:
                if epoch >= opt.rl_start_epoch:
                    loss_rl = train_rl(one2many_batch, model, optimizer_rl, generator, opt, reward_cache)
                else:
                    loss_rl = 0.0
                train_rl_losses.append(loss_rl)
                report_loss.append(('train_rl_loss', loss_rl))

            progbar.update(epoch, batch_i, report_loss)

            # Validate and save checkpoint
            if (opt.run_valid_every == -1 and batch_i == len(train_data_loader) - 1) or \
                    (opt.run_valid_every > -1 and total_batch > 1 and total_batch % opt.run_valid_every == 0):
                logging.info('*' * 50)
                logging.info('Run validating and testing @Epoch=%d,#(Total batch)=%d' % (epoch, total_batch))

                # valid_losses = _valid_error(valid_data_loader, model, criterion, epoch, opt)
                # valid_history_losses.append(valid_losses)
                valid_score_dict = evaluate_beam_search(generator, valid_data_loader, opt,
                                                        title='Validating, epoch=%d, batch=%d, total_batch=%d' % (epoch, batch_i, total_batch),
                                                        epoch=epoch,
                                                        save_path=opt.pred_path + '/epoch%d_batch%d_total_batch%d' % (epoch, batch_i, total_batch))
                test_score_dict = evaluate_beam_search(generator, test_data_loader, opt,
                                                       title='Testing, epoch=%d, batch=%d, total_batch=%d' % (epoch, batch_i, total_batch),
                                                       epoch=epoch,
                                                       save_path=opt.pred_path + '/epoch%d_batch%d_total_batch%d' % (epoch, batch_i, total_batch))

                checkpoint_names.append('epoch=%d-batch=%d-total_batch=%d' % (epoch, batch_i, total_batch))

                curve_names = []
                scores = []
                if opt.train_ml:
                    train_ml_history_losses.append(copy.copy(train_ml_losses))
                    scores += [train_ml_history_losses]
                    curve_names += ['Training ML Error']
                    train_ml_losses = []
                if opt.train_rl:
                    train_rl_history_losses.append(copy.copy(train_rl_losses))
                    scores += [train_rl_history_losses]
                    curve_names += ['Training RL Reward']
                    train_rl_losses = []

                valid_history_losses.append(valid_score_dict)
                test_history_losses.append(test_score_dict)

                scores += [[result_dict[name] for result_dict in valid_history_losses] for name in opt.report_score_names]
                curve_names += ['Valid-' + name for name in opt.report_score_names]
                scores += [[result_dict[name] for result_dict in test_history_losses] for name in opt.report_score_names]
                curve_names += ['Test-' + name for name in opt.report_score_names]
                scores = [np.asarray(s) for s in scores]

                # Plot the learning curve
                plot_learning_curve(scores=scores,
                                    curve_names=curve_names,
                                    checkpoint_names=checkpoint_names,
                                    title='Training Validation & Test',
                                    save_path=opt.exp_path + '/[epoch=%d,batch=%d,total_batch=%d]train_valid_test_curve.png' % (epoch, batch_i, total_batch))

                '''
                determine if early stop training (whether f-score increased; previously whether valid error decreased)
                '''
                valid_loss = np.average(valid_history_losses[-1][opt.report_score_names[0]])
                is_best_loss = valid_loss > best_loss
                rate_of_change = float(valid_loss - best_loss) / float(best_loss) if float(best_loss) > 0 else 0.0

                # valid f-score doesn't increase
                if rate_of_change <= 0:
                    stop_increasing += 1
                else:
                    stop_increasing = 0

                if is_best_loss:
                    logging.info('Validation: update best loss (%.4f --> %.4f), rate of change (ROC)=%.2f'
                                 % (best_loss, valid_loss, rate_of_change * 100))
                else:
                    logging.info('Validation: best loss is not updated for %d times (%.4f --> %.4f), rate of change (ROC)=%.2f'
                                 % (stop_increasing, best_loss, valid_loss, rate_of_change * 100))

                best_loss = max(valid_loss, best_loss)

                # only store the checkpoints that make better validation performances
                if total_batch > 1 and (total_batch % opt.save_model_every == 0 or is_best_loss):  # epoch >= opt.start_checkpoint_at and
                    # Save the checkpoint
                    logging.info('Saving checkpoint to: %s' % os.path.join(opt.model_path, '%s.epoch=%d.batch=%d.total_batch=%d.error=%f' % (opt.exp, epoch, batch_i, total_batch, valid_loss) + '.model'))
                    torch.save(
                        model.state_dict(),
                        open(os.path.join(opt.model_path, '%s.epoch=%d.batch=%d.total_batch=%d' % (opt.exp, epoch, batch_i, total_batch) + '.model'), 'wb'))
                    torch.save(
                        (epoch, total_batch, best_loss, stop_increasing, checkpoint_names,
                         train_ml_history_losses, train_rl_history_losses,
                         valid_history_losses, test_history_losses),
                        open(os.path.join(opt.model_path, '%s.epoch=%d.batch=%d.total_batch=%d' % (opt.exp, epoch, batch_i, total_batch) + '.state'), 'wb'))

                if stop_increasing >= opt.early_stop_tolerance:
                    logging.info('Have not increased for %d epochs, early stop training' % stop_increasing)
                    early_stop_flag = True
                    break
                logging.info('*' * 50)
def train_model(model, optimizer, criterion, train_data_loader, valid_data_loader, test_data_loader, opt):
    generator = SequenceGenerator(model,
                                  eos_id=opt.word2id[pykp.io.EOS_WORD],
                                  beam_size=opt.beam_size,
                                  max_sequence_length=opt.max_sent_length)

    logging.info('====================== Checking GPU Availability =========================')
    if torch.cuda.is_available():
        if isinstance(opt.gpuid, int):
            opt.gpuid = [opt.gpuid]
        logging.info('Running on GPU! devices=%s' % str(opt.gpuid))
        # model = nn.DataParallel(model, device_ids=opt.gpuid)
    else:
        logging.info('Running on CPU!')

    logging.info('====================== Start Training =========================')

    checkpoint_names = []
    train_history_losses = []
    valid_history_losses = []
    test_history_losses = []
    # best_loss = sys.float_info.max  # for normal training/testing loss (likelihood)
    best_loss = 0.0  # for f-score
    stop_increasing = 0

    train_losses = []
    total_batch = 0
    early_stop_flag = False

    if opt.train_from:
        state_path = opt.train_from.replace('.model', '.state')
        logging.info('Loading training state from: %s' % state_path)
        if os.path.exists(state_path):
            (epoch, total_batch, best_loss, stop_increasing, checkpoint_names,
             train_history_losses, valid_history_losses, test_history_losses) = torch.load(open(state_path, 'rb'))
            opt.start_epoch = epoch

    for epoch in range(opt.start_epoch, opt.epochs):
        if early_stop_flag:
            break

        progbar = Progbar(title='Training', target=len(train_data_loader),
                          batch_size=train_data_loader.batch_size,
                          total_examples=len(train_data_loader.dataset))

        for batch_i, batch in enumerate(train_data_loader):
            model.train()
            batch_i += 1  # for the aesthetics of printing
            total_batch += 1
            one2many_batch, one2one_batch = batch
            src, trg, trg_target, trg_copy_target, src_ext, oov_lists = one2one_batch
            max_oov_number = max([len(oov) for oov in oov_lists])

            print("src size - ", src.size())
            print("target size - ", trg.size())

            if torch.cuda.is_available():
                src = src.cuda()
                trg = trg.cuda()
                trg_target = trg_target.cuda()
                trg_copy_target = trg_copy_target.cuda()
                src_ext = src_ext.cuda()

            optimizer.zero_grad()

            '''
            Training with Maximum Likelihood (word-level error)
            '''
            decoder_log_probs, _, _ = model.forward(src, trg, src_ext, oov_lists)

            # simply average losses of all the predictions
            # IMPORTANT: must use logits instead of probs to compute the loss, otherwise it's
            # extremely slow at the beginning (grads of probs are small)!
            start_time = time.time()

            if not opt.copy_model:
                ml_loss = criterion(
                    decoder_log_probs.contiguous().view(-1, opt.vocab_size),
                    trg_target.contiguous().view(-1))
            else:
                ml_loss = criterion(
                    decoder_log_probs.contiguous().view(-1, opt.vocab_size + max_oov_number),
                    trg_copy_target.contiguous().view(-1))

            '''
            Training with Reinforcement Learning (instance-level reward f-score)
            '''
            src_list, trg_list, _, trg_copy_target_list, src_oov_map_list, oov_list, src_str_list, trg_str_list = one2many_batch
            if torch.cuda.is_available():
                src_list = src_list.cuda()
                src_oov_map_list = src_oov_map_list.cuda()
            rl_loss = get_loss_rl()

            start_time = time.time()
            ml_loss.backward()
            print("--backward- %s seconds ---" % (time.time() - start_time))

            if opt.max_grad_norm > 0:
                pre_norm = torch.nn.utils.clip_grad_norm(model.parameters(), opt.max_grad_norm)
                after_norm = (sum([p.grad.data.norm(2) ** 2 for p in model.parameters() if p.grad is not None])) ** (1.0 / 2)
                logging.info('clip grad (%f -> %f)' % (pre_norm, after_norm))

            optimizer.step()

            train_losses.append(ml_loss.data[0])
            progbar.update(epoch, batch_i, [('train_loss', ml_loss.data[0]),
                                            ('PPL', ml_loss.data[0])])

            if batch_i > 1 and batch_i % opt.report_every == 0:
                logging.info('====================== %d =========================' % (batch_i))
                logging.info('Epoch : %d Minibatch : %d, Loss=%.5f' % (epoch, batch_i, np.mean(ml_loss.data[0])))

                sampled_size = 2
                logging.info('Printing predictions on %d sampled examples by greedy search' % sampled_size)

                if torch.cuda.is_available():
                    src = src.data.cpu().numpy()
                    decoder_log_probs = decoder_log_probs.data.cpu().numpy()
                    max_words_pred = decoder_log_probs.argmax(axis=-1)
                    trg_target = trg_target.data.cpu().numpy()
                    trg_copy_target = trg_copy_target.data.cpu().numpy()
                else:
                    src = src.data.numpy()
                    decoder_log_probs = decoder_log_probs.data.numpy()
                    max_words_pred = decoder_log_probs.argmax(axis=-1)
                    trg_target = trg_target.data.numpy()
                    trg_copy_target = trg_copy_target.data.numpy()

                sampled_trg_idx = np.random.random_integers(low=0, high=len(trg) - 1, size=sampled_size)
                src = src[sampled_trg_idx]
                oov_lists = [oov_lists[i] for i in sampled_trg_idx]
                max_words_pred = [max_words_pred[i] for i in sampled_trg_idx]
                decoder_log_probs = decoder_log_probs[sampled_trg_idx]
                if not opt.copy_model:
                    # use the real target trg_loss (the starting <BOS> has been removed and contains oov ground-truth)
                    trg_target = [trg_target[i] for i in sampled_trg_idx]
                else:
                    trg_target = [trg_copy_target[i] for i in sampled_trg_idx]

                for i, (src_wi, pred_wi, trg_i, oov_i) in enumerate(zip(src, max_words_pred, trg_target, oov_lists)):
                    nll_prob = -np.sum([decoder_log_probs[i][l][pred_wi[l]] for l in range(len(trg_i))])
                    find_copy = np.any([x >= opt.vocab_size for x in src_wi])
                    has_copy = np.any([x >= opt.vocab_size for x in trg_i])

                    # map word ids back to words; ids beyond the vocabulary point into the per-example OOV list
                    sentence_source = [opt.id2word[x] if x < opt.vocab_size else oov_i[x - opt.vocab_size] for x in src_wi]
                    sentence_pred = [opt.id2word[x] if x < opt.vocab_size else oov_i[x - opt.vocab_size] for x in pred_wi]
                    sentence_real = [opt.id2word[x] if x < opt.vocab_size else oov_i[x - opt.vocab_size] for x in trg_i]

                    sentence_source = sentence_source[:sentence_source.index('<pad>')] if '<pad>' in sentence_source else sentence_source
                    sentence_pred = sentence_pred[:sentence_pred.index('<pad>')] if '<pad>' in sentence_pred else sentence_pred
                    sentence_real = sentence_real[:sentence_real.index('<pad>')] if '<pad>' in sentence_real else sentence_real

                    logging.info('==================================================')
                    logging.info('Source: %s ' % (' '.join(sentence_source)))
                    logging.info('\t\tPred : %s (%.4f)' % (' '.join(sentence_pred), nll_prob) + (' [FIND COPY]' if find_copy else ''))
                    logging.info('\t\tReal : %s ' % (' '.join(sentence_real)) + (' [HAS COPY]' + str(trg_i) if has_copy else ''))

            if total_batch > 1 and total_batch % opt.run_valid_every == 0:
                logging.info('*' * 50)
                logging.info('Run validating and testing @Epoch=%d,#(Total batch)=%d' % (epoch, total_batch))

                # valid_losses = _valid_error(valid_data_loader, model, criterion, epoch, opt)
                # valid_history_losses.append(valid_losses)
                valid_score_dict = evaluate_beam_search(generator, valid_data_loader, opt,
                                                        title='valid', epoch=epoch,
                                                        save_path=opt.exp_path + '/epoch%d_batch%d_total_batch%d' % (epoch, batch_i, total_batch))
                test_score_dict = evaluate_beam_search(generator, test_data_loader, opt,
                                                       title='test', epoch=epoch,
                                                       save_path=opt.exp_path + '/epoch%d_batch%d_total_batch%d' % (epoch, batch_i, total_batch))

                checkpoint_names.append('epoch=%d-batch=%d-total_batch=%d' % (epoch, batch_i, total_batch))
                train_history_losses.append(copy.copy(train_losses))
                valid_history_losses.append(valid_score_dict)
                test_history_losses.append(test_score_dict)
                train_losses = []

                scores = [train_history_losses]
                curve_names = ['Training Error']
                scores += [[result_dict[name] for result_dict in valid_history_losses] for name in opt.report_score_names]
                curve_names += ['Valid-' + name for name in opt.report_score_names]
                scores += [[result_dict[name] for result_dict in test_history_losses] for name in opt.report_score_names]
                curve_names += ['Test-' + name for name in opt.report_score_names]
                scores = [np.asarray(s) for s in scores]

                # Plot the learning curve
                plot_learning_curve(scores=scores,
                                    curve_names=curve_names,
                                    checkpoint_names=checkpoint_names,
                                    title='Training Validation & Test',
                                    save_path=opt.exp_path + '/[epoch=%d,batch=%d,total_batch=%d]train_valid_test_curve.png' % (epoch, batch_i, total_batch))

                '''
                determine if early stop training (whether f-score increased; previously whether valid error decreased)
                '''
                valid_loss = np.average(valid_history_losses[-1][opt.report_score_names[0]])
                is_best_loss = valid_loss > best_loss
                rate_of_change = float(valid_loss - best_loss) / float(best_loss) if float(best_loss) > 0 else 0.0

                # valid f-score doesn't increase
                if rate_of_change <= 0:
                    stop_increasing += 1
                else:
                    stop_increasing = 0

                if is_best_loss:
                    logging.info('Validation: update best loss (%.4f --> %.4f), rate of change (ROC)=%.2f'
                                 % (best_loss, valid_loss, rate_of_change * 100))
                else:
                    logging.info('Validation: best loss is not updated for %d times (%.4f --> %.4f), rate of change (ROC)=%.2f'
                                 % (stop_increasing, best_loss, valid_loss, rate_of_change * 100))

                best_loss = max(valid_loss, best_loss)

                # only store the checkpoints that make better validation performances
                if total_batch > 1 and (total_batch % opt.save_model_every == 0 or is_best_loss):  # epoch >= opt.start_checkpoint_at and
                    # Save the checkpoint
                    logging.info('Saving checkpoint to: %s' % os.path.join(opt.save_path, '%s.epoch=%d.batch=%d.total_batch=%d.error=%f' % (opt.exp, epoch, batch_i, total_batch, valid_loss) + '.model'))
                    torch.save(
                        model.state_dict(),
                        open(os.path.join(opt.save_path, '%s.epoch=%d.batch=%d.total_batch=%d' % (opt.exp, epoch, batch_i, total_batch) + '.model'), 'wb'))
                    torch.save(
                        (epoch, total_batch, best_loss, stop_increasing, checkpoint_names,
                         train_history_losses, valid_history_losses, test_history_losses),
                        open(os.path.join(opt.save_path, '%s.epoch=%d.batch=%d.total_batch=%d' % (opt.exp, epoch, batch_i, total_batch) + '.state'), 'wb'))

                if stop_increasing >= opt.early_stop_tolerance:
                    logging.info('Have not increased for %d epochs, early stop training' % stop_increasing)
                    early_stop_flag = True
                    break
                logging.info('*' * 50)
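# Illustration only (not part of the training loop above): a minimal sketch of the
# copy-mechanism id-to-word mapping used when printing sampled predictions. The
# vocab_size, id2word and oov values here are made up; only the indexing rule
# (ids >= vocab_size index into the per-example OOV list) comes from the code.
vocab_size = 5
id2word = {0: '<pad>', 1: '<bos>', 2: '<eos>', 3: 'neural', 4: 'network'}
oov_i = ['keyphrase', 'seq2seq']  # per-example OOV words: ids vocab_size + 0, vocab_size + 1, ...
word_ids = [3, 4, 5, 6]
words = [id2word[x] if x < vocab_size else oov_i[x - vocab_size] for x in word_ids]
print(words)  # -> ['neural', 'network', 'keyphrase', 'seq2seq']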
def main():
    # load settings for training
    parser = argparse.ArgumentParser(
        description='predict.py',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    config.preprocess_opts(parser)
    config.model_opts(parser)
    config.train_opts(parser)
    config.predict_opts(parser)
    opt = parser.parse_args()

    if opt.seed > 0:
        torch.manual_seed(opt.seed)

    print(opt.gpuid)
    if torch.cuda.is_available() and not opt.gpuid:
        opt.gpuid = 0

    opt.exp = 'predict.' + opt.exp
    if hasattr(opt, 'copy_model') and opt.copy_model:
        opt.exp += '.copy'
    if hasattr(opt, 'bidirectional'):
        if opt.bidirectional:
            opt.exp += '.bi-directional'
        else:
            opt.exp += '.uni-directional'

    # fill time into the name
    if opt.exp_path.find('%s') > 0:
        opt.exp_path = opt.exp_path % (opt.exp, opt.timemark)
        opt.pred_path = opt.pred_path % (opt.exp, opt.timemark)

    if not os.path.exists(opt.exp_path):
        os.makedirs(opt.exp_path)
    if not os.path.exists(opt.pred_path):
        os.makedirs(opt.pred_path)

    logging = config.init_logging('train', opt.exp_path + '/output.log')

    logging.info('Parameters:')
    [logging.info('%s : %s' % (k, str(v))) for k, v in opt.__dict__.items()]

    try:
        train_data_loader, valid_data_loader, test_data_loader, word2id, id2word, vocab = load_data_vocab(opt, load_train=False)
        model = init_model(opt)
        # optimizer, criterion = init_optimizer_criterion(model, opt)
        generator = SequenceGenerator(model,
                                      eos_id=opt.word2id[pykp.io.EOS_WORD],
                                      beam_size=opt.beam_size,
                                      max_sequence_length=opt.max_sent_length)

        # import time
        # start_time = time.time()
        evaluate_beam_search(generator, test_data_loader, opt,
                             title='predict',
                             save_path=opt.pred_path + '/[epoch=%d,batch=%d,total_batch=%d]test_result.csv' % (0, 0, 0))
        # print("--- %s seconds --- Complete Beam Search" % (time.time() - start_time))
        # predict_greedy(model, test_data_loader, test_examples, opt)
    except Exception as e:
        logging.exception("message")
def main():
    opt = config.init_opt(description='predict.py')
    logger = config.init_logging('predict', opt.exp_path + '/output.log', redirect_to_stdout=False)

    logger.info('EXP_PATH : ' + opt.exp_path)
    logger.info('Parameters:')
    [logger.info('%s : %s' % (k, str(v))) for k, v in opt.__dict__.items()]

    logger.info('====================== Checking GPU Availability =========================')
    if torch.cuda.is_available():
        if isinstance(opt.device_ids, int):
            opt.device_ids = [opt.device_ids]
        logger.info('Running on %s! devices=%s'
                    % ('MULTIPLE GPUs' if len(opt.device_ids) > 1 else '1 GPU', str(opt.device_ids)))
    else:
        logger.info('Running on CPU!')

    try:
        valid_data_loaders, word2id, id2word, vocab = load_vocab_and_datasets_for_testing(
            dataset_names=opt.test_dataset_names, type='valid', opt=opt)
        test_data_loaders, _, _, _ = load_vocab_and_datasets_for_testing(
            dataset_names=opt.test_dataset_names, type='test', opt=opt)
        opt.word2id = word2id
        opt.id2word = id2word
        opt.vocab = vocab

        model = init_model(opt)
        generator = SequenceGenerator(model,
                                      eos_id=opt.word2id[pykp.io.EOS_WORD],
                                      beam_size=opt.beam_size,
                                      max_sequence_length=opt.max_sent_length)

        valid_score_dict = evaluate_multiple_datasets(generator, valid_data_loaders, opt,
                                                      title='valid',
                                                      predict_save_path=opt.pred_path)
        test_score_dict = evaluate_multiple_datasets(generator, test_data_loaders, opt,
                                                     title='test',
                                                     predict_save_path=opt.pred_path)

        # test_data_loaders, word2id, id2word, vocab = load_vocab_and_datasets(opt)
        # for testset_name, test_data_loader in zip(opt.test_dataset_names, test_data_loaders):
        #     logger.info('Evaluating %s' % testset_name)
        #     evaluate_beam_search(generator, test_data_loader, opt,
        #                          title='test_%s' % testset_name,
        #                          predict_save_path=opt.pred_path + '/%s_test_result/' % (testset_name))
    except Exception as e:
        logger.error(e, exc_info=True)
def train_model(model, optimizer_ml, optimizer_rl, criterion, train_data_loader, valid_data_loaders, test_data_loaders, opt):
    generator = SequenceGenerator(model,
                                  eos_id=opt.word2id[pykp.io.EOS_WORD],
                                  beam_size=opt.beam_size,
                                  max_sequence_length=opt.max_sent_length)

    logger = logging.getLogger('train.py')
    logger.info('====================== Checking GPU Availability =========================')
    if torch.cuda.is_available():
        if isinstance(opt.gpuid, int):
            opt.gpuid = [opt.gpuid]
        logger.info('Running on GPU! devices=%s' % str(opt.gpuid))
        # model = nn.DataParallel(model, device_ids=opt.gpuid)
        model = model.cuda()
    else:
        logger.info('Running on CPU!')

    logger.info('====================== Start Training =========================')

    checkpoint_names = []
    train_ml_history_losses = []
    train_rl_history_losses = []
    valid_history_scores = {}
    test_history_scores = {}
    # best_loss = sys.float_info.max  # for normal training/testing loss (likelihood)
    best_loss = 0.0  # for f-score
    stop_increasing = 0

    train_ml_losses = []
    train_rl_losses = []
    total_batch = -1
    early_stop_flag = False

    if opt.train_rl:
        reward_cache = RewardCache(2000)

    # if False:  # opt.train_from:
    #     state_path = opt.train_from.replace('.model', '.state')
    #     logger.info('Loading training state from: %s' % state_path)
    #     if os.path.exists(state_path):
    #         (epoch, total_batch, best_loss, stop_increasing, checkpoint_names,
    #          train_ml_history_losses, train_rl_history_losses, valid_history_scores,
    #          test_history_losses) = torch.load(open(state_path, 'rb'))
    #         opt.start_epoch = epoch

    for epoch in range(opt.start_epoch, opt.epochs):
        if early_stop_flag:
            break

        progbar = Progbar(logger=logger, title='Training',
                          target=len(train_data_loader),
                          batch_size=train_data_loader.batch_size,
                          total_examples=len(train_data_loader.dataset))

        for batch_i, batch in enumerate(train_data_loader):
            model.train()
            total_batch += 1
            one2many_batch, one2one_batch = batch
            report_loss = []

            # Training
            if opt.train_ml:
                loss_ml, decoder_log_probs = train_ml(one2one_batch, model, optimizer_ml, criterion, opt)
                # len(decoder_log_probs) == 0 if we encountered OOM
                if len(decoder_log_probs) == 0:
                    continue

                train_ml_losses.append(loss_ml)
                report_loss.append(('train_ml_loss', loss_ml))
                report_loss.append(('PPL', loss_ml))

                # Brief report
                if batch_i % opt.report_every == 0:
                    brief_report(epoch, batch_i, one2one_batch, loss_ml, decoder_log_probs, opt)

            # do not apply RL in the 0th epoch; we need a reasonable model before that.
            if opt.train_rl:
                if epoch >= opt.rl_start_epoch:
                    loss_rl = train_rl(one2many_batch, model, optimizer_rl, generator, opt, reward_cache)
                else:
                    loss_rl = 0.0
                train_rl_losses.append(loss_rl)
                report_loss.append(('train_rl_loss', loss_rl))

            progbar.update(epoch, batch_i, report_loss)

            '''
            Validate and save checkpoint
            '''
            if (opt.run_valid_every == -1 and batch_i == len(train_data_loader) - 1) or \
                    (opt.run_valid_every > -1 and total_batch > 1 and total_batch % opt.run_valid_every == 0):
                logger.info('*' * 50)
                logger.info('Run validating and testing @Epoch=%d,#(Total batch)=%d' % (epoch, total_batch))

                # each call returns a dict: key is the dataset name, value is a score dict
                valid_score_dict = evaluate.evaluate_multiple_datasets(
                    generator, valid_data_loaders, opt, epoch=epoch,
                    title='valid.epoch=%d.total_batch=%d' % (epoch, total_batch),
                    predict_save_path=os.path.join(opt.pred_path, 'epoch%d_batch%d_total_batch%d' % (epoch, batch_i, total_batch)))
                test_score_dict = evaluate.evaluate_multiple_datasets(
                    generator, test_data_loaders, opt, epoch=epoch,
                    title='test.epoch=%d.total_batch=%d' % (epoch, total_batch),
                    predict_save_path=os.path.join(opt.pred_path, 'epoch%d_batch%d_total_batch%d' % (epoch, batch_i, total_batch)))

                '''
                Merge scores of current round into history_score
                '''
                for dataset_name, score_dict in valid_score_dict.items():
                    # each history_score is a dict, specific to a dataset:
                    # key is a score name, value is a list whose elements are the
                    # per-example scores (e.g. f1_score) of one validation round
                    valid_history_score = valid_history_scores.get(dataset_name, {})
                    for score_name, score_values in score_dict.items():
                        history_score_values = valid_history_score.get(score_name, [])
                        history_score_values.append(score_values)
                        valid_history_score[score_name] = history_score_values
                    valid_history_scores[dataset_name] = valid_history_score

                for dataset_name, score_dict in test_score_dict.items():
                    test_history_score = test_history_scores.get(dataset_name, {})
                    for score_name, score_values in score_dict.items():
                        history_score_values = test_history_score.get(score_name, [])
                        history_score_values.append(score_values)
                        test_history_score[score_name] = history_score_values
                    test_history_scores[dataset_name] = test_history_score

                if opt.train_ml:
                    train_ml_history_losses.append(copy.copy(train_ml_losses))
                    train_ml_losses = []
                if opt.train_rl:
                    train_rl_history_losses.append(copy.copy(train_rl_losses))
                    train_rl_losses = []

                '''
                Iterate each dataset (including a merged 'all_datasets') and plot learning curves
                '''
                for dataset_name in opt.test_dataset_names + ['all_datasets']:
                    valid_history_score = valid_history_scores[dataset_name]
                    test_history_score = test_history_scores[dataset_name]

                    curve_names = []
                    scores_for_plot = []
                    if opt.train_ml:
                        scores_for_plot += [train_ml_history_losses]
                        curve_names += ['Training ML Error']
                    if opt.train_rl:
                        scores_for_plot += [train_rl_history_losses]
                        curve_names += ['Training RL Reward']

                    scores_for_plot += [valid_history_score[name] for name in opt.report_score_names]
                    curve_names += ['Valid-' + name for name in opt.report_score_names]
                    scores_for_plot += [test_history_score[name] for name in opt.report_score_names]
                    curve_names += ['Test-' + name for name in opt.report_score_names]
                    scores_for_plot = [np.asarray(s) for s in scores_for_plot]

                    '''
                    Plot the learning curve
                    '''
                    plot_learning_curve_and_write_csv(
                        scores=scores_for_plot,
                        curve_names=curve_names,
                        checkpoint_names=checkpoint_names,
                        title='Training Validation & Test of %s' % dataset_name,
                        save_path=opt.plot_path + '/[epoch=%d,batch=%d,total_batch=%d].%s.learning_curve' % (epoch, batch_i, total_batch, dataset_name))

                '''
                determine if early stop training (whether f-score increased; previously whether valid error decreased)
                opt.report_score_names[0] is 'f_score@5_exact'
                '''
                valid_loss = np.average(valid_history_scores['all_datasets'][opt.report_score_names[0]][-1])
                is_best_loss = valid_loss > best_loss
                rate_of_change = float(valid_loss - best_loss) / float(best_loss) if float(best_loss) > 0 else 0.0

                # valid f-score doesn't increase
                if rate_of_change <= 0:
                    stop_increasing += 1
                else:
                    stop_increasing = 0

                if is_best_loss:
                    logging.info('Validation: update best loss (%.4f --> %.4f), rate of change (ROC)=%.2f'
                                 % (best_loss, valid_loss, rate_of_change * 100))
                else:
                    logging.info('Validation: best loss is not updated for %d times (%.4f --> %.4f), rate of change (ROC)=%.2f'
                                 % (stop_increasing, best_loss, valid_loss, rate_of_change * 100))

                logging.info('Current test loss (over %d datasets): %s\n'
                             % (len(opt.test_dataset_names), str(opt.test_dataset_names)))
                for report_score_name in opt.report_score_names:
                    test_loss = np.average(test_history_scores['all_datasets'][report_score_name][-1])
                    logging.info('\t\t %s = %.4f' % (report_score_name, test_loss))

                best_loss = max(valid_loss, best_loss)

                '''
                Save checkpoints, only store the ones that make better validation performances
                '''
                checkpoint_names.append('epoch=%d-batch=%d-total_batch=%d' % (epoch, batch_i, total_batch))
                if total_batch > 1 and (total_batch % opt.save_model_every == 0 or is_best_loss):  # epoch >= opt.start_checkpoint_at and
                    # Save the checkpoint
                    logging.info('Saving checkpoint to: %s' % os.path.join(opt.model_path, '%s.epoch=%d.batch=%d.total_batch=%d.error=%f' % (opt.exp, epoch, batch_i, total_batch, valid_loss) + '.model'))
                    torch.save(
                        model.state_dict(),
                        open(os.path.join(opt.model_path, '%s.epoch=%d.batch=%d.total_batch=%d' % (opt.exp, epoch, batch_i, total_batch) + '.model'), 'wb'))
                    torch.save(
                        (epoch, total_batch, best_loss, stop_increasing, checkpoint_names,
                         train_ml_history_losses, train_rl_history_losses,
                         valid_history_scores, test_history_scores),
                        open(os.path.join(opt.model_path, '%s.epoch=%d.batch=%d.total_batch=%d' % (opt.exp, epoch, batch_i, total_batch) + '.state'), 'wb'))

                if stop_increasing >= opt.early_stop_tolerance:
                    logging.info('Have not increased for %d epochs, early stop training' % stop_increasing)
                    early_stop_flag = True
                    break
                logging.info('*' * 50)
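# Illustration only (not executed during training): the nesting that the score-merge
# loop above builds. The dataset name 'kp20k', the merged 'all_datasets' entry and the
# score name 'f_score@5_exact' follow the code and its comments; the numbers are made up.
valid_history_scores_example = {
    'kp20k': {
        'f_score@5_exact': [
            [0.21, 0.18, 0.25],  # per-example scores from validation round 1
            [0.24, 0.20, 0.27],  # per-example scores from validation round 2
        ],
    },
    'all_datasets': {
        'f_score@5_exact': [
            [0.21, 0.18, 0.25],
            [0.24, 0.20, 0.27],
        ],
    },
}
# early stopping uses the mean of the latest round on the merged entry:
# np.average(valid_history_scores['all_datasets']['f_score@5_exact'][-1])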
def main():
    opt = config.init_opt(description='predict.py')
    opt.data = 'data3/kp20k/kp20k'
    opt.vocab = 'data3/kp20k/kp20k.vocab.pt'
    # opt.train_from = 'exp/kp20k.ml.copy.20181129-193506/model/kp20k.ml.copy.epoch=1.batch=20000.total_batch=20000.model'
    opt.train_from = 'exp/kp20k.ml.copy.20181128-153121/model/kp20k.ml.copy.epoch=2.batch=15495.total_batch=38000.model'
    opt.useGpu = 0
    opt.encoder_type = 'rnn'
    opt.useCLF = False

    if opt.encoder_type.startswith('transformer'):
        opt.batch_size = 32
        opt.d_inner = 2048
        opt.enc_n_layers = 4
        opt.dec_n_layers = 2
        opt.n_head = 8
        opt.d_k = 64
        opt.d_v = 64
        opt.d_model = 512
        opt.word_vec_size = 512
        opt.run_valid_every = 5000000
        opt.save_model_every = 20000
        opt.decode_old = True
        # opt.copy_attention = False
    elif opt.encoder_type.startswith('bert'):
        opt.useOnlyTwo = False
        opt.avgHidden = True
        opt.useZeroDecodeHidden = False
        opt.useSameEmbeding = False
        opt.batch_size = 10
        opt.max_sent_length = 10
        opt.run_valid_every = 20000
        opt.decode_old = False
        opt.beam_search_batch_size = 10
        opt.bert_model = 'bert-base-uncased'
        opt.tokenizer = BertTokenizer.from_pretrained(opt.bert_model)
        if opt.encoder_type == 'bert_low':
            opt.copy_attention = False
    else:
        opt.enc_layers = 2
        opt.bidirectional = True
        opt.decode_old = True

    logger = config.init_logging('predict', opt.exp_path + '/output.log', redirect_to_stdout=False)

    logger.info('EXP_PATH : ' + opt.exp_path)
    logger.info('Parameters:')
    [logger.info('%s : %s' % (k, str(v))) for k, v in opt.__dict__.items()]

    logger.info('====================== Checking GPU Availability =========================')
    if torch.cuda.is_available() and opt.useGpu:
        if isinstance(opt.gpuid, int):
            opt.gpuid = [opt.gpuid]
        logger.info('Running on %s! devices=%s'
                    % ('MULTIPLE GPUs' if len(opt.gpuid) > 1 else '1 GPU', str(opt.gpuid)))
    else:
        logger.info('Running on CPU!')

    try:
        test_data_loaders, word2id, id2word, vocab = load_vocab_and_testsets(opt)
        model = init_model(opt)
        if torch.cuda.is_available() and opt.useGpu:
            model.cuda()
        generator = SequenceGenerator(model,
                                      opt.word_vec_size if opt.encoder_type == 'transformer' else opt.vocab_size,
                                      eos_id=opt.word2id[pykp.io.EOS_WORD],
                                      beam_size=opt.beam_size,
                                      max_sequence_length=opt.max_sent_length,
                                      useGpu=opt.useGpu)

        for testset_name, test_data_loader in zip(opt.test_dataset_names, test_data_loaders):
            logger.info('Evaluating %s' % testset_name)
            evaluate_beam_search(generator, test_data_loader, opt,
                                 title='test_%s' % testset_name,
                                 predict_save_path=opt.pred_path + '/%s_test_result/' % (testset_name))
    except Exception as e:
        logger.error(e, exc_info=True)