def callback(_locals, _globals):
    """Stable-Baselines-style training callback.

    Periodically evaluates the current model, saves the best-scoring
    checkpoint, and appends the evaluation state history to a CSV file.

    Relies on enclosing-scope variables (note the ``nonlocal``):
    n_callbacks, best_ret — plus eval_save_period, is_save, eval_env,
    experiment_name and rets_path, all defined in the outer function.
    Returns True so the training loop continues.
    """
    nonlocal n_callbacks, best_ret
    model = _locals['self']  # the RL algorithm instance driving training
    n_callbacks += 1
    #total_steps = model.num_timesteps + (timesteps)*num_trains
    # NOTE(review): assumes the callback fires exactly once per n_steps
    # rollout — confirm against the training library's callback contract.
    total_steps = n_callbacks * model.n_steps
    print("total steps: ", total_steps)
    # Saving best model
    if (total_steps) % eval_save_period == 0:
        start_eval_time = time.time()  # NOTE(review): captured but never used
        if is_save:
            ret, std, total_rets, state_history = evaluate(model, eval_env, render=False)
            #model.save(os.path.join(experiment_name, 'model_{}_{}.pkl'.format(total_steps, ret)))
            if ret > best_ret:
                print("Saving new best model")
                model.save(os.path.join(experiment_name, 'best_model_{}_{}.pkl'.format(total_steps, ret)))
                best_ret = ret
            #wandb.log({"eval_ret": ret}, step=total_steps)
            # Append one CSV row: total_steps followed by the state history.
            state_history = list(state_history)
            line = [total_steps] + state_history
            with open(rets_path, "a", newline="") as f:
                writer = csv.writer(f)
                writer.writerow(line)
        else:
            # Evaluation only — nothing is saved in this branch.
            ret, std, total_rets, _ = evaluate(model, eval_env, render=False)
    return True
def main(_):
    """Run three back-to-back train/evaluate cycles."""
    max_repeat = 3
    for _round in range(max_repeat):
        train()
        print('start evaluation...')
        evaluate()
def dump_interpret(model_path, full_model, invasive_uniform, eval_bleu, dataset,
                   include_train_subset, grad_bsize, calculate_grad):
    """Load a trained Seq2seq checkpoint and dump per-example interpretation data.

    For each validation example (and, optionally, a fixed-seed random subset of
    the training set of the same size), collects state scores ("beta"),
    attention weights ("alpha") and — when calculate_grad — gradient influence
    scores ("grad").

    Returns:
        (items, meta_stats): items is a list of per-example dicts with keys
        split/src/trg/beta/alpha/grad; meta_stats holds accuracy, perplexity
        and (when eval_bleu) BLEU for each evaluated split.

    NOTE(review): relies on module-level globals — device, model_config,
    HIDDEN_DIM, NUM_LAYERS — and project helpers (StateManager, Seq2seq,
    get_state_scores*, get_grad_influence2, evaluate_next_token, evaluate,
    sentence2ids_nopad) defined elsewhere in this file/package.
    """
    print('interpreting %s' % model_path)
    meta_stats = {}
    training_data, validation_data, vocab = load_dataset_by_name(dataset)
    # Special-token ids from the sentencepiece vocabulary.
    pad_id = vocab.PieceToId("<pad>")
    bos_id = vocab.PieceToId("<s>")
    eos_id = vocab.PieceToId("</s>")
    val_data_manager = StateManager(validation_data, vocab, bos_id, eos_id, pad_id, device, model_config)
    train_data_manager = StateManager(training_data, vocab, bos_id, eos_id, pad_id, device, model_config)
    VOCAB_SIZE = vocab.GetPieceSize()
    # Rebuild the model with dropout disabled for deterministic interpretation.
    model = Seq2seq(device=device, hidden_dim=HIDDEN_DIM, vocab_size=VOCAB_SIZE,
                    num_layers=NUM_LAYERS, dropout=0, attn_lambda=0.0, pad_id=pad_id,
                    full_model=full_model, invasive_uniform=invasive_uniform).to(device)
    model.load_state_dict(torch.load(model_path))
    # Two scoring variants depending on the model flavor.
    if not full_model:
        state_scores_val = get_state_scores(model, val_data_manager)
    else:
        state_scores_val = get_state_scores2(model, val_data_manager)
    if calculate_grad:
        grad_influence_val = get_grad_influence2(model, val_data_manager, grad_bsize)
    perplexity_val, acc_val, attn_val = evaluate_next_token(model, val_data_manager)
    meta_stats['val_acc'] = acc_val
    meta_stats['val_perplexity'] = perplexity_val
    if eval_bleu:
        bleu_val = evaluate(model, val_data_manager, method='beam')
        meta_stats['val_bleu'] = bleu_val
    if include_train_subset:
        # Fixed seed so the sampled train subset is reproducible across runs.
        random.seed(1)
        train_idxs = random.sample(range(len(train_data_manager.dataset)), k=len(val_data_manager.dataset))
        # Maps original train index -> position within the sampled subset.
        inverse_train_idx_map = {train_idxs[i]: i for i in range(len(train_idxs))}
        eval_train = StateManager([train_data_manager.dataset[idx] for idx in train_idxs],
                                  vocab, bos_id, eos_id, pad_id, device, model_config)
        if not full_model:
            state_scores_train = get_state_scores(model, eval_train)
        else:
            state_scores_train = get_state_scores2(model, eval_train)
        if calculate_grad:
            grad_influence_train = get_grad_influence2(model, eval_train, grad_bsize)
        perplexity_train, acc_train, attn_train = evaluate_next_token(model, eval_train)
        meta_stats['train_acc'] = acc_train
        meta_stats['train_perplexity'] = perplexity_train
        if eval_bleu:
            bleu_train = evaluate(model, eval_train, method='beam')
            meta_stats['train_bleu'] = bleu_train
    # Assemble per-example records: validation split first.
    items = []
    for i in range(len(val_data_manager.dataset)):
        curr_dict = {}
        curr_dict['split'] = 'val'
        curr_dict['src'] = sentence2ids_nopad(val_data_manager, val_data_manager.dataset[i].src, additional_eos=False)
        curr_dict['trg'] = sentence2ids_nopad(val_data_manager, val_data_manager.dataset[i].trg, additional_eos=False)
        curr_dict['beta'] = state_scores_val[i]
        curr_dict['alpha'] = attn_val[i]
        if calculate_grad:
            curr_dict['grad'] = grad_influence_val[i]
        else:
            curr_dict['grad'] = []
        items.append(curr_dict)
    if include_train_subset:
        # Emit a record for EVERY train example; only sampled ones carry scores,
        # the rest get None placeholders.
        train_idxs_set = set(train_idxs)
        for i in range(len(train_data_manager.dataset)):
            curr_dict = {}
            curr_dict['split'] = 'train'
            curr_dict['src'] = sentence2ids_nopad(train_data_manager, train_data_manager.dataset[i].src, additional_eos=False)
            curr_dict['trg'] = sentence2ids_nopad(train_data_manager, train_data_manager.dataset[i].trg, additional_eos=False)
            if i in train_idxs_set:
                curr_dict['beta'] = state_scores_train[inverse_train_idx_map[i]]
                curr_dict['alpha'] = attn_train[inverse_train_idx_map[i]]
                if calculate_grad:
                    curr_dict['grad'] = grad_influence_train[inverse_train_idx_map[i]]
                else:
                    curr_dict['grad'] = []
            else:
                curr_dict['beta'] = None
                curr_dict['alpha'] = None
                curr_dict['grad'] = None
            items.append(curr_dict)
    return items, meta_stats
def do_train_exp():
    """Train the attention (STA-FG) captioning network, validate each epoch,
    and checkpoint the parameters with the lowest training loss.

    Python 2 code (print statements, ``print >> file``). Reads CFG, data,
    exp_model, eval_model, lasagne, cPickle, np and time from module scope.

    NOTE(review): indentation below is reconstructed from a collapsed source
    line — the nesting of the validation/logging section should be confirmed
    against the original file.
    """
    print 'Training mp_lstm Network'
    print 'Experimental settins:'
    print 'NUMBER EPOCH: %d' % (CFG['NUM_EPOCH'])
    print 'BATCH SIZE: %d' % (CFG['BATCH_SIZE'])
    print 40 * '-'
    print 40 * '-'
    print 'build model...'
    model, exp_func = exp_model()
    print 'model ok'
    print 40 * '*'
    num_samples = len(CFG['TRAIN'])
    # NOTE(review): starts at 0 and grows by one per epoch (see below), so the
    # inner batch loop is empty on epoch 1 — looks like a deliberate ramp-up,
    # but confirm; the commented-out expression is the "full" batch count.
    num_batch = 0  #num_samples / CFG['BATCH_SIZE']
    best_loss = np.inf  # NOTE(review): never updated after a save — every epoch re-saves
    acc = []
    best_m = 0.  # best METEOR seen on validation
    best_b = 0.  # best Bleu_4 seen on validation
    # print 'training...'
    for iepoch in np.arange(CFG['NUM_EPOCH']):
        epoch_loss = 0.
        epoch_acc = 0.
        epoch_att_loss = 0.
        epoch_sem_loss = 0.
        for ibatch in np.arange(num_batch):
            batch_idx = CFG['TRAIN'][ibatch * CFG['BATCH_SIZE']:(ibatch + 1) * CFG['BATCH_SIZE']]
            batch_data, batch_words, gt_words, mask, k_words = data.get_batch_data(
                batch_idx, feat_type='ResNet')
            # Flatten ground-truth words into one prediction target vector.
            predict_words = np.reshape(gt_words, (-1, ))
            # print batch_data.shape
            # print batch_words.shape
            # print mask.shape
            # print predict_words.shape
            # print k_words.shape
            # word2sent([batch_words[0]])
            # label2word([k_words[0]])
            print 'forward and backward...'
            # batch_loss,batch_acc,att,att_loss,sem_loss= exp_func['train func'](batch_data,batch_words,mask,predict_words,k_words)
            # Best-effort: skip a batch that fails rather than abort the epoch.
            try:
                batch_loss, batch_acc, att, att_loss, sem_loss = exp_func[
                    'train func'](batch_data, batch_words, mask, predict_words,
                                  k_words)
            except Exception, e_data:
                print 'Found Exception'
                print e_data
                continue
            print '%d epoch %d batch: loss %f att loss: %f sem loss: %f acc %f' % (
                iepoch + 1, ibatch + 1, batch_loss, att_loss, sem_loss, batch_acc)
            print 'attention:'
            # print att.shape
            print 40 * '*'
            print 'attention values from all frames at first step'
            print att[0, 0]
            print 40 * '*'
            print 'attention values from all time steps at first frame'
            print att[0, :, 0]
            epoch_loss += batch_loss
            epoch_acc += batch_acc
            epoch_att_loss += att_loss
            epoch_sem_loss += sem_loss
        # Grow the batch count for the next epoch (see NOTE at num_batch).
        num_batch += 1
        epoch_loss /= num_batch
        epoch_acc /= num_batch
        epoch_att_loss /= num_batch
        epoch_sem_loss /= num_batch
        train_acc = epoch_acc
        # Append the per-epoch summary to a dated training log.
        logfile = open(
            'logs/VIDCAP_ATT/log_train_' +
            time.strftime('%Y-%m-%d', time.localtime(time.time())), 'a+')
        print >> logfile, time.strftime(
            '%Y-%m-%d %H:%M:%S', time.localtime(time.time())
        ) + '\n epoch %d script train loss:%f att loss %f sem loss %f train acc:%f' % (
            iepoch + 1, epoch_loss, epoch_att_loss, epoch_sem_loss, train_acc)
        logfile.close()
        print 'mean batch loss: %f' % epoch_loss
        print 'mean batch att loss: %f' % epoch_att_loss
        print 'mean batch sem loss: %f' % epoch_sem_loss
        print 'mean batch acc: %f' % epoch_acc
        print 40 * '-'
        if epoch_loss < best_loss:
            print 'find better training result.'
            print 'saving model'
            net_params = lasagne.layers.get_all_param_values(
                model[model['net name']]['word_prob'])
            modelfile = open('../models/sta_fg_params.pkl', 'wb')
            cPickle.dump(net_params, modelfile)
            modelfile.close()
        if iepoch >= 0:
            # Validate every epoch; keep the checkpoint that improves both
            # METEOR and Bleu_4.
            print 'lets validating our model'
            eval_res = eval_model.evaluate(exp_func['sent prob'],
                                           exp_func['key word'], 'ResNet',
                                           'valid')
            logfile = open(
                'logs/VIDCAP_ATT/log_train_' +
                time.strftime('%Y-%m-%d', time.localtime(time.time())), 'a+')
            print >> logfile, time.strftime(
                '%Y-%m-%d %H:%M:%S', time.localtime(
                    time.time())) + ' evaluation results\n'
            if eval_res['METEOR'] > best_m and eval_res['Bleu_4'] > best_b:
                best_m = eval_res['METEOR']
                best_b = eval_res['Bleu_4']
                modelfile = open('../models/best_valided_stafg_params.pkl', 'wb')
                cPickle.dump(net_params, modelfile)
                modelfile.close()
                print >> logfile, 'find better model!!!'
            print >> logfile, 'validation results:'
            for metric, score in eval_res.items():
                print >> logfile, '%s: %.3f' % (metric, score),
            print >> logfile, '\n'
            logfile.close()
# NOTE(review): fragment — the leading statements continue a validation block
# whose enclosing function begins outside this chunk (they duplicate the tail
# of the do_train_exp copy above). Indentation is reconstructed; confirm
# nesting against the original file.
best_m = eval_res['METEOR']
best_b = eval_res['Bleu_4']
# Persist the parameters of the best-validated model.
modelfile = open('../models/best_valided_stafg_params.pkl', 'wb')
cPickle.dump(net_params, modelfile)
modelfile.close()
print >> logfile, 'find better model!!!'
print >> logfile, 'validation results:'
for metric, score in eval_res.items():
    print >> logfile, '%s: %.3f' % (metric, score),
print >> logfile, '\n'
logfile.close()
# testing model
print 'training done! Testing model...'
eval_res = eval_model.evaluate(exp_func['sent prob'], exp_func['key word'],
                               'ResNet', 'test')
# Append the final test metrics to a dated test log.
logfile = open(
    'logs/VIDCAP_ATT/log_test_' +
    time.strftime('%Y-%m-%d', time.localtime(time.time())), 'a+')
print >> logfile, time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(
    time.time())) + ' testing results\n'
for metric, score in eval_res.items():
    print >> logfile, '%s: %.3f' % (metric, score),
print >> logfile, '\n'
logfile.close()
print 'DONE'


def word2sent(wordids):
    # Pretty-print captions for the given word-id sequences.
    # NOTE(review): definition appears truncated at the chunk boundary.
    print 'Captions:'
# Top-level script fragment: build a config-specific model directory, train a
# full Seq2seq model and report beam-search BLEU on the validation set.
# NOTE(review): relies on names defined elsewhere in the file — args,
# HIDDEN_DIM, DROPOUT, batch_size, seed, dataset, VOCAB_SIZE, NUM_LAYERS,
# pad_id, device, num_epochs, custom_saves, val_data_manager.
if args.uniform:
    # Invasive-uniform attention variant gets its own directory name.
    config_name = 'h_dim=%d,dropout=%f,b_size=%d,seed=%d,uniform' % (
        HIDDEN_DIM, DROPOUT, batch_size, seed)
else:
    config_name = 'h_dim=%d,dropout=%f,b_size=%d,seed=%d,normal' % (
        HIDDEN_DIM, DROPOUT, batch_size, seed)
model_path = ("models/%s/%s/" % (dataset, config_name))
if not os.path.exists(model_path):
    os.makedirs(model_path)
model = Seq2seq(device=device, hidden_dim=HIDDEN_DIM, vocab_size=VOCAB_SIZE,
                num_layers=NUM_LAYERS, dropout=DROPOUT, attn_lambda=0.0,
                pad_id=pad_id, full_model=True,
                invasive_uniform=args.uniform).to(device)
train(model, num_epochs, batch_size, os.path.join(model_path, 'model'),
      custom_saves=custom_saves)
# Reload the checkpoint written by train() before evaluating.
model.load_state_dict(torch.load(os.path.join(model_path, 'model')))
print("BLEU score with beam search ",
      evaluate(model, val_data_manager, method='beam'))
def do_train_exp():
    """Train the mean-pooling LSTM (mp_lstm) captioning network on MSVD,
    checkpoint the parameters whenever training loss improves, and validate
    after the second epoch.

    Python 2 code (print statements, ``print >> file``). Reads CFG, data,
    exp_model, eval_model, lasagne, cPickle, np and time from module scope.

    NOTE(review): indentation below is reconstructed from a collapsed source
    line; best_loss is never updated after a save and ``acc`` is never
    appended to, so the final per-epoch accuracy report prints nothing.
    """
    print 'Training mp_lstm Network'
    print 'Experimental settins:'
    print 'NUMBER EPOCH: %d' % (CFG['NUM_EPOCH'])
    print 'BATCH SIZE: %d' % (CFG['BATCH_SIZE'])
    print 40 * '-'
    print 40 * '-'
    print 'build model...'
    model, exp_func = exp_model()
    print 'model ok'
    print 40 * '*'
    num_samples = len(CFG['TRAIN'])
    num_batch = num_samples / CFG['BATCH_SIZE']  # py2 integer division
    best_loss = np.inf
    acc = []
    print 'training...'
    # eval_model.evaluate(exp_func['sent prob'])
    for iepoch in np.arange(CFG['NUM_EPOCH']):
        epoch_loss = 0.
        epoch_acc = 0.
        for ibatch in np.arange(num_batch):
            batch_idx = CFG['TRAIN'][ibatch * CFG['BATCH_SIZE']:(ibatch + 1) *
                                     CFG['BATCH_SIZE']]
            batch_data, batch_words, mask = data.get_batch_data(batch_idx)
            # Flatten the word targets and keep only unmasked positions.
            predict_words = np.reshape(batch_words, (-1, ))
            predict_words = predict_words[np.where(mask.flatten() == 1)]
            print batch_data.shape
            print batch_words.shape
            print mask.shape
            print predict_words.shape
            # word2sent(batch_words)
            print 'forward and backward...'
            batch_loss, batch_acc = exp_func['train func'](batch_data,
                                                           batch_words, mask,
                                                           predict_words)
            print '%d epoch %d batch: loss %f acc %f' % (
                iepoch + 1, ibatch + 1, batch_loss, batch_acc)
            epoch_loss += batch_loss
            epoch_acc += batch_acc
        epoch_loss /= num_batch
        epoch_acc /= num_batch
        train_acc = epoch_acc
        # Append per-epoch summary to a dated training log.
        logfile = open(
            'logs/VIDCAP_MP/log_msvd_mplstm_' +
            time.strftime('%Y-%m-%d', time.localtime(time.time())), 'a+')
        print >> logfile, time.strftime(
            '%Y-%m-%d %H:%M:%S', time.localtime(time.time())
        ) + '\n script train loss:%f train acc:%f' % (epoch_loss, train_acc)
        logfile.close()
        print 'mean batch loss: %f' % epoch_loss
        print 'mean batch acc: %f' % epoch_acc
        print 40 * '-'
        if epoch_loss < best_loss:
            print 'find better training result.'
            print 'saving model'
            net_params = lasagne.layers.get_all_param_values(
                model[model['net name']]['word_prob'])
            modelfile = open('../models/VIDCAP_MP/msvd_mplstm_params.pkl', 'wb')
            cPickle.dump(net_params, modelfile)
            modelfile.close()
        if iepoch >= 2:
            # Start validating only from the third epoch onward.
            print 'lets validating our model'
            eval_model.evaluate(exp_func['sent prob'])
    for i in np.arange(len(acc)):
        print '%d epoch acc %f' % (i + 1, acc[i])
def main():
    """Entry point for BERT-span NER: parse CLI flags, set up device/seed,
    build model + tokenizer, then train and/or evaluate per the flags.

    NOTE(review): relies on project-level names — DataProcess, BertSpanForNer,
    CNerTokenizer, seed_everything, load_examples, train, evaluate — and on
    torch/transformers imports at module scope.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--task_name', default='NER', type=str)
    parser.add_argument('--data_dir', default='./datasets/cluener', type=str)
    parser.add_argument('--model_type', default='bert', type=str)
    parser.add_argument('--display', default='./display', type=str)  # TensorBoard log root
    parser.add_argument('--pretrain_model_path',
                        default='./pretrained_model/bert-base-uncased/',
                        type=str,
                        required=False)
    parser.add_argument('--output_dir', default='./output/', type=str)
    parser.add_argument('--markup', default='bios', type=str,
                        choices=['bios', 'bio'])
    parser.add_argument('--loss_type', default='ghmc',
                        choices=['lsr', 'focal', 'ce', 'ghmc'])
    parser.add_argument('--max_seq_length', default=128, type=int)
    # NOTE(review): boolean flags below use default= without type/action, so
    # passing them on the command line yields truthy strings — confirm intent.
    parser.add_argument("--do_lower_case", default=True)
    parser.add_argument('--do_train', default=True)
    parser.add_argument('--do_eval', default=True)
    parser.add_argument('--do_predict', default=False)
    parser.add_argument('--per_gpu_train_batch_size', default=128, type=int)
    parser.add_argument('--gradient_accumulation_steps', type=int, default=1)
    parser.add_argument('--learning_rate', default=5e-5, type=float)
    parser.add_argument("--weight_decay", default=0.0, type=float)
    parser.add_argument('--adam_epsilon', default=1e-8, type=float)
    parser.add_argument('--max_grad_norm', default=1.0, type=float)
    parser.add_argument('--num_train_epochs', default=8.0, type=float)
    parser.add_argument('--warmup_steps', default=0, type=int)
    parser.add_argument('--logging_steps', type=int, default=50)
    parser.add_argument('--save_steps', type=int, default=50)
    parser.add_argument('--no_cuda', default=False)
    parser.add_argument('--overwrite_output_dir', default=True)
    parser.add_argument('--seed', type=int, default=42)
    parser.add_argument('--fp16', default=True)
    parser.add_argument('--fp16_opt_level', type=str, default="O1")
    parser.add_argument('--local_rank', type=int, default=-1)
    parser.add_argument("--eval_count", type=int, default=0)
    args = parser.parse_args()
    # Ensure the display/output directories exist; outputs go under a
    # model-type-specific subpath.
    if not os.path.exists(args.display):
        os.mkdir(args.display)
    if not os.path.exists(args.output_dir):
        os.mkdir(args.output_dir)
    args.output_dir = args.output_dir + f'{args.model_type}'
    if os.path.exists(args.output_dir) and os.listdir(
            args.output_dir
    ) and args.do_train and not args.overwrite_output_dir:
        raise ValueError(
            "Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome."
            .format(args.output_dir))
    # Device selection: single/multi-GPU (local_rank == -1) vs distributed.
    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available()
                              and not args.no_cuda else "cpu")
        args.n_gpu = torch.cuda.device_count()
    else:
        torch.cuda.set_device(args.local_rank)
        device = torch.device('cuda', args.local_rank)
        args.n_gpu = 1
    args.device = device
    seed_everything(args.seed)
    # Label vocabulary and both direction mappings.
    process = DataProcess()
    label_list = process.get_labels()
    args.id2label = {i: label for i, label in enumerate(label_list)}
    args.label2id = {label: i for i, label in enumerate(label_list)}
    num_labels = len(label_list)
    if args.local_rank not in [-1, 0]:
        # Non-primary ranks wait while rank 0 downloads/caches the model.
        torch.distributed.barrier()
    config_class, model_class, tokenizer_class = BertConfig, BertSpanForNer, CNerTokenizer
    config = config_class.from_pretrained(args.pretrain_model_path,
                                          num_labels=num_labels,
                                          loss_type=args.loss_type,
                                          soft_label=True)
    tokenizer = tokenizer_class.from_pretrained(
        args.pretrain_model_path, do_lower_case=args.do_lower_case)
    model = model_class.from_pretrained(args.pretrain_model_path, config=config)
    model.to(args.device)
    # TensorBoard writer named by timestamp and loss type.
    writer = SummaryWriter(log_dir=args.display + '/' +
                           time.strftime('%m_%d_%H.%M', time.localtime()) +
                           '_' + str(args.loss_type))
    if args.do_train:
        train_dataset = load_examples(args, tokenizer, data_type='train')
        global_step, train_loss = train(args, train_dataset, model, tokenizer,
                                        writer)
        # Unwrap DataParallel/DDP before saving.
        model_to_save = (model.module if hasattr(model, "module") else model)
        model_to_save.save_pretrained(args.output_dir)
        tokenizer.save_vocabulary(args.output_dir)
        torch.save(args, os.path.join(args.output_dir, "training_args.bin"))
    results = {}
    if args.do_eval and args.local_rank in [-1, 0]:
        # Reload the tokenizer/model just saved to output_dir and evaluate.
        tokenizer = tokenizer_class.from_pretrained(
            args.output_dir, do_lower_case=args.do_lower_case)
        checkpoints = [args.output_dir]
        for checkpoint in checkpoints:
            global_step = checkpoint.split(
                "-")[-1] if len(checkpoints) > 1 else ""
            prefix = checkpoint.split(
                '/')[-1] if checkpoint.find('checkpoint') != -1 else ""
            model = model_class.from_pretrained(checkpoint)
            model.to(args.device)
            result = evaluate(args, model, tokenizer, writer)
            if global_step:
                result = {
                    "{}_{}".format(global_step, k): v
                    for k, v in result.items()
                }
            results.update(result)
        output_eval_file = os.path.join(args.output_dir, "eval_results.txt")
        # NOTE(review): this `writer` shadows the SummaryWriter above, which
        # is therefore unusable after this point.
        with open(output_eval_file, "w") as writer:
            for key in sorted(results.keys()):
                writer.write("{} = {}\n".format(key, str(results[key])))
def train(args, train_dataset, model, tokenizer, writer):
    """Fine-tune a BERT-span NER model.

    Args:
        args: parsed CLI namespace (device, batch sizes, fp16 flags, steps...).
        train_dataset: dataset consumed via DataLoader + project collate_fn.
        model: the BertSpanForNer model, already on args.device.
        tokenizer: tokenizer, saved alongside periodic checkpoints.
        writer: TensorBoard SummaryWriter for loss curves.

    Returns:
        (global_step, mean_train_loss) where mean loss = accumulated loss
        divided by the number of optimizer updates.

    Side effects: writes "checkpoint-<step>" directories under
    args.output_dir every args.save_steps updates, and runs evaluate()
    every args.logging_steps updates (single-process runs only).
    """
    args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu)
    train_sampler = RandomSampler(train_dataset)
    train_dataloader = DataLoader(train_dataset,
                                  sampler=train_sampler,
                                  batch_size=args.train_batch_size,
                                  collate_fn=collate_fn)
    # Total optimizer updates over the run — drives the linear LR schedule.
    train_total = len(
        train_dataloader
    ) // args.gradient_accumulation_steps * args.num_train_epochs
    # Standard BERT fine-tuning split: no weight decay on bias/LayerNorm.
    no_decay = ["bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
        {
            "params": [
                p for n, p in model.named_parameters()
                if not any(nd in n for nd in no_decay)
            ],
            "weight_decay": args.weight_decay,
        },
        {
            "params": [
                p for n, p in model.named_parameters()
                if any(nd in n for nd in no_decay)
            ],
            "weight_decay": 0.0
        },
    ]
    optimizer = AdamW(optimizer_grouped_parameters,
                      lr=args.learning_rate,
                      eps=args.adam_epsilon)
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=args.warmup_steps,
        num_training_steps=train_total)
    # Resume optimizer/scheduler state when restarting from a checkpoint dir.
    if os.path.isfile(os.path.join(
            args.pretrain_model_path, "optimizer.pt")) and os.path.isfile(
                os.path.join(args.pretrain_model_path, "scheduler.pt")):
        optimizer.load_state_dict(
            torch.load(os.path.join(args.pretrain_model_path, "optimizer.pt")))
        scheduler.load_state_dict(
            torch.load(os.path.join(args.pretrain_model_path, "scheduler.pt")))
    if args.fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use fp16 training."
            )
        model, optimizer = amp.initialize(model,
                                          optimizer,
                                          opt_level=args.fp16_opt_level)
    print("***** Running training *****")
    global_step = 0
    steps_trained_in_current_epoch = 0
    if os.path.exists(args.pretrain_model_path
                      ) and "checkpoint" in args.pretrain_model_path:
        # Recover the completed step count from a ".../checkpoint-<N>" path.
        global_step = int(
            args.pretrain_model_path.split("-")[-1].split("/")[0])
        # NOTE(review): epochs_trained is computed but the epoch loop below
        # does not skip whole epochs — only in-epoch steps are skipped.
        epochs_trained = global_step // (len(train_dataloader) //
                                         args.gradient_accumulation_steps)
        steps_trained_in_current_epoch = global_step % (
            len(train_dataloader) // args.gradient_accumulation_steps)
    train_loss, logging_loss = 0.0, 0.0
    model.zero_grad()
    for _ in range(int(args.num_train_epochs)):
        pbar = ProgressBar(n_total=len(train_dataloader), desc='Training')
        for step, batch in enumerate(train_dataloader):
            # Fast-forward past steps already trained when resuming mid-epoch.
            if steps_trained_in_current_epoch > 0:
                steps_trained_in_current_epoch -= 1
                continue
            model.train()
            batch = tuple(t.to(args.device) for t in batch)
            inputs = {
                "input_ids": batch[0],
                "attention_mask": batch[1],
                "start_positions": batch[3],
                "end_positions": batch[4]
            }
            inputs["token_type_ids"] = (batch[2] if args.model_type
                                        in ["bert"] else None)
            outputs = model(**inputs)
            loss = outputs[0]
            writer.add_scalar("Train_loss", loss.item(), step)
            if args.n_gpu > 1:
                loss = loss.mean()  # average across DataParallel replicas
            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps
            if args.fp16:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()
            pbar(step, {'loss': loss.item()})
            train_loss += loss.item()
            if (step + 1) % args.gradient_accumulation_steps == 0:
                if args.fp16:
                    torch.nn.utils.clip_grad_norm_(
                        amp.master_params(optimizer), args.max_grad_norm)
                else:
                    torch.nn.utils.clip_grad_norm_(model.parameters(),
                                                   args.max_grad_norm)
                # FIX: step the optimizer BEFORE the LR scheduler. The original
                # order (scheduler first) skips the first scheduled LR value
                # and triggers PyTorch's "lr_scheduler.step() before
                # optimizer.step()" warning (PyTorch >= 1.1).
                optimizer.step()
                scheduler.step()
                model.zero_grad()
                global_step += 1
                if args.local_rank in [
                        -1, 0
                ] and args.logging_steps > 0 and global_step % args.logging_steps == 0:
                    if args.local_rank == -1:
                        evaluate(args, model, tokenizer, writer)
                if args.local_rank in [
                        -1, 0
                ] and args.save_steps > 0 and global_step % args.save_steps == 0:
                    output_dir = os.path.join(
                        args.output_dir, "checkpoint-{}".format(global_step))
                    if not os.path.exists(output_dir):
                        os.makedirs(output_dir)
                    # Unwrap DataParallel/DDP before saving.
                    model_to_save = (model.module
                                     if hasattr(model, "module") else model)
                    model_to_save.save_pretrained(output_dir)
                    torch.save(args,
                               os.path.join(output_dir, "training_args.bin"))
                    tokenizer.save_vocabulary(output_dir)
                    print("Saving model checkpoint to %s", output_dir)
                    torch.save(optimizer.state_dict(),
                               os.path.join(output_dir, "optimizer.pt"))
                    torch.save(scheduler.state_dict(),
                               os.path.join(output_dir, "scheduler.pt"))
        print(" ")
        if 'cuda' in str(args.device):
            torch.cuda.empty_cache()
    return global_step, train_loss / global_step