def test(model, ema, args, data):
    device = torch.device(f"cuda:{args.gpu}" if torch.cuda.is_available() else "cpu")
    criterion = nn.CrossEntropyLoss()
    loss = 0
    answers = dict()
    model.eval()

    # Temporarily swap the raw weights for their EMA shadows.
    backup_params = EMA(0)
    for name, param in model.named_parameters():
        if param.requires_grad:
            backup_params.register(name, param.data)
            param.data.copy_(ema.get(name))

    total_time = 0
    previous_time = time.time()
    for batch in iter(data.dev_iter):
        # time1 = time.time()
        with torch.no_grad():
            p1, p2 = model(batch.c_char, batch.q_char,
                           batch.c_word[0], batch.q_word[0],
                           batch.c_word[1], batch.q_word[1])
            # p1, p2 = model(batch)
            # time2 = time.time()
            # total_time = total_time + time2 - time1
            batch_loss = criterion(p1, batch.s_idx) + criterion(p2, batch.e_idx)
            loss += batch_loss.item()

            # (batch, c_len, c_len): joint start/end scores; the lower-triangular
            # -inf mask rules out spans with end < start.
            batch_size, c_len = p1.size()
            ls = nn.LogSoftmax(dim=1)
            mask = (torch.ones(c_len, c_len) * float('-inf')).to(device) \
                .tril(-1).unsqueeze(0).expand(batch_size, -1, -1)
            score = (ls(p1).unsqueeze(2) + ls(p2).unsqueeze(1)) + mask
            score, s_idx = score.max(dim=1)
            score, e_idx = score.max(dim=1)
            s_idx = torch.gather(s_idx, 1, e_idx.view(-1, 1)).squeeze()

            for i in range(batch_size):
                id = batch.id[i]
                answer = batch.c_word[0][i][s_idx[i]:e_idx[i] + 1]
                answer = ' '.join([data.CONTEXT_WORD.vocab.itos[idx] for idx in answer])
                if answer == "<eos>":
                    answer = ""
                answers[id] = answer

    # print(f'one epoch time {time.time()-previous_time}')
    # print(f'total time {total_time}')

    # Restore the original (non-EMA) weights.
    for name, param in model.named_parameters():
        if param.requires_grad:
            param.data.copy_(backup_params.get(name))

    with open(args.prediction_file, 'w', encoding='utf-8') as f:
        print(json.dumps(answers), file=f)

    opts = evaluate.parse_args(args=[f"{args.dataset_file}", f"{args.prediction_file}"])
    results = evaluate.main(opts)
    return (loss, results['exact'], results['f1'],
            results['HasAns_exact'], results['HasAns_f1'],
            results['NoAns_exact'], results['NoAns_f1'])
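# A minimal, self-contained illustration (not part of the original code) of the
# span-decoding trick used in test() above: log-softmax start and end scores are
# combined into a (c_len, c_len) matrix, a lower-triangular -inf mask forbids
# spans with end < start, and two successive max() calls pick the best legal span.
# Shapes and names below are illustrative assumptions only.
import torch
import torch.nn as nn

def decode_best_span_demo():
    torch.manual_seed(0)
    batch_size, c_len = 2, 5
    p1 = torch.randn(batch_size, c_len)   # start logits
    p2 = torch.randn(batch_size, c_len)   # end logits

    ls = nn.LogSoftmax(dim=1)
    # score[b, i, j] = log p_start(i) + log p_end(j); entries with i > j get -inf
    mask = (torch.ones(c_len, c_len) * float('-inf')).tril(-1)
    mask = mask.unsqueeze(0).expand(batch_size, -1, -1)
    score = ls(p1).unsqueeze(2) + ls(p2).unsqueeze(1) + mask

    score, s_idx = score.max(dim=1)                                # best start for each end
    score, e_idx = score.max(dim=1)                                # best end overall
    s_idx = torch.gather(s_idx, 1, e_idx.view(-1, 1)).squeeze(1)   # start paired with that end
    assert (s_idx <= e_idx).all()
    return s_idx, e_idx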
def test(model, ema, args, data):
    device = torch.device(f"cuda:{args.gpu}" if torch.cuda.is_available() else "cpu")
    criterion = nn.CrossEntropyLoss()
    loss = 0
    answers = dict()
    model.eval()

    backup_params = EMA(0)
    for name, param in model.named_parameters():
        if param.requires_grad:
            backup_params.register(name, param.data)
            param.data.copy_(ema.get(name))

    with torch.set_grad_enabled(False):
        for batch in iter(data.dev_iter):
            p1, p2 = model(batch)
            batch_loss = criterion(p1, batch.s_idx) + criterion(p2, batch.e_idx)
            loss += batch_loss.item()

            # (batch, c_len, c_len)
            batch_size, c_len = p1.size()
            ls = nn.LogSoftmax(dim=1)
            mask = (torch.ones(c_len, c_len) * float('-inf')).to(device) \
                .tril(-1).unsqueeze(0).expand(batch_size, -1, -1)
            score = (ls(p1).unsqueeze(2) + ls(p2).unsqueeze(1)) + mask
            score, s_idx = score.max(dim=1)
            score, e_idx = score.max(dim=1)
            s_idx = torch.gather(s_idx, 1, e_idx.view(-1, 1)).squeeze()

            for i in range(batch_size):
                id = batch.id[i]
                answer = batch.c_word[0][i][s_idx[i]:e_idx[i] + 1]
                answer = ' '.join([data.WORD.vocab.itos[idx] for idx in answer])
                answers[id] = answer

    for name, param in model.named_parameters():
        if param.requires_grad:
            param.data.copy_(backup_params.get(name))

    # print(answers)
    with open(args.prediction_file, 'w', encoding='utf-8') as f:
        print(json.dumps(answers, indent=4), file=f)

    results = evaluate.main(args, answers, data)
    return loss / len(data.dev_iter), results['exact_match'], results['f1']
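# The functions in this file assume an EMA helper exposing register()/update()/get()
# (and, in the Data/SLQA training variants, a callable form used as
# `param.data = ema(name, param.data)`). The class below is only a minimal sketch of
# such a helper under that assumption; it is not the repository's actual EMA class.
class EMASketch:
    def __init__(self, decay):
        self.decay = decay
        self.shadow = {}

    def register(self, name, val):
        # store an initial copy of the parameter
        self.shadow[name] = val.clone()

    def update(self, name, val):
        # exponential moving average: shadow = (1 - decay) * val + decay * shadow
        self.shadow[name] = (1.0 - self.decay) * val + self.decay * self.shadow[name]

    def get(self, name):
        return self.shadow[name]

    def __call__(self, name, val):
        # callable variant: update the shadow and return it so the caller can
        # write it back into param.data
        self.update(name, val)
        return self.get(name)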
def cw_tree_attack_targeted():
    cw = CarliniL2_qa(debug=args.debugging)
    criterion = nn.CrossEntropyLoss()
    loss = 0
    tot = 0
    adv_loss = 0
    targeted_success = 0
    untargeted_success = 0
    adv_text = []
    answers = dict()
    adv_answers = dict()
    # model.eval()

    embed = torch.load(args.word_vector)
    device = torch.device("cuda:0" if args.cuda else "cpu")
    vocab = Vocab(filename=args.dictionary, data=[PAD_WORD, UNK_WORD, EOS_WORD, SOS_WORD])
    generator = Generator(args.test_data, vocab=vocab, embed=embed)
    transfered_embedding = torch.load('bidaf_transfered_embedding.pth')
    transfer_emb = torch.nn.Embedding.from_pretrained(transfered_embedding).to(device)
    seqback = WrappedSeqback(embed, device, attack=True,
                             seqback_model=generator.seqback_model, vocab=vocab,
                             transfer_emb=transfer_emb)
    treelstm = generator.tree_model
    generator.load_state_dict(torch.load(args.load_ae))

    backup_params = EMA(0)
    for name, param in model.named_parameters():
        if param.requires_grad:
            backup_params.register(name, param.data)
            param.data.copy_(ema.get(name))

    class TreeModel(nn.Module):
        def __init__(self):
            super(TreeModel, self).__init__()
            self.inputs = None

        def forward(self, hidden):
            self.embedding = seqback(hidden)
            return model(batch, perturbed=self.embedding)

        def set_temp(self, temp):
            seqback.temp = temp

        def get_embedding(self):
            return self.embedding

        def get_seqback(self):
            return seqback

    tree_model = TreeModel()
    for batch in tqdm(iter(data.dev_iter), total=1000):
        p1, p2 = model(batch)
        orig_answer, orig_s_idx, orig_e_idx = write_to_ans(p1, p2, batch, answers)
        batch_loss = criterion(p1, batch.s_idx) + criterion(p2, batch.e_idx)
        loss += batch_loss.item()

        append_info = append_input(batch, vocab)
        batch_add_start = append_info['add_start']
        batch_add_end = append_info['add_end']
        batch_start_target = torch.LongTensor(append_info['target_start']).to(device)
        batch_end_target = torch.LongTensor(append_info['target_end']).to(device)
        add_sents = append_info['append_sent']

        input_embedding = model.word_emb(batch.c_word[0])
        append_info['tree'] = [generator.get_tree(append_info['tree'])]
        seqback.sentences = input_embedding.clone().detach()
        seqback.batch_trees = append_info['tree']
        seqback.batch_add_sent = append_info['ae_sent']
        seqback.start = append_info['add_start']
        seqback.end = append_info['add_end']
        seqback.adv_sent = []

        batch_tree_embedding = []
        for bi, append_sent in enumerate(append_info['ae_sent']):
            seqback.target_start = append_info['target_start'][0] - append_info['add_start'][0]
            seqback.target_end = append_info['target_end'][0] - append_info['add_start'][0]
            sentences = [torch.tensor(append_sent, dtype=torch.long, device=device)]
            seqback.target = sentences[0][seqback.target_start:seqback.target_end + 1]
            trees = [append_info['tree'][bi]]
            tree_embedding = treelstm(sentences, trees)[0][0].detach()
            batch_tree_embedding.append(tree_embedding)

        hidden = torch.cat(batch_tree_embedding, dim=0)
        cw.batch_info = append_info
        cw.num_classes = append_info['tot_length']
        adv_hidden = cw.run(tree_model, hidden,
                            (batch_start_target, batch_end_target),
                            input_token=input_embedding)
        seqback.adv_sent = []

        # re-test
        for bi, (add_start, add_end) in enumerate(zip(batch_add_start, batch_add_end)):
            if bi in cw.o_best_sent:
                ae_words = cw.o_best_sent[bi]
                bidaf_tokens = bidaf_convert_to_idx(ae_words)
                batch.c_word[0].data[bi, add_start:add_end] = torch.LongTensor(bidaf_tokens)
        p1, p2 = model(batch)
        adv_answer, adv_s_idx, adv_e_idx = write_to_ans(p1, p2, batch, adv_answers)
        batch_loss = criterion(p1, batch.s_idx) + criterion(p2, batch.e_idx)
        adv_loss += batch_loss.item()

        for bi, (start_target, end_target) in enumerate(zip(batch_start_target, batch_end_target)):
            start_output = adv_s_idx
            end_output = adv_e_idx
            targeted_success += int(compare(start_output, start_target.item(),
                                            end_output, end_target.item()))
            untargeted_success += int(compare_untargeted(start_output, start_target.item(),
                                                         end_output, end_target.item()))

        for i in range(len(add_sents)):
            logger.info(("orig:", transform(add_sents[i])))
            try:
                logger.info(("adv:", cw.o_best_sent[i]))
                adv_text.append({'adv_text': cw.o_best_sent[i],
                                 'qas_id': batch.id[i],
                                 'adv_predict': (orig_s_idx, orig_e_idx),
                                 'orig_predict': (adv_s_idx, adv_e_idx),
                                 'Orig answer:': orig_answer,
                                 'Adv answer:': adv_answer})
                joblib.dump(adv_text, root_dir + '/adv_text.pkl')
            except:
                adv_text.append({'adv_text': transform(add_sents[i]),
                                 'qas_id': batch.id[i],
                                 'adv_predict': (orig_s_idx, orig_e_idx),
                                 'orig_predict': (adv_s_idx, adv_e_idx),
                                 'Orig answer:': orig_answer,
                                 'Adv answer:': adv_answer})
                joblib.dump(adv_text, root_dir + '/adv_text.pkl')
                continue

        # for batch size = 1
        tot += 1
        logger.info(("orig predict", (orig_s_idx, orig_e_idx)))
        logger.info(("adv append predict", (adv_s_idx, adv_e_idx)))
        logger.info(("targeted successful rate:", targeted_success))
        logger.info(("untargeted successful rate:", untargeted_success))
        logger.info(("Orig answer:", orig_answer))
        logger.info(("Adv answer:", adv_answer))
        logger.info(("tot:", tot))

    for name, param in model.named_parameters():
        if param.requires_grad:
            param.data.copy_(backup_params.get(name))

    with open(options.prediction_file, 'w', encoding='utf-8') as f:
        print(json.dumps(answers), file=f)
    with open(options.prediction_file + '_adv.json', 'w', encoding='utf-8') as f:
        print(json.dumps(adv_answers), file=f)

    results = evaluate.main(options)
    logger.info(tot)
    logger.info(("adv loss, results['exact_match'], results['f1']",
                 loss, results['exact_match'], results['f1']))
    return loss, results['exact_match'], results['f1']
def cw_random_word_attack():
    cw = CarliniL2_untargeted_qa(debug=args.debugging)
    criterion = nn.CrossEntropyLoss()
    loss = 0
    adv_loss = 0
    targeted_success = 0
    untargeted_success = 0
    adv_text = []
    answers = dict()
    adv_answers = dict()

    backup_params = EMA(0)
    for name, param in model.named_parameters():
        if param.requires_grad:
            backup_params.register(name, param.data)
            param.data.copy_(ema.get(name))

    tot = 0
    for batch in tqdm(iter(data.dev_iter), total=1000):
        p1, p2 = model(batch)
        orig_answer, orig_s_idx, orig_e_idx = write_to_ans(p1, p2, batch, answers)
        batch_loss = criterion(p1, batch.s_idx) + criterion(p2, batch.e_idx)
        loss += batch_loss.item()

        append_info = append_random_input(batch)
        allow_idxs = append_info['allow_idx']
        batch_start_target = torch.LongTensor([0]).to(device)
        batch_end_target = torch.LongTensor([0]).to(device)

        input_embedding = model.word_emb(batch.c_word[0])
        cw_mask = np.zeros(input_embedding.shape).astype(np.float32)
        cw_mask = torch.from_numpy(cw_mask).float().to(device)
        for bi, allow_idx in enumerate(allow_idxs):
            cw_mask[bi, np.array(allow_idx)] = 1

        cw.wv = model.word_emb.weight
        cw.inputs = batch
        cw.mask = cw_mask
        cw.batch_info = append_info
        cw.num_classes = append_info['tot_length']
        # print(transform(to_list(batch.c_word[0][0])))
        cw.run(model, input_embedding, (batch_start_target, batch_end_target))

        # re-test
        for bi, allow_idx in enumerate(allow_idxs):
            if bi in cw.o_best_sent:
                for i, idx in enumerate(allow_idx):
                    batch.c_word[0].data[bi, idx] = cw.o_best_sent[bi][i]
        p1, p2 = model(batch)
        adv_answer, adv_s_idx, adv_e_idx = write_to_ans(p1, p2, batch, adv_answers)
        batch_loss = criterion(p1, batch.s_idx) + criterion(p2, batch.e_idx)
        adv_loss += batch_loss.item()

        for bi, (start_target, end_target) in enumerate(zip(batch_start_target, batch_end_target)):
            start_output = adv_s_idx
            end_output = adv_e_idx
            targeted_success += int(compare(start_output, start_target.item(),
                                            end_output, end_target.item()))
            untargeted_success += int(compare_untargeted(start_output, start_target.item(),
                                                         end_output, end_target.item()))

        for i in range(len(allow_idxs)):
            try:
                logger.info(("adv:", transform(cw.o_best_sent[i])))
                adv_text.append({'added_text': transform(cw.o_best_sent[i]),
                                 'adv_text': transform(to_list(batch.c_word[0][0])),
                                 'qas_id': batch.id[i],
                                 'adv_predict': (orig_s_idx, orig_e_idx),
                                 'orig_predict': (adv_s_idx, adv_e_idx),
                                 'Orig answer:': orig_answer,
                                 'Adv answer:': adv_answer})
                joblib.dump(adv_text, root_dir + '/adv_text.pkl')
            except:
                adv_text.append({'adv_text': transform(to_list(batch.c_word[0][0])),
                                 'qas_id': batch.id[i],
                                 'adv_predict': (orig_s_idx, orig_e_idx),
                                 'orig_predict': (adv_s_idx, adv_e_idx),
                                 'Orig answer:': orig_answer,
                                 'Adv answer:': adv_answer})
                joblib.dump(adv_text, root_dir + '/adv_text.pkl')
                continue

        # for batch size = 1
        tot += 1
        logger.info(("orig predict", (orig_s_idx, orig_e_idx)))
        logger.info(("adv append predict", (adv_s_idx, adv_e_idx)))
        logger.info(("targeted successful rate:", targeted_success))
        logger.info(("untargeted successful rate:", untargeted_success))
        logger.info(("Orig answer:", orig_answer))
        logger.info(("Adv answer:", adv_answer))
        logger.info(("tot:", tot))

    for name, param in model.named_parameters():
        if param.requires_grad:
            param.data.copy_(backup_params.get(name))

    with open(options.prediction_file, 'w', encoding='utf-8') as f:
        print(json.dumps(answers), file=f)
    with open(options.prediction_file + '_adv.json', 'w', encoding='utf-8') as f:
        print(json.dumps(adv_answers), file=f)

    results = evaluate.main(options)
    logger.info(tot)
    logger.info(("adv loss, results['exact_match'], results['f1']",
                 loss, results['exact_match'], results['f1']))
    return loss, results['exact_match'], results['f1']
question_append_sentences = joblib.load('sampled_perturb_question_sentences.pkl')

model = BiDAF(options, data.WORD.vocab.vectors).to(device)
if options.old_model is not None:
    model.load_state_dict(
        torch.load(options.old_model, map_location="cuda:{}".format(options.gpu)))
if options.old_ema is not None:
    # ema = pickle.load(open(options.old_ema, "rb"))
    ema = torch.load(options.old_ema, map_location=device)
else:
    ema = EMA(options.exp_decay_rate)
    for name, param in model.named_parameters():
        if param.requires_grad:
            ema.register(name, param.data)

torch.manual_seed(args.seed)
if torch.cuda.is_available():
    if not args.cuda:
        print("WARNING: You have a CUDA device, so you should probably run with --cuda")
    else:
        torch.cuda.manual_seed(args.seed)
random.seed(args.seed)

if args.model == 'word_attack':
    # dev_loss, dev_exact, dev_f1 = cw_word_attack()
    # dev_loss, dev_exact, dev_f1 = cw_word_attack_target()
    dev_loss, dev_exact, dev_f1 = cw_random_word_attack()
def train(args):
    db = Data(args)
    # db.build_vocab()  # each run of build_vocab may assign different ids to words with the same frequency
    db.load_vocab()
    db.build_dataset()  # builds train_loader

    model = BiDAF(args)
    if args.cuda:
        model = model.cuda()
    if args.ema:
        ema = EMA(0.999)
        print("Register EMA ...")
        for name, param in model.named_parameters():
            if param.requires_grad:
                ema.register(name, param.data)

    init_lr = args.init_lr
    optimizer = torch.optim.Adam(params=model.parameters(), lr=init_lr)
    lr = init_lr
    batch_step = args.batch_step
    loss_fn = nn.CrossEntropyLoss()
    logger = Logger('./logs')
    step = 0
    valid_raw_article_list = db.valid_raw_article_list
    valid_answer_list = db.valid_answer_list

    print('========== Train ==============')
    for epoch in range(args.epoch_num):
        print('---Epoch', epoch, "lr:", lr)
        running_loss = 0.0
        count = 0
        print("len(db.train_loader):", len(db.train_loader))
        for article, question, answer_span, _ in db.train_loader:
            if args.cuda:
                article, question, answer_span = article.cuda(), question.cuda(), answer_span.cuda()
            p1, p2 = model(article, question)
            loss_p1 = loss_fn(p1, answer_span.transpose(0, 1)[0])
            loss_p2 = loss_fn(p2, answer_span.transpose(0, 1)[1])
            running_loss += loss_p1.item()
            running_loss += loss_p2.item()

            optimizer.zero_grad()
            (loss_p1 + loss_p2).backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 2)
            optimizer.step()

            if args.ema:
                for name, param in model.named_parameters():
                    if param.requires_grad:
                        param.data = ema(name, param.data)

            count += 1
            if count % batch_step == 0:
                rep_str = '[{}] Epoch {}, loss: {:.3f}'
                print(rep_str.format(datetime.datetime.now().strftime('%Y%m%d-%H%M%S'),
                                     epoch, running_loss / batch_step))
                info = {'loss': running_loss / batch_step}
                running_loss = 0.0
                count = 0

                # 1. Log scalar values (scalar summary)
                for tag, value in info.items():
                    logger.scalar_summary(tag, value, step + 1)

                # 2. Log values and gradients of the parameters (histogram summary)
                for tag, value in model.named_parameters():
                    tag = tag.replace('.', '/')
                    logger.histo_summary(tag, value.data.cpu().numpy(), step + 1)
                    logger.histo_summary(tag + '/grad', value.grad.data.cpu().numpy(), step + 1)
            step += 1

        # validation set
        if args.with_valid:
            print('======== Epoch {} result ========'.format(epoch))
            print("len(db.valid_loader):", len(db.valid_loader))
            valid_result = []
            idx = 0
            for article, question, _ in db.valid_loader:
                if args.cuda:
                    article, question = article.cuda(), question.cuda()
                p1, p2 = model(article, question, is_trainning=False)
                _, p1_predicted = torch.max(p1.cpu().data, 1)
                _, p2_predicted = torch.max(p2.cpu().data, 1)
                p1_predicted = p1_predicted.numpy().tolist()
                p2_predicted = p2_predicted.numpy().tolist()
                for _p1, _p2, _raw_article, _answer in zip(
                        p1_predicted, p2_predicted,
                        valid_raw_article_list[idx:idx + len(p1_predicted)],
                        valid_answer_list[idx:idx + len(p1_predicted)]):
                    valid_result.append({
                        "ref_answer": _answer,
                        "cand_answer": "".join(_raw_article[_p1:_p2 + 1])
                    })
                idx = idx + len(p1_predicted)
            rouge_score = test_score(valid_result)
            info = {'rouge_score': rouge_score}
            for tag, value in info.items():
                logger.scalar_summary(tag, value, epoch + 1)

        lr = max(0.00001, init_lr * 0.9 ** (epoch + 1))
        print("lr:", lr)
        parameters = filter(lambda param: param.requires_grad, model.parameters())
        optimizer = torch.optim.Adam(params=parameters, lr=lr, weight_decay=1e-7)
        # print(len(db.valid_loader))
        if epoch >= 1 and args.saved_model_file:
            torch.save(model.state_dict(), args.saved_model_file + "_epoch_" + str(epoch))
            print("saved model")
def train(args, data):
    device = torch.device("cuda:{}".format(args.gpu) if torch.cuda.is_available() else "cpu")
    model = BiDAF(args, data.WORD.vocab.vectors).to(device)

    ema = EMA(args.exp_decay_rate)
    for name, param in model.named_parameters():
        if param.requires_grad:
            ema.register(name, param.data)
    parameters = filter(lambda p: p.requires_grad, model.parameters())
    optimizer = optim.Adadelta(parameters, lr=args.learning_rate)
    criterion = nn.CrossEntropyLoss()

    writer = SummaryWriter(log_dir='runs/' + args.model_time)

    model.train()
    loss, last_epoch = 0, -1
    max_dev_exact, max_dev_f1 = -1, -1

    iterator = data.train_iter
    for i, batch in enumerate(iterator):
        present_epoch = int(iterator.epoch)
        if present_epoch == args.epoch:
            break
        if present_epoch > last_epoch:
            print('epoch:', present_epoch + 1)
        last_epoch = present_epoch

        p1, p2 = model(batch)
        optimizer.zero_grad()
        batch_loss = criterion(p1, batch.s_idx) + criterion(p2, batch.e_idx)
        loss += batch_loss.item()
        batch_loss.backward()
        optimizer.step()

        for name, param in model.named_parameters():
            if param.requires_grad:
                ema.update(name, param.data)

        if (i + 1) % args.print_freq == 0:
            dev_loss, dev_exact, dev_f1 = test(model, ema, args, data)
            c = (i + 1) // args.print_freq

            writer.add_scalar('loss/train', loss, c)
            writer.add_scalar('loss/dev', dev_loss, c)
            writer.add_scalar('exact_match/dev', dev_exact, c)
            writer.add_scalar('f1/dev', dev_f1, c)
            print('train loss: {} / dev loss: {}'.format(loss, dev_loss) +
                  ' / dev EM: {} / dev F1: {}'.format(dev_exact, dev_f1))

            if dev_f1 > max_dev_f1:
                max_dev_f1 = dev_f1
                max_dev_exact = dev_exact
                best_model = copy.deepcopy(model)

            loss = 0
            model.train()

    writer.close()
    print('max dev EM: {} / max dev F1: {}'.format(max_dev_exact, max_dev_f1))

    return best_model
def train(args):
    db = Data(args)
    # db.build_vocab()  # each run of build_vocab may assign different ids to words with the same frequency
    db.load_vocab()
    db.build_dataset()  # builds train_loader

    # model = BiDAF(args)
    model = SLQA(args)
    first_model = "./checkpoints/SLQA_elmo_epoch_0"
    model.load_state_dict(torch.load(first_model))
    if args.cuda:
        model = model.cuda()
    if args.ema:
        ema = EMA(0.999)
        print("Register EMA ...")
        for name, param in model.named_parameters():
            if param.requires_grad:
                ema.register(name, param.data)

    init_lr = args.init_lr
    parameters = filter(lambda param: param.requires_grad, model.parameters())
    weight_decay = 1e-6
    weight_decay = 0
    optimizer = torch.optim.Adam(params=parameters, lr=init_lr, weight_decay=weight_decay)
    batch_step = args.batch_step
    loss_fn = nn.CrossEntropyLoss()
    logger = Logger('./logs')
    step = 0
    train_raw_article_list = db.train_raw_article_list
    train_raw_question_list = db.train_raw_question_list
    valid_raw_article_list = db.valid_raw_article_list
    valid_answer_list = db.valid_answer_list
    valid_raw_question_list = db.valid_raw_question_list
    # question_hdf5_f = h5py.File(args.question_hdf5_path, "r")
    # article_hdf5_f = h5py.File(args.article_hdf5_path, "r")

    print('========== Train ==============')
    for epoch in range(args.epoch_num):
        print('---Epoch', epoch)
        running_loss = 0.0
        count = 0
        print("len(db.train_loader):", len(db.train_loader))
        train_idx = 0
        for batch_id, (article, question, answer_span, _) in enumerate(db.train_loader):
            if args.cuda:
                article, question, answer_span = article.cuda(), question.cuda(), answer_span.cuda()
            # tmp_train_raw_article_list = train_raw_article_list[train_idx:train_idx + question.size()[0]]
            # tmp_train_raw_question_list = train_raw_question_list[train_idx:train_idx + question.size()[0]]
            # question_elmo = gen_elmo_by_text(question_hdf5_f, tmp_train_raw_question_list, args.max_question_len)
            # article_elmo = gen_elmo_by_text(article_hdf5_f, tmp_train_raw_article_list, args.max_article_len)
            # pickle.dump((article_elmo, question_elmo), open(elmo_save_path, "wb"))
            elmo_save_path = "/backup231/lhliu/jszn/elmo/" + str(batch_id) + ".pkl"
            article_elmo, question_elmo = pickle.load(open(elmo_save_path, "rb"))
            # print(elmo_save_path)
            article_elmo = torch.tensor(article_elmo, dtype=torch.float)
            question_elmo = torch.tensor(question_elmo, dtype=torch.float)
            # train_idx += question.size()[0]
            # continue
            if args.cuda:
                question_elmo = question_elmo.cuda()
                article_elmo = article_elmo.cuda()

            p1, p2 = model(article, question,
                           article_elmo=article_elmo, question_elmo=question_elmo)
            loss_p1 = loss_fn(p1, answer_span.transpose(0, 1)[0])
            loss_p2 = loss_fn(p2, answer_span.transpose(0, 1)[1])
            running_loss += loss_p1.item()
            running_loss += loss_p2.item()

            optimizer.zero_grad()
            (loss_p1 + loss_p2).backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 2)
            optimizer.step()

            if args.ema:
                for name, param in model.named_parameters():
                    if param.requires_grad:
                        param.data = ema(name, param.data)

            count += 1
            if count % batch_step == 0:
                rep_str = '[{}] Epoch {}, loss: {:.3f}'
                print(rep_str.format(datetime.datetime.now().strftime('%Y%m%d-%H%M%S'),
                                     epoch, running_loss / batch_step))
                # info = {'loss': running_loss / batch_step}
                running_loss = 0.0
                count = 0
                # # 1. Log scalar values (scalar summary)
                # for tag, value in info.items():
                #     logger.scalar_summary(tag, value, step + 1)
                # # 2. Log values and gradients of the parameters (histogram summary)
                # for tag, value in model.named_parameters():
                #     tag = tag.replace('.', '/')
                #     logger.histo_summary(tag, value.data.cpu().numpy(), step + 1)
                #     logger.histo_summary(tag + '/grad', value.grad.data.cpu().numpy(), step + 1)
            step += 1
            # break

        # validation set
        if args.with_valid:
            print('======== Epoch {} result ========'.format(epoch))
            print("len(db.valid_loader):", len(db.valid_loader))
            valid_result = []
            idx = 0
            for article, question, _ in db.valid_loader:
                if args.cuda:
                    article, question = article.cuda(), question.cuda()
                tmp_valid_raw_article_list = valid_raw_article_list[idx:idx + question.size()[0]]
                tmp_valid_raw_question_list = valid_raw_question_list[idx:idx + question.size()[0]]
                question_elmo = gen_elmo_by_text(question_hdf5_f, tmp_valid_raw_question_list,
                                                 args.max_question_len)
                article_elmo = gen_elmo_by_text(article_hdf5_f, tmp_valid_raw_article_list,
                                                args.max_article_len)
                if args.cuda:
                    question_elmo = question_elmo.cuda()
                    article_elmo = article_elmo.cuda()
                p1, p2 = model(article, question, article_elmo, question_elmo, is_training=False)
                _, p1_predicted = torch.max(p1.cpu().data, 1)
                _, p2_predicted = torch.max(p2.cpu().data, 1)
                p1_predicted = p1_predicted.numpy().tolist()
                p2_predicted = p2_predicted.numpy().tolist()
                assert question.size()[0] == len(p1_predicted)
                for _p1, _p2, _raw_article, _answer in zip(
                        p1_predicted, p2_predicted,
                        valid_raw_article_list[idx:idx + len(p1_predicted)],
                        valid_answer_list[idx:idx + len(p1_predicted)]):
                    valid_result.append({
                        "ref_answer": _answer,
                        "cand_answer": "".join(_raw_article[_p1:_p2 + 1])
                    })
                idx = idx + len(p1_predicted)
            rouge_score = test_score(valid_result)
            info = {'rouge_score': rouge_score}
            for tag, value in info.items():
                logger.scalar_summary(tag, value, epoch + 1)

        # lr = init_lr
        lr = max(0.00001, init_lr * 0.9 ** (epoch + 1))  # decide whether to keep this decay
        print("lr:", lr)
        parameters = filter(lambda param: param.requires_grad, model.parameters())
        optimizer = torch.optim.Adam(params=parameters, lr=lr, weight_decay=weight_decay)
        # print(len(db.valid_loader))
        if epoch >= 0 and args.saved_model_file:
            torch.save(model.state_dict(), args.saved_model_file + "_epoch_" + str(epoch))
            print("saved model")
def train(args, data):
    if args.load_model != "":
        model = BiDAF(args, data.WORD.vocab.vectors)
        model.load_state_dict(torch.load(args.load_model))
    else:
        model = BiDAF(args, data.WORD.vocab.vectors)
    device = torch.device(f"cuda:{args.gpu}" if torch.cuda.is_available() else "cpu")
    model = model.to(device)

    ema = EMA(args.exp_decay_rate)
    for name, param in model.named_parameters():
        if param.requires_grad:
            ema.register(name, param.data)
    for name, i in model.named_parameters():
        if not i.is_leaf:
            print(name, i)

    writer = SummaryWriter(log_dir='runs/' + args.model_name)
    best_model = None

    for iterator, dev_iter, dev_file_name, index, print_freq, lr in zip(
            data.train_iter, data.dev_iter, args.dev_files,
            range(len(data.train)), args.print_freq, args.learning_rate):
        # print(iterator[0])
        # embed()
        # exit(0)
        optimizer = optim.Adadelta(model.parameters(), lr=lr)
        criterion = nn.CrossEntropyLoss()
        model.train()
        loss, last_epoch = 0, 0
        max_dev_exact, max_dev_f1 = -1, -1
        print(f"Training with {dev_file_name}")
        print()

        for i, batch in tqdm(enumerate(iterator),
                             total=len(iterator) * args.epoch[index], ncols=100):
            present_epoch = int(iterator.epoch)
            eva = False
            if present_epoch == args.epoch[index]:
                break
            if present_epoch > last_epoch:
                print('epoch:', present_epoch + 1)
                eva = True
            last_epoch = present_epoch

            p1, p2 = model(batch)
            optimizer.zero_grad()
            batch_loss = criterion(p1, batch.s_idx) + criterion(p2, batch.e_idx)
            loss += batch_loss.item()
            batch_loss.backward()
            optimizer.step()

            for name, param in model.named_parameters():
                if param.requires_grad:
                    ema.update(name, param.data)
            torch.cuda.empty_cache()

            if (i + 1) % print_freq == 0 or eva:
                dev_loss, dev_exact, dev_f1 = test(model, ema, args, data,
                                                   dev_iter, dev_file_name)
                c = (i + 1) // print_freq

                writer.add_scalar('loss/train', loss, c)
                writer.add_scalar('loss/dev', dev_loss, c)
                writer.add_scalar('exact_match/dev', dev_exact, c)
                writer.add_scalar('f1/dev', dev_f1, c)
                print()
                print(f'train loss: {loss:.3f} / dev loss: {dev_loss:.3f}'
                      f' / dev EM: {dev_exact:.3f} / dev F1: {dev_f1:.3f}')

                if dev_f1 > max_dev_f1:
                    max_dev_f1 = dev_f1
                    max_dev_exact = dev_exact
                    best_model = copy.deepcopy(model)

                loss = 0
                model.train()

    writer.close()
    print(f'max dev EM: {max_dev_exact:.3f} / max dev F1: {max_dev_f1:.3f}')

    print("testing with test batch on best model")
    test_loss, test_exact, test_f1 = test(best_model, ema, args, data,
                                          list(data.test_iter)[-1], args.test_files[-1])
    print(f'test loss: {test_loss:.3f}'
          f' / test EM: {test_exact:.3f} / test F1: {test_f1:.3f}')

    return best_model
def train(args, data):
    device = torch.device(f"cuda:{args.gpu}" if torch.cuda.is_available() else "cpu")
    model = BiDAF(args, data.CONTEXT_WORD.vocab.vectors).to(device)
    num = count_parameters(model)
    print(f'parameters: {num}')
    if torch.cuda.device_count() > 1:
        print("Let's use", torch.cuda.device_count(), "GPUs!")
        model = nn.DataParallel(model)

    ema = EMA(args.exp_decay_rate)
    for name, param in model.named_parameters():
        if param.requires_grad:
            ema.register(name, param.data)
    parameters = filter(lambda p: p.requires_grad, model.parameters())
    optimizer = optim.Adadelta(parameters, lr=args.learning_rate)
    criterion = nn.CrossEntropyLoss()

    writer = SummaryWriter(log_dir='runs/' + args.model_time)

    model.train()
    loss, last_epoch = 0, -1
    max_dev_exact, max_dev_f1 = -1, -1
    print('total {} epochs'.format(args.epoch))
    sys.stdout.flush()

    iterator = data.train_iter
    iterator.repeat = True
    for i, batch in enumerate(iterator):
        present_epoch = int(iterator.epoch)
        if present_epoch == args.epoch:
            print('present_epoch value:', present_epoch)
            break
        if present_epoch > last_epoch:
            print('epoch:', present_epoch + 1)
        last_epoch = present_epoch

        p1, p2 = model(batch.c_char, batch.q_char,
                       batch.c_word[0], batch.q_word[0],
                       batch.c_word[1], batch.q_word[1])
        optimizer.zero_grad()
        batch_loss = criterion(p1, batch.s_idx) + criterion(p2, batch.e_idx)
        loss += batch_loss.item()
        batch_loss.backward()
        optimizer.step()

        for name, param in model.named_parameters():
            if param.requires_grad:
                ema.update(name, param.data)

        if (i + 1) % args.print_freq == 0:
            (dev_loss, dev_exact, dev_f1, dev_hasans_exact, dev_hasans_f1,
             dev_noans_exact, dev_noans_f1) = test(model, ema, args, data)
            c = (i + 1) // args.print_freq

            writer.add_scalar('loss/train', loss, c)
            writer.add_scalar('loss/dev', dev_loss, c)
            writer.add_scalar('exact_match/dev', dev_exact, c)
            writer.add_scalar('f1/dev', dev_f1, c)
            print(f'train loss: {loss:.3f} / dev loss: {dev_loss:.3f}'
                  f' / dev EM: {dev_exact:.3f} / dev F1: {dev_f1:.3f}'
                  f' / dev hasans EM: {dev_hasans_exact} / dev hasans F1: {dev_hasans_f1}'
                  f' / dev noans EM: {dev_noans_exact} / dev noans F1: {dev_noans_f1}')

            if dev_f1 > max_dev_f1:
                max_dev_f1 = dev_f1
                max_dev_exact = dev_exact
                best_model = copy.deepcopy(model)

            loss = 0
            model.train()
            sys.stdout.flush()

    writer.close()
    args.max_f1 = max_dev_f1
    print(f'max dev EM: {max_dev_exact:.3f} / max dev F1: {max_dev_f1:.3f}')

    return best_model
def train(args, data):
    device = torch.device(f"cuda:{args.gpu}" if torch.cuda.is_available() else "cpu")
    model = BiDAF(args, data.WORD.vocab.vectors).to(device)

    ema = EMA(args.exp_decay_rate)
    for name, param in model.named_parameters():
        if param.requires_grad:
            ema.register(name, param.data)
    parameters = filter(lambda p: p.requires_grad, model.parameters())
    optimizer = optim.Adadelta(parameters, lr=args.learning_rate)
    criterion = nn.CrossEntropyLoss()

    writer = SummaryWriter(log_dir='runs/' + args.model_time)

    model.train()
    loss, last_epoch = 0, -1
    max_dev_exact, max_dev_f1 = -1, -1

    iterator = data.train_iter
    num_batch = len(iterator)
    for present_epoch in range(args.epoch):
        print('epoch', present_epoch + 1)
        for i, batch in enumerate(iterator):
            # present_epoch = int(iterator.epoch)
            """
            if present_epoch == args.epoch:
                print(present_epoch)
                print()
                print(args.epoch)
                break
            if present_epoch > last_epoch:
                print('epoch:', present_epoch + 1)
            last_epoch = present_epoch
            """
            p1, p2 = model(batch)
            optimizer.zero_grad()
            """
            print(p1)
            print()
            print(batch.s_idx)
            """
            if len(p1.size()) == 1:
                p1 = p1.reshape(1, -1)
            if len(p2.size()) == 1:
                p2 = p2.reshape(1, -1)
            batch_loss = criterion(p1, batch.s_idx) + criterion(p2, batch.e_idx)
            loss += batch_loss.item()
            batch_loss.backward()
            optimizer.step()

            for name, param in model.named_parameters():
                if param.requires_grad:
                    ema.update(name, param.data)
            best_model = copy.deepcopy(model)

            if i + 1 == num_batch:
                dev_loss, dev_exact, dev_f1 = test(model, ema, args, data)
                c = (i + 1) // args.print_freq

                writer.add_scalar('loss/train', loss / num_batch, c)
                writer.add_scalar('loss/dev', dev_loss, c)
                writer.add_scalar('exact_match/dev', dev_exact, c)
                writer.add_scalar('f1/dev', dev_f1, c)
                print(f'train loss: {loss/num_batch:.3f} / dev loss: {dev_loss:.3f}'
                      f' / dev EM: {dev_exact:.3f} / dev F1: {dev_f1:.3f}')

                if dev_f1 > max_dev_f1:
                    max_dev_f1 = dev_f1
                    max_dev_exact = dev_exact
                    best_model = copy.deepcopy(model)

                loss = 0
                model.train()

    writer.close()
    print(f'max dev EM: {max_dev_exact:.3f} / max dev F1: {max_dev_f1:.3f}')

    return best_model
def train(args, data):
    device = torch.device(f"cuda:{args.gpu}" if torch.cuda.is_available() else "cpu")
    model = BiDAF(args).to(device)
    D_batch = args.train_batch_size

    ema = EMA(args.exp_decay_rate)
    for name, param in model.named_parameters():
        if param.requires_grad:
            ema.register(name, param.data)
    parameters = filter(lambda p: p.requires_grad, model.parameters())
    optimizer = optim.Adadelta(parameters, lr=args.learning_rate)
    criterion = nn.CrossEntropyLoss()

    # writer = SummaryWriter(log_dir='runs/' + args.model_time)

    model.train()
    loss, last_epoch = 0, -1
    max_dev_exact, max_dev_f1 = -1, -1
    best_model = model  # fallback: the dev-eval block below is disabled, so keep a handle to return

    i = 0
    # iterator = data.train_iter
    while i + D_batch < len(data.data):
        b_id = i
        e_id = i + D_batch
        # present_epoch = int(iterator.epoch)
        # if present_epoch == args.epoch:
        #     break
        # if present_epoch > last_epoch:
        #     print('epoch:', present_epoch + 1)
        # last_epoch = present_epoch

        p1, p2 = model(data, b_id, e_id)
        optimizer.zero_grad()
        s_idx, e_idx = data.get_targ(b_id, e_id)
        batch_loss = criterion(p1, s_idx) + criterion(p2, e_idx)
        loss += batch_loss.item()
        batch_loss.backward()
        optimizer.step()

        for name, param in model.named_parameters():
            if param.requires_grad:
                ema.update(name, param.data)

        # if (i + 1) % args.print_freq == 0:
        #     dev_loss, dev_exact, dev_f1 = test(model, ema, args, data)
        #     c = (i + 1) // args.print_freq
        #     # writer.add_scalar('loss/train', loss, c)
        #     # writer.add_scalar('loss/dev', dev_loss, c)
        #     # writer.add_scalar('exact_match/dev', dev_exact, c)
        #     # writer.add_scalar('f1/dev', dev_f1, c)
        #     # print(f'train loss: {loss:.3f} / dev loss: {dev_loss:.3f}'
        #     #       f' / dev EM: {dev_exact:.3f} / dev F1: {dev_f1:.3f}')
        #     if dev_f1 > max_dev_f1:
        #         max_dev_f1 = dev_f1
        #         max_dev_exact = dev_exact
        #         best_model = copy.deepcopy(model)
        #     loss = 0
        #     model.train()

        i += D_batch

    # writer.close()
    print(f'max dev EM: {max_dev_exact:.3f} / max dev F1: {max_dev_f1:.3f}')

    return best_model