def evaluate(model, X_test, best_result, patience, is_test=False):
    y_pred, y_label = predict(model, X_test)
    metrics = Metrics(args.score_file_path)
    with open(args.score_file_path, 'w') as output:
        for score, label in zip(y_pred, y_label):
            output.write(str(score) + '\t' + str(label) + '\n')
    result = metrics.evaluate_all_metrics()
    if not is_test and result[0] + result[1] + result[2] > best_result[0] + best_result[1] + best_result[2]:
        # tqdm.write("save model!!!")
        best_result = result
        tqdm.write("Best Result: R1: %.4f R2: %.4f R5: %.4f" %
                   (best_result[0], best_result[1], best_result[2]))
        logger.info("Best Result: R1: %.4f R2: %.4f R5: %.4f" %
                    (best_result[0], best_result[1], best_result[2]))
        model_to_save = model.module if hasattr(model, 'module') else model
        torch.save(model_to_save.state_dict(), args.save_path)
    else:
        patience += 1
    if is_test:
        print("Best Result: R1: %.4f R2: %.4f R5: %.4f" %
              (best_result[0], best_result[1], best_result[2]))
    return best_result, patience
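# evaluate() above only writes "score \t label" lines and delegates metric computation
# to Metrics. As a rough, hedged illustration of what such a score file supports, here
# is a minimal, self-contained sketch of R@k over it; the group size `n_candidates` is
# an assumption (response-selection test sets usually score a fixed number of
# candidates per context), not something this repo defines.
def recall_at_k_sketch(score_file_path, k, n_candidates=10):
    """Hypothetical helper: fraction of candidate groups whose positive response
    lands in the top-k by score. Assumes one positive per group of n_candidates."""
    pairs = []
    with open(score_file_path) as f:
        for line in f:
            score, label = line.strip().split('\t')
            pairs.append((float(score), float(label)))
    hits, groups = 0, 0
    for start in range(0, len(pairs), n_candidates):
        group = pairs[start:start + n_candidates]
        if len(group) < n_candidates:
            break
        ranked = sorted(group, key=lambda x: x[0], reverse=True)
        if any(label == 1.0 for _, label in ranked[:k]):
            hits += 1
        groups += 1
    return hits / groups if groups else 0.0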
class NeuralNetwork(nn.Module):
    def __init__(self):
        super(NeuralNetwork, self).__init__()
        # NOTE: self.args is assumed to be set by the subclass before super().__init__() is called.
        self.patience = 0
        self.init_clip_max_norm = 2.0  # with BertAdam this collapses entirely
        self.optimizer = None
        self.best_result = [0, 0, 0, 0, 0, 0]
        self.metrics = Metrics(self.args.score_file_path)
        self.device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

    def forward(self):
        raise NotImplementedError

    def convert_examples_to_features(self, X_train_utterances, X_train_responses, tokenizer,
                                     Utterance_len, Response_len):
        """
        Loads a data file into a list of `InputBatch`s.
        `cls_token_at_end` defines the location of the CLS token:
            - False (Default, BERT/XLM pattern): [CLS] + A + [SEP] + B + [SEP]
            - True (XLNet/GPT pattern): A + [SEP] + B + [SEP] + [CLS]
        `cls_token_segment_id` defines the segment id associated with the CLS token
        (0 for BERT, 2 for XLNet).
        """
        maxbertlen = 256
        features = []
        for (ex_index, (utterances, response, length_utter)) in enumerate(
                zip(X_train_utterances, X_train_responses, Utterance_len)):
            if ex_index % 10000 == 0:
                logger.info("Writing example %d of %d" % (ex_index, len(X_train_utterances)))

            # Read the token-index sequences and attach the special tokens
            # ([CLS]/[SEP], with _eos_ between utterances).
            tokens_a = []
            utterlen = []
            for utterance in utterances:
                utterlen.append(len(utterance))
                tokens_a = tokens_a + utterance + [tokenizer.convert_tokens_to_ids("_eos_")]
            # print(tokenizer.convert_ids_to_tokens(tokens_a))
            tokens_a = [tokenizer.cls_token_id] + tokens_a + [tokenizer.sep_token_id]

            # if len(response) > 51:  # token-level, so the response can exceed this
            #     print("something wrong")
            tokens_b = response + [tokenizer.sep_token_id]
            utterlen.append(len(response))
            if len(utterlen) != 11:  # pad the length list when some utterances are missing entirely
                utterlen = [0] * (11 - len(utterlen)) + utterlen

            input_ids = tokens_a + tokens_b
            if len(input_ids) > maxbertlen:
                input_ids = [tokenizer.cls_token_id] + input_ids[-maxbertlen + 1:]

            # The convention in BERT is:
            # (a) For sequence pairs:
            #  tokens:   [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]
            #  type_ids:   0   0  0    0    0     0       0   0   1  1  1  1   1   1
            # (b) For single sequences:
            #  tokens:   [CLS] the dog is hairy . [SEP]
            #  type_ids:   0   0   0   0  0     0   0
            #
            # Where "type_ids" are used to indicate whether this is the first
            # sequence or the second sequence. The embedding vectors for `type=0` and
            # `type=1` were learned during pre-training and are added to the wordpiece
            # embedding vector (and position vector). This is not *strictly* necessary
            # since the [SEP] token unambiguously separates the sequences, but it makes
            # it easier for the model to learn the concept of sequences.
            #
            # For classification tasks, the first vector (corresponding to [CLS]) is
            # used as the "sentence vector". Note that this only makes sense because
            # the entire model is fine-tuned.
            segment_ids = [0] * (len(input_ids) - len(tokens_b))  # the concatenated context
            segment_ids += [1] * len(tokens_b)  # the response

            if len(input_ids) > 350:
                print(len(input_ids))

            # The mask has 1 for real tokens and 0 for padding tokens. Only real
            # tokens are attended to.
            input_mask = [1] * len(input_ids)

            # Zero-pad up to the sequence length (the padding id is 0 for BERT).
            padding_length = maxbertlen - len(input_ids)
            if padding_length > 0:
                input_ids = input_ids + ([tokenizer.pad_token_id] * padding_length)
                input_mask = input_mask + ([tokenizer.pad_token_id] * padding_length)
                segment_ids = segment_ids + ([tokenizer.pad_token_id] * padding_length)
            # assert len(input_ids) == 256
            # assert len(input_mask) == 256
            # assert len(segment_ids) == 256

            if ex_index < 1:
                logger.info("*** Example ***")
                logger.info("tokens_idx: %s" % " ".join([str(x) for x in input_ids]))
                logger.info("input_ids: %s" % " ".join([str(x) for x in input_ids]))
                logger.info("input_mask: %s" % " ".join([str(x) for x in input_mask]))
                logger.info("segment_ids: %s" % " ".join([str(x) for x in segment_ids]))

            features.append(
                InputFeatures(input_ids=input_ids,
                              input_mask=input_mask,
                              segment_ids=segment_ids,
                              utter_len=utterlen))

        batch_length = []
        for Ui_len, ri_len in zip(Utterance_len, Response_len):
            length = []
            for uij_len in Ui_len:
                length += uij_len
            length += ri_len
            length += [0] * (300 - len(length))
            if len(length) > 300:
                # print("exceeds 300")
                length = length[-300:]
            batch_length.append(length)
        return features, batch_length

    def train_step(self, i, data):
        with torch.no_grad():
            batch_ids, batch_mask, batch_seg, batch_utterlen, batch_y, batch_word_len = (
                item.cuda(device=self.device) for item in data)
        self.optimizer.zero_grad()
        logits = self.forward([batch_ids, batch_mask, batch_seg, batch_utterlen, batch_word_len])
        loss = self.loss_func(logits, target=batch_y)
        loss.backward()
        self.optimizer.step()
        if i % 10 == 0:
            print('Batch[{}] - loss: {:.6f} batch_size:{}'.format(i, loss.item(), batch_y.size(0)))
        return loss

    def fit(self, X_train_utterances, X_train_responses, y_train,  # this is the main entry point
            X_dev_utterances, X_dev_responses, y_dev, tokenizer,
            B_train_utterances_len, B_train_responses_len,
            B_dev_utterances_len, B_dev_responses_len):

        if torch.cuda.is_available():
            self.cuda()

        features, length = self.convert_examples_to_features(
            X_train_utterances, X_train_responses, tokenizer,
            B_train_utterances_len, B_train_responses_len)
        all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in features], dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in features], dtype=torch.long)
        # one context-response set per example, so each feature holds a single length vector
        all_utterlen_ids = torch.tensor([f.utter_len for f in features], dtype=torch.long)
        y_labels = torch.FloatTensor(y_train)
        length = torch.IntTensor(length)
        dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids,
                                all_utterlen_ids, y_labels, length)
        # dataset = DialogueDataset(X_train_utterances, X_train_responses, y_train)
        # (these are still index lists; they only become tensors here)
        dataloader = DataLoader(dataset, batch_size=self.args.batch_size, shuffle=True)
        self.loss_func = nn.BCELoss()
        # self.optimizer = optim.Adam(self.parameters(), lr=self.args.learning_rate,
        #                             weight_decay=self.args.l2_reg)

        if self.args.no_bert is True:
            # Freeze BERT: only optimize the non-BERT parameters.
            optimizer_grouped_parameters = [
                {'params': [p for n, p in self.named_parameters() if 'bert_model' not in n]}
            ]
            print("BERT frozen")
            self.optimizer = AdamW(optimizer_grouped_parameters, lr=1e-3,
                                   weight_decay=self.args.l2_reg, correct_bias=True)
        else:
            no_decay = ['bias', 'LayerNorm.weight']
            optimizer_grouped_parameters = [
                {'params': [p for n, p in self.named_parameters()
                            if not any(nd in n for nd in no_decay)],
                 'weight_decay': self.args.l2_reg},
                {'params': [p for n, p in self.named_parameters()
                            if any(nd in n for nd in no_decay)],
                 'weight_decay': 0.0}
            ]
            print("fine-tuning BERT")
            self.optimizer = AdamW(optimizer_grouped_parameters, lr=self.args.learning_rate,
                                   weight_decay=self.args.l2_reg, correct_bias=True)

        for epoch in range(self.args.epochs):
            print("\nEpoch ", epoch + 1, "/", self.args.epochs)
            avg_loss = 0
            self.train()
            for i, data in enumerate(dataloader):  # the original batch size was 200
                loss = self.train_step(i, data)
                if i > 0 and i % 500000 == 0:  # 200 * 500 = 100k steps; the batch size is now 16
                    self.evaluate(X_dev_utterances, X_dev_responses, y_dev, tokenizer,
                                  B_dev_utterances_len, B_dev_responses_len)
                    self.train()
                if epoch >= 2 and self.patience >= 1:
                    print("Reload the best model...")
                    self.load_state_dict(torch.load(self.args.save_path))
                    if self.args.no_bert is True:
                        self.adjust_learning_rate(0.6)
                    else:
                        self.adjust_learning_rate(0.8)
                    self.patience = 0
                if self.init_clip_max_norm is not None:
                    utils.clip_grad_norm_(self.parameters(), max_norm=self.init_clip_max_norm)
                avg_loss += loss.item()
            cnt = len(y_train) // self.args.batch_size + 1
            print("Average loss:{:.6f} ".format(avg_loss / cnt))
            self.evaluate(X_dev_utterances, X_dev_responses, y_dev, tokenizer,
                          B_dev_utterances_len, B_dev_responses_len)

    def adjust_learning_rate(self, decay_rate=.8):
        for param_group in self.optimizer.param_groups:
            param_group['lr'] = param_group['lr'] * decay_rate
            self.args.learning_rate = param_group['lr']
        print("Decay learning rate to: ", self.args.learning_rate)

    def evaluate(self, X_dev_utterances, X_dev_responses, y_dev, tokenizer,
                 B_dev_utterances_len, B_dev_responses_len, is_test=False):
        y_pred = self.predict(X_dev_utterances, X_dev_responses, tokenizer,
                              B_dev_utterances_len, B_dev_responses_len)
        with open(self.args.score_file_path, 'w') as output:
            for score, label in zip(y_pred, y_dev):
                output.write(str(score) + '\t' + str(label) + '\n')
        result = self.metrics.evaluate_all_metrics()
        print("Evaluation Result: \n",
              "MAP:", result[0], "\t", "MRR:", result[1], "\t", "P@1:", result[2], "\t",
              "R1:", result[3], "\t", "R2:", result[4], "\t", "R5:", result[5])
        if not is_test and result[3] + result[4] + result[5] > \
                self.best_result[3] + self.best_result[4] + self.best_result[5]:
            print("Best Result: \n",
                  "MAP:", self.best_result[0], "\t", "MRR:", self.best_result[1], "\t",
                  "P@1:", self.best_result[2], "\t", "R1:", self.best_result[3], "\t",
                  "R2:", self.best_result[4], "\t", "R5:", self.best_result[5])
            self.patience = 0
            self.best_result = result
            torch.save(self.state_dict(), self.args.save_path)
            print("save model!!!\n")
        else:
            self.patience += 1

    def predict(self, X_dev_utterances, X_dev_responses, tokenizer,
                B_dev_utterances_len, B_dev_responses_len):
        self.eval()
        y_pred = []
        features, length = self.convert_examples_to_features(
            X_dev_utterances, X_dev_responses, tokenizer,
            B_dev_utterances_len, B_dev_responses_len)
        # for f in features:
        #     print(f.input_ids)
        all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in features], dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in features], dtype=torch.long)
        # one context-response set per feature
        all_utterlen_ids = torch.tensor([f.utter_len for f in features], dtype=torch.long)
        length = torch.IntTensor(length)
        dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids,
                                all_utterlen_ids, length)  # also adjusted to 10k examples here
        # dataset = DialogueDataset(X_dev_utterances, X_dev_responses)
        dataloader = DataLoader(dataset, batch_size=128)
        for i, data in enumerate(dataloader):
            with torch.no_grad():
                batch_ids, batch_mask, batch_seg, batch_utterlen, batch_word_len = (
                    item.cuda() for item in data)
                logits = self.forward([batch_ids, batch_mask, batch_seg,
                                       batch_utterlen, batch_word_len])
            if i % 10 == 0:
                print('Batch[{}] batch_size:{}'.format(i, batch_ids.size(0)))
            y_pred += logits.data.cpu().numpy().tolist()
        return y_pred

    def load_model(self, path):
        self.load_state_dict(state_dict=torch.load(path))
        if torch.cuda.is_available():
            self.cuda()
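# convert_examples_to_features() above packs each example as
#   [CLS] u_1 _eos_ u_2 _eos_ ... [SEP] response [SEP]
# with segment id 0 for the context side and 1 for the response, truncating from the
# left to 256 tokens and zero-padding on the right. Below is a minimal sketch of that
# packing with made-up integer token ids (0 = [PAD], 101 = [CLS], 102 = [SEP],
# 99 = _eos_ are placeholder values, not taken from a real vocabulary):
def pack_example_sketch(utterance_ids, response_ids, max_len=256,
                        cls_id=101, sep_id=102, eos_id=99, pad_id=0):
    tokens_a = []
    for utt in utterance_ids:
        tokens_a += utt + [eos_id]
    tokens_a = [cls_id] + tokens_a + [sep_id]
    tokens_b = response_ids + [sep_id]
    input_ids = tokens_a + tokens_b
    if len(input_ids) > max_len:                 # keep the most recent context
        input_ids = [cls_id] + input_ids[-max_len + 1:]
    segment_ids = [0] * (len(input_ids) - len(tokens_b)) + [1] * len(tokens_b)
    input_mask = [1] * len(input_ids)
    pad = max_len - len(input_ids)
    return (input_ids + [pad_id] * pad,
            input_mask + [0] * pad,
            segment_ids + [0] * pad)

# e.g. pack_example_sketch([[7, 8], [9]], [11, 12]) returns 256-long lists whose first
# entries are [101, 7, 8, 99, 9, 99, 102, 11, 12, 102, 0, ...] with segment ids
# switching from 0 to 1 at the response.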
class NeuralNetwork(nn.Module):
    def __init__(self, args):
        super(NeuralNetwork, self).__init__()
        self.args = args
        self.patience = 0
        self.init_clip_max_norm = 5.0
        self.optimizer = None
        self.best_result = [0, 0, 0, 0, 0, 0]
        self.metrics = Metrics(self.args.score_file_path)
        self.device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

        config_class, model_class, tokenizer_class = MODEL_CLASSES[args.model_type]
        self.bert_config = config_class.from_pretrained(
            args.config_name if args.config_name else args.model_name_or_path,
            finetuning_task="classification",
            num_labels=1)
        self.bert_tokenizer = BertTokenizer.from_pretrained(
            args.tokenizer_name if args.tokenizer_name else args.model_name_or_path,
            do_lower_case=args.do_lower_case)
        special_tokens_dict = {'eos_token': '[eos]'}
        num_added_toks = self.bert_tokenizer.add_special_tokens(special_tokens_dict)
        self.bert_model = model_class.from_pretrained(
            args.model_name_or_path,
            from_tf=bool('.ckpt' in args.model_name_or_path),
            config=self.bert_config)
        self.bert_model.resize_token_embeddings(len(self.bert_tokenizer))
        self.bert_model = self.bert_model.cuda()

        '''
        self.attn = nn.Linear(300, 300)
        self.rnn1 = nn.GRU(input_size=768, hidden_size=300, num_layers=1,
                           batch_first=True, bidirectional=False)
        self.bilinear = nn.Bilinear(600, 600, 1)
        '''

        # multi-hop matching blocks
        self.transformer_utt = TransformerBlock(input_size=768)
        self.transformer_eu = TransformerBlock(input_size=768)
        self.transformer_ru = TransformerBlock(input_size=768)
        self.transformer_ett = TransformerBlock(input_size=768)
        self.transformer_ue = TransformerBlock(input_size=768)
        self.transformer_re = TransformerBlock(input_size=768)
        self.transformer_rtt = TransformerBlock(input_size=768)
        self.transformer_ur = TransformerBlock(input_size=768)
        self.transformer_er = TransformerBlock(input_size=768)

        self._projection = nn.Sequential(nn.Linear(4 * 768, 200), nn.ReLU())
        self.rnn2 = nn.GRU(input_size=200, hidden_size=200, num_layers=1,
                           batch_first=True, bidirectional=True)
        self._classification = nn.Sequential(nn.Dropout(p=0.2),
                                             nn.Linear(2 * 6 * 200, 200),
                                             nn.Tanh(),
                                             nn.Dropout(p=0.2),
                                             nn.Linear(200, 1))

    def forward(self):
        raise NotImplementedError

    def get_Matching_Map(self, bU_embedding, bE_embedding, bR_embedding, umask, emask, rmask):
        '''
        :param bU_embedding: (batch_size*max_utterances, max_u_words, embedding_dim)
        :param bR_embedding: (batch_size*max_utterances, max_r_words, embedding_dim)
        :return: E: (bsz*max_utterances, max_u_words, max_r_words)
        '''
        # Four matching views: self-attention, cross-attention, their difference,
        # and their element-wise product.
        Hutt = self.transformer_utt(bU_embedding, bU_embedding, bU_embedding)
        Hue = self.transformer_ue(bU_embedding, bE_embedding, bE_embedding)
        Hur = self.transformer_ur(bU_embedding, bR_embedding, bR_embedding)

        Hett = self.transformer_ett(bE_embedding, bE_embedding, bE_embedding)
        Heu = self.transformer_eu(bE_embedding, bU_embedding, bU_embedding)
        Her = self.transformer_er(bE_embedding, bR_embedding, bR_embedding)

        Hrtt = self.transformer_rtt(bR_embedding, bR_embedding, bR_embedding)
        Hru = self.transformer_ru(bR_embedding, bU_embedding, bU_embedding)
        Hre = self.transformer_re(bR_embedding, bE_embedding, bE_embedding)

        # utterance
        ue_input = torch.cat((Hutt, Hue, Hutt - Hue, Hutt * Hue), dim=-1)
        ur_input = torch.cat((Hutt, Hur, Hutt - Hur, Hutt * Hur), dim=-1)
        # evidence
        eu_input = torch.cat((Hett, Heu, Hett - Heu, Hett * Heu), dim=-1)
        er_input = torch.cat((Hett, Her, Hett - Her, Hett * Her), dim=-1)
        # response
        ru_input = torch.cat((Hrtt, Hru, Hrtt - Hru, Hrtt * Hru), dim=-1)
        re_input = torch.cat((Hrtt, Hre, Hrtt - Hre, Hrtt * Hre), dim=-1)

        ue_input = self._projection(ue_input)
        ur_input = self._projection(ur_input)
        eu_input = self._projection(eu_input)
        er_input = self._projection(er_input)
        ru_input = self._projection(ru_input)
        re_input = self._projection(re_input)

        ue_output, _ = self.rnn2(ue_input)
        ur_output, _ = self.rnn2(ur_input)
        eu_output, _ = self.rnn2(eu_input)
        er_output, _ = self.rnn2(er_input)
        ru_output, _ = self.rnn2(ru_input)
        re_output, _ = self.rnn2(re_input)

        '''
        ue_output = ue_output.masked_select(umask)
        ur_output = ur_output.masked_select(umask)
        eu_output = emask
        er_output = emask
        ru_output = rmask
        re_output = rmask
        '''

        maxue, _ = ue_output.max(dim=1)
        maxur, _ = ur_output.max(dim=1)
        maxeu, _ = eu_output.max(dim=1)
        maxer, _ = er_output.max(dim=1)
        maxru, _ = ru_output.max(dim=1)
        maxre, _ = re_output.max(dim=1)

        umask = umask.sum(dim=1, keepdim=True)
        emask = emask.sum(dim=1, keepdim=True)
        rmask = rmask.sum(dim=1, keepdim=True)

        meanue = ue_output.sum(dim=1) / umask
        meanur = ur_output.sum(dim=1) / umask
        meaneu = eu_output.sum(dim=1) / emask
        meaner = er_output.sum(dim=1) / emask
        meanru = ru_output.sum(dim=1) / rmask
        meanre = re_output.sum(dim=1) / rmask

        v = torch.cat([maxue + maxur, meanue + meanur,
                       maxeu + maxer, meaneu + meaner,
                       maxru + maxre, meanru + meanre], dim=1)
        # (bsz*max_utterances, channel, max_u_words, max_r_words)
        logits = self._classification(v)
        return logits.squeeze()

    def batch_att_cal(self, bertoutput, lenidx):
        batchsize = lenidx.shape[0]
        output = torch.zeros(batchsize)
        c_arr = torch.zeros((batchsize, 256, 768), dtype=torch.float32)
        e_arr = torch.zeros((batchsize, 250, 768), dtype=torch.float32)
        r_arr = torch.zeros((batchsize, 150, 768), dtype=torch.float32)
        c_mask = torch.zeros((batchsize, 256), dtype=torch.float32)
        e_mask = torch.zeros((batchsize, 250), dtype=torch.float32)
        r_mask = torch.zeros((batchsize, 150), dtype=torch.float32)
        # context = ho[0:lenidx[:][0]]
        for i in range(batchsize):
            # lenidx[i] holds the boundary indices: [end of context, end of evidence, end of response]
            c_arr[i, :lenidx[i][0] - 1] = bertoutput[i, 1:lenidx[i][0]]
            c_mask[i, :lenidx[i][0] - 1] = 1
            e_arr[i, :lenidx[i][1] - lenidx[i][0]] = bertoutput[i, lenidx[i][0]:lenidx[i][1]]
            e_mask[i, :lenidx[i][1] - lenidx[i][0]] = 1
            r_arr[i, :lenidx[i][2] - lenidx[i][1] - 1] = bertoutput[i, lenidx[i][1] + 1:lenidx[i][2]][:150]
            r_mask[i, :lenidx[i][2] - lenidx[i][1] - 1] = 1
        c_arr, e_arr, r_arr = c_arr.cuda(), e_arr.cuda(), r_arr.cuda()
        c_mask, e_mask, r_mask = c_mask.cuda(), e_mask.cuda(), r_mask.cuda()
        logit = self.get_Matching_Map(c_arr, e_arr, r_arr, c_mask, e_mask, r_mask)
        '''
        hc, c = self.rnn(c_arr)
        _, e = self.rnn(e_arr)
        hr, r = self.rnn(r_arr)
        ceattn = self.forward_attn(hc, e, c_mask)
        crattn = self.forward_attn(hc, r, c_mask)
        cattn = torch.cat([ceattn, crattn], dim=2)
        reattn = self.forward_attn(hr, e, r_mask)
        rcattn = self.forward_attn(hr, r, r_mask)
        rattn = torch.cat([reattn, rcattn], dim=2)
        output = self.bilinear(cattn, rattn)
        '''
        return logit

    def train_step(self, i, data):
        with torch.no_grad():
            batch_ids, batch_mask, batch_seg, batch_y, batch_len = (
                item.cuda(device=self.device) for item in data)
        self.optimizer.zero_grad()
        output, _ = self.bert_model(batch_ids, batch_mask, batch_seg)
        output = self.batch_att_cal(output, batch_len)
        logits = torch.sigmoid(output)
        loss = self.loss_func(logits, target=batch_y)
        loss.backward()
        self.optimizer.step()
        if i % 100 == 0:
            print('Batch[{}] - loss: {:.6f} batch_size:{}'.format(i, loss.item(), batch_y.size(0)))
        return loss

    def fit(self, train, dev, train_evi, dev_evi):  # this is the main entry point
        if torch.cuda.is_available():
            self.cuda()

        dataset = BERTDataset(self.args, train, train_evi, self.bert_tokenizer)
        sampler = RandomSampler(dataset)
        dataloader = DataLoader(dataset, batch_size=self.args.batch_size, sampler=sampler)
        self.loss_func = nn.BCELoss()

        no_decay = ['bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [
            {'params': [p for n, p in self.named_parameters()
                        if not any(nd in n for nd in no_decay)],
             'weight_decay': 0.01},
            {'params': [p for n, p in self.named_parameters()
                        if any(nd in n for nd in no_decay)],
             'weight_decay': 0.0}
        ]
        self.optimizer = AdamW(optimizer_grouped_parameters,
                               lr=self.args.learning_rate,
                               correct_bias=True)  # weight_decay=self.args.l2_reg, correct_bias=False

        for epoch in range(self.args.epochs):
            print("\nEpoch ", epoch + 1, "/", self.args.epochs)
            avg_loss = 0
            self.train()
            for i, data in tqdm(enumerate(dataloader)):  # the original batch size was 200
                # torch.nn.utils.clip_grad_norm_(self.parameters(), 1.0)
                if epoch >= 2 and self.patience >= 3:
                    print("Reload the best model...")
                    self.load_state_dict(torch.load(self.args.save_path))
                    self.adjust_learning_rate()
                    self.patience = 0
                loss = self.train_step(i, data)
                if self.init_clip_max_norm is not None:
                    utils.clip_grad_norm_(self.parameters(), max_norm=self.init_clip_max_norm)
                avg_loss += loss.item()
            cnt = len(train['y']) // self.args.batch_size + 1
            print("Average loss:{:.6f} ".format(avg_loss / cnt))
            self.evaluate(dev, dev_evi)

    def adjust_learning_rate(self, decay_rate=.5):
        for param_group in self.optimizer.param_groups:
            param_group['lr'] = param_group['lr'] * decay_rate
            self.args.learning_rate = param_group['lr']
        print("Decay learning rate to: ", self.args.learning_rate)

    def evaluate(self, dev, dev_evi, is_test=False):
        y_pred = self.predict(dev, dev_evi)
        with open(self.args.score_file_path, 'w') as output:
            for score, label in zip(y_pred, dev['y']):
                output.write(str(score) + '\t' + str(label) + '\n')
        result = self.metrics.evaluate_all_metrics()
        print("Evaluation Result: \n",
              "MAP:", result[0], "\t", "MRR:", result[1], "\t", "P@1:", result[2], "\t",
              "R1:", result[3], "\t", "R2:", result[4], "\t", "R5:", result[5])
        if not is_test and result[3] + result[4] + result[5] > \
                self.best_result[3] + self.best_result[4] + self.best_result[5]:
            print("Best Result: \n",
                  "MAP:", self.best_result[0], "\t", "MRR:", self.best_result[1], "\t",
                  "P@1:", self.best_result[2], "\t", "R1:", self.best_result[3], "\t",
                  "R2:", self.best_result[4], "\t", "R5:", self.best_result[5])
            self.patience = 0
            self.best_result = result
            torch.save(self.state_dict(), self.args.save_path)
            print("save model!!!\n")
        else:
            self.patience += 1

    def predict(self, dev, dev_evi):
        self.eval()
        y_pred = []
        dataset = BERTDataset(self.args, dev, dev_evi, self.bert_tokenizer)
        dataloader = DataLoader(dataset, batch_size=128)
        for i, data in enumerate(dataloader):
            with torch.no_grad():
                batch_ids, batch_mask, batch_seg, batch_y, batch_len = (
                    item.cuda() for item in data)
                output, _ = self.bert_model(batch_ids, batch_mask, batch_seg)
                output = self.batch_att_cal(output, batch_len)
                logits = torch.sigmoid(output)
            if i % 100 == 0:
                print('Batch[{}] batch_size:{}'.format(i, batch_ids.size(0)))
            y_pred += logits.data.cpu().numpy().tolist()
        return y_pred

    def load_model(self, path):
        self.load_state_dict(state_dict=torch.load(path))
        if torch.cuda.is_available():
            self.cuda()
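# batch_att_cal() above relies on per-example boundary indices: lenidx[i] is read as
# [end of context, end of evidence, end of response] within the packed BERT sequence,
# with position 0 being [CLS] and one separator token between evidence and response.
# A toy illustration of that slicing; the sequence length and index values below are
# made up for demonstration and are not taken from the repo:
def lenidx_slicing_sketch():
    hidden = torch.randn(1, 20, 768)        # pretend BERT output for one example
    lenidx = torch.tensor([[6, 11, 16]])    # context ends at 6, evidence at 11, response at 16
    context = hidden[0, 1:lenidx[0][0]]                  # tokens 1..5  (skip [CLS])
    evidence = hidden[0, lenidx[0][0]:lenidx[0][1]]      # tokens 6..10
    response = hidden[0, lenidx[0][1] + 1:lenidx[0][2]]  # tokens 12..15 (skip the separator)
    # -> torch.Size([5, 768]) torch.Size([5, 768]) torch.Size([4, 768])
    print(context.shape, evidence.shape, response.shape)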
class NeuralNetwork(nn.Module):
    def __init__(self):
        super(NeuralNetwork, self).__init__()
        # NOTE: self.args is assumed to be set by the subclass before super().__init__() is called.
        self.patience = 0
        self.init_clip_max_norm = 5.0
        self.optimizer = None
        self.best_result = [0, 0, 0, 0, 0, 0]
        self.metrics = Metrics(self.args.score_file_path)
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.all_preds = []

    def forward(self):
        raise NotImplementedError

    def train_step(self, i, data):
        with torch.no_grad():
            batch_u, batch_r, batch_y = (item.to(self.device) for item in data)
        self.optimizer.zero_grad()
        logits = self.forward(batch_u, batch_r)
        loss = self.loss_func(logits, target=batch_y)
        loss.backward()
        self.optimizer.step()
        print('Batch[{}] - loss: {:.6f} batch_size:{}'.format(i, loss.item(), batch_y.size(0)))
        return loss

    def fit(self, X_train_utterances, X_train_responses, y_train,
            X_dev_utterances, X_dev_responses, y_dev):
        self.to(self.device)
        dataset = DialogueDataset(X_train_utterances, X_train_responses, y_train)
        dataloader = DataLoader(dataset, batch_size=self.args.batch_size, shuffle=True)
        self.loss_func = nn.BCELoss()
        self.optimizer = optim.Adam(self.parameters(), lr=self.args.learning_rate,
                                    weight_decay=self.args.l2_reg)

        for epoch in range(int(self.args.epochs)):
            print("\nEpoch ", epoch + 1, "/", self.args.epochs)
            avg_loss = 0
            self.train()
            for i, data in enumerate(dataloader):
                loss = self.train_step(i, data)
                if i > 0 and i % 500 == 0:
                    self.evaluate(X_dev_utterances, X_dev_responses, y_dev)
                    self.train()
                if epoch >= 2 and self.patience >= 3:
                    print("Reload the best model...")
                    self.load_state_dict(torch.load(self.args.save_path))
                    self.adjust_learning_rate()
                    self.patience = 0
                if self.init_clip_max_norm is not None:
                    utils.clip_grad_norm_(self.parameters(), max_norm=self.init_clip_max_norm)
                avg_loss += loss.item()
            cnt = len(y_train) // self.args.batch_size + 1
            print("Average loss:{:.6f} ".format(avg_loss / cnt))
            self.evaluate(X_dev_utterances, X_dev_responses, y_dev)

    def adjust_learning_rate(self, decay_rate=.5):
        for param_group in self.optimizer.param_groups:
            param_group['lr'] = param_group['lr'] * decay_rate
            self.args.learning_rate = param_group['lr']
        print("Decay learning rate to: ", self.args.learning_rate)

    def evaluate(self, X_dev_utterances, X_dev_responses, y_dev, is_test=False):
        y_pred = self.predict(X_dev_utterances, X_dev_responses)
        with open(self.args.score_file_path, 'w') as output:
            for score, label in zip(y_pred, y_dev):
                output.write(str(score) + '\t' + str(label) + '\n')
        result = self.metrics.evaluate_all_metrics()
        print("Evaluation Result: \n",
              "MAP:", result[0], "\t", "MRR:", result[1], "\t", "P@1:", result[2], "\t",
              "R1:", result[3], "\t", "R2:", result[4], "\t", "R5:", result[5])

        args = vars(self.args)
        if not os.path.isdir(args['output_predictions_folder']):
            os.makedirs(args['output_predictions_folder'])
        with open(os.path.join(args['output_predictions_folder'], 'config.json'), 'w') as f:
            args['ranker'] = "MSN"
            args['seed'] = str(args['seed'])
            args_dict = {}
            args_dict['args'] = args
            f.write(json.dumps(args_dict, indent=4, sort_keys=True))
        df = pd.DataFrame(self.all_preds,
                          columns=['prediction_' + str(i) for i in range(len(self.all_preds[0]))])
        df.to_csv(args['output_predictions_folder'] + "/predictions.csv", index=False)

        if not is_test and result[3] + result[4] + result[5] > \
                self.best_result[3] + self.best_result[4] + self.best_result[5]:
            print("Best Result: \n",
                  "MAP:", self.best_result[0], "\t", "MRR:", self.best_result[1], "\t",
                  "P@1:", self.best_result[2], "\t", "R1:", self.best_result[3], "\t",
                  "R2:", self.best_result[4], "\t", "R5:", self.best_result[5])
            self.patience = 0
            self.best_result = result
            torch.save(self.state_dict(), self.args.save_path)
            print("save model!!!\n")
        else:
            self.patience += 1

    def predict(self, X_dev_utterances, X_dev_responses):
        self.eval()
        y_pred = []
        dataset = DialogueDataset(X_dev_utterances, X_dev_responses)
        dataloader = DataLoader(dataset, batch_size=51)
        for i, data in enumerate(dataloader):
            with torch.no_grad():
                batch_u, batch_r = (item.to(self.device) for item in data)
                logits = self.forward(batch_u, batch_r)
            self.all_preds.append(logits.data.cpu().numpy().tolist())
            y_pred += logits.data.cpu().numpy().tolist()
        return y_pred

    def load_model(self, path):
        self.load_state_dict(state_dict=torch.load(path))
        # if torch.cuda.is_available():
        #     self.cuda()
        self.to(self.device)
class NeuralNetwork(nn.Module):
    def __init__(self, args):
        super(NeuralNetwork, self).__init__()
        self.args = args
        self.patience = 0
        self.init_clip_max_norm = 5.0
        self.optimizer = None
        self.best_result = [0, 0, 0, 0, 0, 0]
        self.metrics = Metrics(self.args.score_file_path)
        self.device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

        config_class, model_class, tokenizer_class = MODEL_CLASSES[args.model_type]
        self.bert_config = config_class.from_pretrained(
            args.config_name if args.config_name else args.model_name_or_path,
            finetuning_task="classification",
            num_labels=1)
        self.bert_tokenizer = BertTokenizer.from_pretrained(
            args.tokenizer_name if args.tokenizer_name else args.model_name_or_path,
            do_lower_case=args.do_lower_case)
        special_tokens_dict = {'eos_token': '[eos]',
                               'additional_special_tokens': ['[soe]', '[eoe]']}
        num_added_toks = self.bert_tokenizer.add_special_tokens(special_tokens_dict)
        self.bert_model = model_class.from_pretrained(
            args.model_name_or_path,
            from_tf=bool('.ckpt' in args.model_name_or_path),
            config=self.bert_config)
        self.bert_model.resize_token_embeddings(len(self.bert_tokenizer))
        self.bert_model = self.bert_model.cuda()

        self.attn = nn.Linear(768, 768)
        self.rnn = nn.GRU(input_size=768, hidden_size=200, num_layers=1,
                          batch_first=True, bidirectional=False)
        self.bilinear = nn.Bilinear(768, 768, 1)

    def forward(self):
        raise NotImplementedError

    def forward_attn(self, x1, x2):
        """
        Attention over x1 conditioned on the mean of x2.
        x1: (T, D), x2: (T, D) -> returns a (1, D) weighted summary of x1.
        """
        max_len = x1.size(0)  # T
        x2 = x2.mean(dim=0)
        attn = self.attn(x1)                       # (T, D)
        attn_energies = attn.mm(x2.unsqueeze(1))   # (T, D) x (D, 1) -> (T, 1)
        alpha = F.softmax(attn_energies, dim=0)    # (T, 1)
        alpha = alpha.transpose(0, 1)              # (1, T)
        weighted_attn = alpha.mm(x1)               # (1, T) x (T, D) -> (1, D)
        return weighted_attn

    def batch_att_cal(self, bertoutput, lenidx):
        # hid_out, _ = self.rnn(bertoutput)
        batchsize = lenidx.shape[0]
        output = torch.zeros(batchsize)
        # context = ho[0:lenidx[:][0]]
        for i in range(batchsize):
            # context_evidence = bertoutput[i, 1:lenidx[i][1]]
            context_evidence = torch.cat(
                (bertoutput[i, 1:lenidx[i][0]],
                 bertoutput[i, lenidx[i][0] + 1:lenidx[i][1]]), dim=0)
            response = bertoutput[i, lenidx[i][1] + 1:lenidx[i][2]]
            ceattn = self.forward_attn(context_evidence, response)
            rattn = self.forward_attn(response, context_evidence)
            output[i] = self.bilinear(ceattn, rattn)
            # if torch.isnan(output[i]) == True:
            #     print("nan")
        return output.cuda()

    def train_step(self, i, data):
        with torch.no_grad():
            batch_ids, batch_mask, batch_seg, batch_y, batch_len = (
                item.cuda(device=self.device) for item in data)
        self.optimizer.zero_grad()
        output, _ = self.bert_model(batch_ids, batch_mask, batch_seg)
        output = self.batch_att_cal(output, batch_len)
        logits = torch.sigmoid(output)
        loss = self.loss_func(logits, target=batch_y)
        loss.backward()
        self.optimizer.step()
        if i % 100 == 0:
            print('Batch[{}] - loss: {:.6f} batch_size:{}'.format(i, loss.item(), batch_y.size(0)))
        return loss

    def fit(self, train, dev, train_evi, dev_evi):  # this is the main entry point
        if torch.cuda.is_available():
            self.cuda()

        dataset = BERTDataset(self.args, train, train_evi, self.bert_tokenizer)
        sampler = RandomSampler(dataset)
        dataloader = DataLoader(dataset, batch_size=self.args.batch_size, sampler=sampler)
        self.loss_func = nn.BCELoss()

        no_decay = ['bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [
            {'params': [p for n, p in self.named_parameters()
                        if not any(nd in n for nd in no_decay)],
             'weight_decay': 0.01},
            {'params': [p for n, p in self.named_parameters()
                        if any(nd in n for nd in no_decay)],
             'weight_decay': 0.0}
        ]
        self.optimizer = AdamW(optimizer_grouped_parameters,
                               lr=self.args.learning_rate,
                               correct_bias=True)  # weight_decay=self.args.l2_reg, correct_bias=False

        for epoch in range(self.args.epochs):
            print("\nEpoch ", epoch + 1, "/", self.args.epochs)
            avg_loss = 0
            self.train()
            for i, data in tqdm(enumerate(dataloader)):  # the original batch size was 200
                # torch.nn.utils.clip_grad_norm_(self.parameters(), 1.0)
                if epoch >= 2 and self.patience >= 3:
                    print("Reload the best model...")
                    self.load_state_dict(torch.load(self.args.save_path))
                    self.adjust_learning_rate()
                    self.patience = 0
                loss = self.train_step(i, data)
                if self.init_clip_max_norm is not None:
                    utils.clip_grad_norm_(self.parameters(), max_norm=self.init_clip_max_norm)
                avg_loss += loss.item()
            cnt = len(train['y']) // self.args.batch_size + 1
            print("Average loss:{:.6f} ".format(avg_loss / cnt))
            self.evaluate(dev, dev_evi)

    def adjust_learning_rate(self, decay_rate=.5):
        for param_group in self.optimizer.param_groups:
            param_group['lr'] = param_group['lr'] * decay_rate
            self.args.learning_rate = param_group['lr']
        print("Decay learning rate to: ", self.args.learning_rate)

    def evaluate(self, dev, dev_evi, is_test=False):
        y_pred = self.predict(dev, dev_evi)
        with open(self.args.score_file_path, 'w') as output:
            for score, label in zip(y_pred, dev['y']):
                output.write(str(score) + '\t' + str(label) + '\n')
        result = self.metrics.evaluate_all_metrics()
        print("Evaluation Result: \n",
              "MAP:", result[0], "\t", "MRR:", result[1], "\t", "P@1:", result[2], "\t",
              "R1:", result[3], "\t", "R2:", result[4], "\t", "R5:", result[5])
        if not is_test and result[3] + result[4] + result[5] > \
                self.best_result[3] + self.best_result[4] + self.best_result[5]:
            print("Best Result: \n",
                  "MAP:", self.best_result[0], "\t", "MRR:", self.best_result[1], "\t",
                  "P@1:", self.best_result[2], "\t", "R1:", self.best_result[3], "\t",
                  "R2:", self.best_result[4], "\t", "R5:", self.best_result[5])
            self.patience = 0
            self.best_result = result
            torch.save(self.state_dict(), self.args.save_path)
            print("save model!!!\n")
        else:
            self.patience += 1

    def predict(self, dev, dev_evi):
        self.eval()
        y_pred = []
        dataset = BERTDataset(self.args, dev, dev_evi, self.bert_tokenizer)
        dataloader = DataLoader(dataset, batch_size=128)
        for i, data in enumerate(dataloader):
            with torch.no_grad():
                batch_ids, batch_mask, batch_seg, batch_y, batch_len = (
                    item.cuda() for item in data)
                output, _ = self.bert_model(batch_ids, batch_mask, batch_seg)
                output = self.batch_att_cal(output, batch_len)
                # for out in torch.isnan(output):
                #     if out == True:
                #         print(out)
                logits = torch.sigmoid(output)
            if i % 100 == 0:
                print('Batch[{}] batch_size:{}'.format(i, batch_ids.size(0)))
            y_pred += logits.data.cpu().numpy().tolist()
        return y_pred

    def load_model(self, path):
        self.load_state_dict(state_dict=torch.load(path))
        if torch.cuda.is_available():
            self.cuda()
class NeuralNetwork(nn.Module):
    def __init__(self):
        super(NeuralNetwork, self).__init__()
        # NOTE: self.args (and self.logger) are assumed to be set by the subclass
        # before super().__init__() is called.
        self.patience = 0
        self.init_clip_max_norm = 5.0
        self.optimizer = None
        self.best_result = [0, 0, 0]
        self.metrics = Metrics(self.args.score_file_path)
        self.device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

    def forward(self):
        raise NotImplementedError

    def train_step(self, data):
        with torch.no_grad():
            batch_u, batch_r, batch_p, batch_y = (item.cuda(device=self.device) for item in data)
        self.optimizer.zero_grad()
        logits = self.forward(batch_u, batch_r, batch_p)
        loss = self.loss_func(logits, target=batch_y)
        loss.backward()
        self.optimizer.step()
        # print('Batch[{}] - loss: {:.6f} batch_size:{}'.format(i, loss.item(), batch_y.size(0)))
        return loss, batch_y.size(0)

    def fit(self, X_train_utterances, X_train_responses, X_train_personas, y_train,
            X_dev_utterances, X_dev_responses, X_dev_personas, y_dev):
        if torch.cuda.is_available():
            self.cuda()
        dataset = Dataset(X_train_utterances, X_train_responses, X_train_personas, y_train)
        dataloader = DataLoader(dataset, batch_size=self.args.batch_size, shuffle=True)
        self.loss_func = nn.CrossEntropyLoss()
        self.optimizer = optim.AdamW(self.parameters(), lr=self.args.learning_rate)

        for epoch in range(self.args.epochs):
            self.epoch = epoch
            print("\nEpoch ", epoch + 1, "/", self.args.epochs)
            avg_loss = 0
            self.train()
            with tqdm(total=len(y_train), ncols=90) as pbar:
                for i, data in enumerate(dataloader):
                    loss, batch_size = self.train_step(data)
                    pbar.set_postfix(lr=self.args.learning_rate, loss=loss.item())
                    if i > 0 and i % 500 == 0:
                        self.evaluate(X_dev_utterances, X_dev_responses, X_dev_personas, y_dev)
                        self.train()
                    if epoch >= 1 and self.patience >= 3:
                        # tqdm.write("Reload the best model...")
                        self.load_state_dict(torch.load(self.args.save_path))
                        self.adjust_learning_rate()
                        self.patience = 0
                    if self.init_clip_max_norm is not None:
                        utils.clip_grad_norm_(self.parameters(), max_norm=self.init_clip_max_norm)
                    pbar.update(batch_size)
                    avg_loss += loss.item()
            cnt = len(y_train) // self.args.batch_size + 1
            tqdm.write("Average loss:{:.6f} ".format(avg_loss / cnt))
            self.evaluate(X_dev_utterances, X_dev_responses, X_dev_personas, y_dev)
            tqdm.write("Best Result: R@1: %.3f R@2: %.3f R@5: %.3f" %
                       (self.best_result[0], self.best_result[1], self.best_result[2]))

    def adjust_learning_rate(self, decay_rate=0.5):
        for param_group in self.optimizer.param_groups:
            param_group['lr'] = param_group['lr'] * decay_rate
            self.args.learning_rate = param_group['lr']
        # tqdm.write("Decay learning rate to: " + str(self.args.learning_rate))

    def evaluate(self, X_dev_utterances, X_dev_responses, X_dev_personas, y_dev, is_test=False):
        y_pred = self.predict(X_dev_utterances, X_dev_responses, X_dev_personas)
        y_dev_one_hot = np.zeros((len(y_dev), 20), dtype=np.int64)
        for i in range(len(y_dev)):
            y_dev_one_hot[i][y_dev[i]] = 1
        y_dev_one_hot = y_dev_one_hot.reshape(-1)
        with open(self.args.score_file_path, 'w') as output:
            for score, label in zip(y_pred, y_dev_one_hot):
                output.write(str(score) + '\t' + str(label) + '\n')
        result = self.metrics.evaluate_all_metrics()
        if not is_test and result[0] + result[1] + result[2] > \
                self.best_result[0] + self.best_result[1] + self.best_result[2]:
            # tqdm.write("save model!!!")
            self.best_result = result
            tqdm.write("Best Result: R@1: %.3f R@2: %.3f R@5: %.3f" %
                       (self.best_result[0], self.best_result[1], self.best_result[2]))
            self.logger.info("Best Result: R@1: %.3f R@2: %.3f R@5: %.3f" %
                             (self.best_result[0], self.best_result[1], self.best_result[2]))
            self.patience = 0
            torch.save(self.state_dict(), self.args.save_path)
        else:
            self.patience += 1
        if is_test:
            print("Evaluation Result: R@1: %.3f R@2: %.3f R@5: %.3f" %
                  (result[0], result[1], result[2]))

    def predict(self, X_dev_utterances, X_dev_responses, X_dev_personas):
        self.eval()
        y_pred = []
        dataset = Dataset(X_dev_utterances, X_dev_responses, X_dev_personas)
        dataloader = DataLoader(dataset, batch_size=self.args.batch_size)
        with torch.no_grad():
            for i, data in enumerate(dataloader):
                batch_u, batch_r, batch_l = (item.cuda() for item in data)
                logits = self.forward(batch_u, batch_r, batch_l)
                y_pred.append(logits.data.cpu().numpy().reshape(-1))
        y_pred = np.concatenate(y_pred, axis=0).tolist()
        return y_pred

    def load_model(self, path):
        self.load_state_dict(state_dict=torch.load(path))
        if torch.cuda.is_available():
            self.cuda()
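# The persona trainer's evaluate() above flattens the 20-way gold labels into the same
# per-candidate layout as the score file: each dev example contributes 20 rows, one per
# candidate, with a 1 on the gold index. A tiny stand-alone illustration; the label
# values and candidate count below are made-up defaults, not taken from the repo:
import numpy as np

def one_hot_flatten_sketch(y_dev=(3, 0), num_candidates=20):
    one_hot = np.zeros((len(y_dev), num_candidates), dtype=np.int64)
    for i, gold in enumerate(y_dev):
        one_hot[i][gold] = 1
    return one_hot.reshape(-1)  # 40 labels, pairing up with 40 candidate scores

# one_hot_flatten_sketch() has 1s at positions 3 and 20 (example 0 gold=3, example 1 gold=0).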
class NeuralNetwork(nn.Module):
    def __init__(self):
        super(NeuralNetwork, self).__init__()
        # NOTE: self.args is assumed to be set by the subclass before super().__init__() is called.
        self.patience = 0
        self.init_clip_max_norm = 10.0
        self.optimizer = None
        self.best_result = [0, 0, 0, 0, 0, 0]
        self.metrics = Metrics(self.args.score_file_path)
        self.device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

    def forward(self):
        raise NotImplementedError

    def train_step(self, i, data):
        with torch.no_grad():
            batch_u, batch_r, batch_key_r, batch_key_mask_r, batch_y = (
                item.cuda(device=self.device) for item in data)
        self.optimizer.zero_grad()
        logits = self.forward(batch_u, batch_r, batch_key_r, batch_key_mask_r)
        loss = self.loss_func(logits, target=batch_y)
        loss.backward()
        self.optimizer.step()
        if i % 100 == 0:
            print('Batch[{}] - loss: {:.6f} batch_size:{}'.format(i, loss.item(), batch_y.size(0)))
        return loss

    def fit(self, X_train_utterances, X_train_responses, y_train,
            X_dev_utterances, X_dev_responses, y_dev,
            key_r, key_mask_r, dev_key_r, dev_key_mask_r):
        if torch.cuda.is_available():
            self.cuda()
        dataset = DialogueDataset(X_train_utterances, X_train_responses, key_r, key_mask_r, y_train)
        dataloader = DataLoader(dataset, batch_size=self.args.batch_size, shuffle=True)
        self.loss_func = nn.BCELoss()
        self.optimizer = optim.Adam(self.parameters(), lr=self.args.learning_rate,
                                    weight_decay=self.args.l2_reg)

        for epoch in range(self.args.epochs):
            print("\nEpoch ", epoch + 1, "/", self.args.epochs)
            avg_loss = 0
            self.train()
            for i, data in enumerate(dataloader):
                if epoch >= 2 and self.patience >= 2:
                    print("Reload the best model...")
                    self.load_state_dict(torch.load(self.args.save_path))
                    self.adjust_learning_rate()
                    self.patience = 0
                loss = self.train_step(i, data)
                if i > 0 and i % 1000000 == 0:
                    self.evaluate(X_dev_utterances, X_dev_responses, dev_key_r, dev_key_mask_r, y_dev)
                    self.train()
                if self.init_clip_max_norm is not None:
                    utils.clip_grad_norm_(self.parameters(), max_norm=self.init_clip_max_norm)
                avg_loss += loss.item()
            cnt = len(y_train) // self.args.batch_size + 1
            print("Average loss:{:.6f} ".format(avg_loss / cnt))
            self.evaluate(X_dev_utterances, X_dev_responses, dev_key_r, dev_key_mask_r, y_dev)

    def adjust_learning_rate(self, decay_rate=.5):
        for param_group in self.optimizer.param_groups:
            param_group['lr'] = param_group['lr'] * decay_rate
            self.args.learning_rate = param_group['lr']
        print("Decay learning rate to: ", self.args.learning_rate)

    def evaluate(self, X_dev_utterances, X_dev_responses, dev_key_r, dev_key_mask_r, y_dev,
                 is_test=False):
        y_pred = self.predict(X_dev_utterances, X_dev_responses, dev_key_r, dev_key_mask_r)
        with open(self.args.score_file_path, 'w') as output:
            for score, label in zip(y_pred, y_dev):
                output.write(str(score) + '\t' + str(label) + '\n')
        result = self.metrics.evaluate_all_metrics()
        print("Evaluation Result: \n",
              "MAP:", result[0], "\t", "MRR:", result[1], "\t", "P@1:", result[2], "\t",
              "R1:", result[3], "\t", "R2:", result[4], "\t", "R5:", result[5])
        if not is_test and result[3] + result[4] + result[5] > \
                self.best_result[3] + self.best_result[4] + self.best_result[5]:
            print("Best Result: \n",
                  "MAP:", self.best_result[0], "\t", "MRR:", self.best_result[1], "\t",
                  "P@1:", self.best_result[2], "\t", "R1:", self.best_result[3], "\t",
                  "R2:", self.best_result[4], "\t", "R5:", self.best_result[5])
            self.patience = 0
            self.best_result = result
            torch.save(self.state_dict(), self.args.save_path)
            print("save model!!!\n")
        else:
            self.patience += 1

    def predict(self, X_dev_utterances, X_dev_responses, dev_key_r, dev_key_mask_r):
        self.eval()
        y_pred = []
        dataset = DialogueDataset(X_dev_utterances, X_dev_responses, dev_key_r, dev_key_mask_r)
        dataloader = DataLoader(dataset, batch_size=400)
        for i, data in enumerate(dataloader):
            with torch.no_grad():
                batch_u, batch_r, batch_key_r, batch_key_masked_r = (item.cuda() for item in data)
                logits = self.forward(batch_u, batch_r, batch_key_r, batch_key_masked_r)
            y_pred += logits.data.cpu().numpy().tolist()
        return y_pred

    def load_model(self, path):
        self.load_state_dict(state_dict=torch.load(path))
        if torch.cuda.is_available():
            self.cuda()
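# All of the trainers above share the same early-stopping schedule: evaluate on the dev
# set, keep a patience counter, and once patience runs out reload the best checkpoint
# and decay the learning rate before continuing. Below is a condensed, self-contained
# sketch of that pattern with stub components; the tiny model, random data, dev_score()
# and the thresholds are placeholders for illustration, not part of this repo.
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

def early_stopping_sketch(num_epochs=5, save_path='best_sketch.pt'):
    model = nn.Linear(4, 1)
    optimizer = optim.Adam(model.parameters(), lr=1e-3)
    best_score, patience = float('-inf'), 0

    def dev_score():
        # stand-in for evaluate(): a scalar metric on held-out data
        with torch.no_grad():
            return -F.mse_loss(model(torch.ones(8, 4)), torch.zeros(8, 1)).item()

    for epoch in range(num_epochs):
        for _ in range(10):  # training steps
            optimizer.zero_grad()
            loss = F.mse_loss(model(torch.randn(8, 4)), torch.zeros(8, 1))
            loss.backward()
            optimizer.step()
            if epoch >= 2 and patience >= 3:                  # thresholds vary per trainer above
                model.load_state_dict(torch.load(save_path))  # reload the best checkpoint
                for group in optimizer.param_groups:
                    group['lr'] *= 0.5                        # decay the learning rate
                patience = 0
        score = dev_score()
        if score > best_score:
            best_score, patience = score, 0
            torch.save(model.state_dict(), save_path)         # save the new best checkpoint
        else:
            patience += 1
    return best_score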