def act(self):
    reply = {}
    if self.args.eval_type == "hits@1" and len(self.candidates) > 0:
        instances = defaultdict(list)
        for candidate, _ in self.candidates:
            instance, _ = build_input_from_segments(self.persona, self.history,
                                                    candidate, self.tokenizer)
            for input_name, input_array in instance.items():
                instances[input_name].append(input_array)
        inputs = pad_dataset(instances, padding=self.special_tokens_ids[-1])
        tensor_inputs = {}
        for input_name in ["input_ids", "mc_token_ids", "token_type_ids"]:
            tensor = torch.tensor(inputs[input_name], device=self.args.device)
            tensor = tensor.view((-1, len(self.candidates)) + tensor.shape[1:])
            tensor_inputs[input_name] = tensor
        # Rank all candidates by their multiple-choice logits
        with torch.no_grad():
            _, mc_logits = self.model_checkpoint(**tensor_inputs)
        val, ind = torch.sort(mc_logits[0], descending=True)
        ypred = self.candidates[ind[0].item()][1]  # best match
        tc = []
        for j in range(len(self.candidates)):
            tc.append(self.candidates[ind[j].item()][1])
        reply = {'text': ypred, 'text_candidates': tc}
    else:
        # We are in interactive or f1 evaluation mode => just sample
        with torch.no_grad():
            out_ids = sample_sequence(self.persona, self.history, self.tokenizer,
                                      self.model_checkpoint, self.args)  # YW: TODO: out_ids, _?
        out_text = self.tokenizer.decode(
            out_ids, skip_special_tokens=True,
            clean_up_tokenization_spaces=(self.args.eval_type != 'f1'))
        # print('out_text:', out_text)
        reply = {'text': out_text}
    return reply
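# For reference, a minimal sketch of the padding step relied on above: pad every
# sequence in each batched field to the length of the longest one, using the last
# special-token id as the pad value. This is an assumption about pad_dataset's
# behavior, not the project's actual implementation; the field list is illustrative.
def pad_dataset_sketch(dataset, padding=0):
    max_len = max(len(x) for x in dataset["input_ids"])
    for name in ["input_ids", "token_type_ids"]:
        dataset[name] = [x + [padding] * (max_len - len(x)) for x in dataset[name]]
    return dataset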
def act(self):
    reply = {}
    if self.args.eval_type == "hits@1" and len(self.candidates) > 0:
        instances = defaultdict(list)
        for candidate, _ in self.candidates:
            instance = build_input_from_segments(self.persona, self.history,
                                                 candidate, self.tokenizer)
            for input_name, input_array in instance.items():
                instances[input_name].append(input_array)
        inputs = pad_dataset(instances, padding=self.special_tokens_ids[-1])
        tensor_inputs = {}
        for input_name in ["input_ids", "mc_token_ids", "token_type_ids"]:
            tensor = torch.tensor(inputs[input_name], device=self.args.device)
            tensor = tensor.view((-1, len(self.candidates)) + tensor.shape[1:])
            tensor_inputs[input_name] = tensor
        with torch.no_grad():
            mc_logits = self.model_checkpoint(**tensor_inputs)[1]
        val, ind = torch.sort(mc_logits[0], descending=True)
        ypred = self.candidates[ind[0].item()][1]  # best match
        tc = []
        for j in range(len(self.candidates)):
            tc.append(self.candidates[ind[j].item()][1])
        reply = {'text': ypred, 'text_candidates': tc}
    elif self.args.eval_type == 'f1':
        # We are in f1 evaluation mode => just sample
        with torch.no_grad():
            out_ids = sample_sequence(self.persona, self.history, self.history_wd,
                                      self.tokenizer, self.model_checkpoint, self.args)
        out_text = self.tokenizer.decode(
            out_ids, skip_special_tokens=True,
            clean_up_tokenization_spaces=(self.args.eval_type != 'f1'))
        reply = {'text': out_text}

        # Integrate the LSM metrics during sampling with the f1 eval type
        # (reduces the number of separate evaluation runs). For every response
        # to a given dialogue history we compute an LSM score, append it to a
        # running list, and average at the end. The model response and the label
        # response are scored separately, yielding two LSM scores.
        response = reply['text']
        dialogue_hist = self.history_wd
        table = str.maketrans(dict.fromkeys(string.punctuation))
        # Split the history into a speaker 2 part and a speaker 1 (chatbot) part
        speaker2_hist_string, speaker1_chatbot_hist_list = split_history(dialogue_hist, self.tokenizer)
        # Use deepcopy so the two chatbot histories do not alias each other
        label_c_hist = deepcopy(speaker1_chatbot_hist_list)
        # Add the model-generated response to the chatbot history
        speaker1_chatbot_hist_list.append(response)
        # Add the gold label to the label history
        label_c_hist.append(self.tokenizer.decode(self.labels))
        # Convert both histories to lowercased, punctuation-stripped strings
        pred_c_hist_string = (' ' + ' '.join(speaker1_chatbot_hist_list).lower() + ' ').replace("'", ' ').translate(table)
        label_c_hist_string = (' ' + ' '.join(label_c_hist).lower() + ' ').replace("'", ' ').translate(table)
        # Each call yields two vectors with the function-word usage percentage
        # per category; scoring both the prediction and the label gives the two
        # LSM scores at the end of evaluation.
        _, p1_model, p2_model = calc_fw_perc_diffs(self.d, pred_c_hist_string, speaker2_hist_string)
        _, p1_human, p2_human = calc_fw_perc_diffs(self.d, label_c_hist_string, speaker2_hist_string)
        # LSM score for the model response and for the label response
        LSMs_model = torch.tensor([1 - (abs(p1 - p2_model[i]) / (p1 + p2_model[i] + 1e-8))
                                   for i, p1 in enumerate(p1_model)]).cuda()
        LSMs_human = torch.tensor([1 - (abs(p1 - p2_human[i]) / (p1 + p2_human[i] + 1e-8))
                                   for i, p1 in enumerate(p1_human)]).cuda()
        lsm_model_list.append(torch.mean(LSMs_model))
        lsm_human_list.append(torch.mean(LSMs_human))
    else:
        # We are in interactive mode => just sample
        with torch.no_grad():
            out_ids = sample_sequence(self.persona, self.history, self.history_wd,
                                      self.tokenizer, self.model_checkpoint, self.args)
        out_text = self.tokenizer.decode(
            out_ids, skip_special_tokens=True,
            clean_up_tokenization_spaces=(self.args.eval_type != 'f1'))
        reply = {'text': out_text}
    return reply
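# A self-contained sketch of the per-category LSM computation used above: given
# two vectors of function-word usage percentages (one per speaker), each category
# scores 1 - |p1 - p2| / (p1 + p2 + eps), and the scores are averaged. The
# function name is illustrative; the epsilon mirrors the 1e-8 guard above.
import torch

def lsm_score_sketch(p_speaker1, p_speaker2, eps=1e-8):
    p1 = torch.tensor(p_speaker1, dtype=torch.float)
    p2 = torch.tensor(p_speaker2, dtype=torch.float)
    per_category = 1 - (p1 - p2).abs() / (p1 + p2 + eps)
    return per_category.mean().item()

# e.g. lsm_score_sketch([0.12, 0.05], [0.10, 0.07]) ≈ 0.87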
def act(self):
    reply = {}
    if self.args.eval_type == "hits@1" and len(self.candidates) > 0:
        instances = defaultdict(list)
        for candidate, _ in self.candidates:
            instance, _ = build_input_from_segments(self.persona, self.history,
                                                    candidate, self.tokenizer)
            for input_name, input_array in instance.items():
                instances[input_name].append(input_array)
        inputs = pad_dataset(instances, padding=self.special_tokens_ids[-1])
        tensor_inputs = {}
        for input_name in ["input_ids", "mc_token_ids", "token_type_ids"]:
            tensor = torch.tensor(inputs[input_name], device=self.args.device)
            tensor = tensor.view((-1, len(self.candidates)) + tensor.shape[1:])
            tensor_inputs[input_name] = tensor
        with torch.no_grad():
            _, mc_logits = self.model_checkpoint(**tensor_inputs)
        val, ind = torch.sort(mc_logits[0], descending=True)
        ypred = self.candidates[ind[0].item()][1]  # best match
        tc = []
        for j in range(len(self.candidates)):
            tc.append(self.candidates[ind[j].item()][1])
        reply = {'text': ypred, 'text_candidates': tc}
    else:
        # We are in interactive or f1 evaluation mode => just sample
        with torch.no_grad():
            out_ids = sample_sequence(self.persona, self.history, self.tokenizer,
                                      self.model_checkpoint, self.args)  # YW: TODO: out_ids, _?
        # Get the generated response
        out_text = self.tokenizer.decode(
            out_ids, skip_special_tokens=True,
            clean_up_tokenization_spaces=(self.args.eval_type != 'f1'))
        out_text_org = out_text
        out_text = out_text.replace(' \' ', '\'')  # TODO: tbd
        out_text = out_text.replace(' \'', '\'')

        # Persona NLI
        profiles = []
        for profile in self.persona:
            profile_text = self.tokenizer.decode(profile, skip_special_tokens=True,
                                                 clean_up_tokenization_spaces=False)
            profile_text = profile_text.replace(' \' ', '\'')  # TODO: tbd
            profile_text = profile_text.replace(' \'', '\'')
            profiles.append(profile_text)
        nli_score, reward_score, c_score, current_con_en = nli_engine(
            out_text, profiles, self.nli_tokenizer, self.nli_model, eval=True)
        self.nli_scores += nli_score        # persona NLI
        self.reward_scores += reward_score  # reward function
        self.c_scores += c_score            # C score
        self.sample_num += 1
        self.con_en += current_con_en  # whether this persona contains a contradicted/entailed profile (not applied)

        # Internal repetition
        response_tok = out_text_org.split()
        intrep_1gram = intrep_frac(response_tok)
        # If 2-gram or 3-gram repetition is to be used:
        '''
        # intrep_2gram
        response_tok_2gram = get_ngrams(out_text, 2)
        intrep_2gram = intrep_frac(response_tok_2gram)
        # intrep_3gram
        response_tok_3gram = get_ngrams(out_text, 3)
        intrep_3gram = intrep_frac(response_tok_3gram)
        '''
        intern_rep_reward = intrep_1gram
        self.intrep_scores += intern_rep_reward

        # BLEU against the gold label
        label_text = self.tokenizer.decode(self.labels, skip_special_tokens=True,
                                           clean_up_tokenization_spaces=False)
        current_bleu = bleu_rewarder(out_text_org, label_text)
        self.bleu_scores += current_bleu

        # Perplexity under a fine-tuned GPT-based language model
        lm_tokenize_input = self.lm_tokenizer.tokenize(out_text)
        # lm_tensor_input = torch.tensor([lm_tokenizer.convert_tokens_to_ids(lm_tokenize_input)]).to(args.device)
        lm_tensor_input = torch.tensor(
            [[self.special_tokens_ids[0]]
             + self.lm_tokenizer.convert_tokens_to_ids(lm_tokenize_input)
             + [self.special_tokens_ids[-1]]]).to(self.args.device)
        lm_loss = self.lm_model(lm_tensor_input, lm_labels=lm_tensor_input)
        lm_ppl = math.exp(lm_loss.item())
        self.lm_ppl_scores += lm_ppl

        print('out_text:', out_text)
        print('current nli:', self.nli_scores)
        print('current score:', self.reward_scores / self.sample_num)
        print('current c_score_macro:', self.c_scores / self.sample_num)
        current_c_score_micro = (self.nli_scores[1] - self.nli_scores[0]) / sum(self.nli_scores)

        # C_new calculation: sign of (entailed - contradicted) at the persona level
        cn_res = nli_score[1] - nli_score[0]  # cn: C_new
        if cn_res > 0:
            current_cn = 1
        elif cn_res < 0:
            current_cn = -1
        else:
            current_cn = 0
        self.cnm += current_cn
        print('current c_new:', self.cnm / self.sample_num)
        print('current c_score_micro:', current_c_score_micro)
        print('current con_en:', self.con_en)
        print('current intrep score:', self.intrep_scores / self.sample_num)
        print('current BLEU:', self.bleu_scores / self.sample_num)
        print('current PPL:', self.lm_ppl_scores / self.sample_num)
        reply = {'text': out_text}
    return reply
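# A sketch of the internal-repetition fraction assumed by intrep_frac above: the
# fraction of tokens (or n-grams) in a response that repeat an earlier occurrence.
# This is an assumption about the helper's semantics, not its actual code.
def intrep_frac_sketch(tokens):
    if not tokens:
        return 0.0
    return 1.0 - len(set(tokens)) / len(tokens)

# e.g. intrep_frac_sketch("i like like cats".split()) == 0.25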