Example #1
    def act(self):
        reply = {}

        if self.args.eval_type == "hits@1" and len(self.candidates) > 0:
            instances = defaultdict(list)
            for candidate, _ in self.candidates:
                instance, _ = build_input_from_segments(
                    self.persona, self.history, candidate, self.tokenizer)
                for input_name, input_array in instance.items():
                    instances[input_name].append(input_array)

            inputs = pad_dataset(instances,
                                 padding=self.special_tokens_ids[-1])

            tensor_inputs = {}
            for input_name in ["input_ids", "mc_token_ids", "token_type_ids"]:
                tensor = torch.tensor(inputs[input_name],
                                      device=self.args.device)
                tensor = tensor.view((-1, len(self.candidates)) +
                                     tensor.shape[1:])
                tensor_inputs[input_name] = tensor

            with torch.no_grad():
                _, mc_logits = self.model_checkpoint(**tensor_inputs)

            val, ind = torch.sort(mc_logits[0], descending=True)

            ypred = self.candidates[ind[0].item()][1]  # match
            tc = []
            for j in range(len(self.candidates)):
                tc.append(self.candidates[ind[j].item()][1])
            reply = {'text': ypred, 'text_candidates': tc}
        else:
            # We are in interactive or f1 evaluation mode => just sample
            with torch.no_grad():
                out_ids = sample_sequence(self.persona, self.history,
                                          self.tokenizer,
                                          self.model_checkpoint,
                                          self.args)  # YW: TODO: out_ids, _?
            out_text = self.tokenizer.decode(
                out_ids,
                skip_special_tokens=True,
                clean_up_tokenization_spaces=(self.args.eval_type != 'f1'))
            # print('out_text:', out_text)
            reply = {'text': out_text}

        return reply
    def act(self):
        reply = {}

        if self.args.eval_type == "hits@1" and len(self.candidates) > 0:
            instances = defaultdict(list)
            for candidate, _ in self.candidates:
                instance = build_input_from_segments(self.persona, self.history, candidate, self.tokenizer)
                for input_name, input_array in instance.items():
                    instances[input_name].append(input_array)

            inputs = pad_dataset(instances, padding=self.special_tokens_ids[-1])

            tensor_inputs = {}
            for input_name in ["input_ids", "mc_token_ids", "token_type_ids"]:
                tensor = torch.tensor(inputs[input_name], device=self.args.device)
                tensor = tensor.view((-1, len(self.candidates)) + tensor.shape[1:])
                tensor_inputs[input_name] = tensor

            with torch.no_grad():
                mc_logits = self.model_checkpoint(**tensor_inputs)[1]

            val, ind = torch.sort(mc_logits[0], descending=True)

            ypred = self.candidates[ind[0].item()][1] # match
            tc = []
            for j in range(len(self.candidates)):
                tc.append(self.candidates[ind[j].item()][1])
            reply = {'text': ypred, 'text_candidates': tc}


        elif self.args.eval_type == 'f1':
            # We are in f1 evaluation mode => just sample
            with torch.no_grad():
                out_ids = sample_sequence(self.persona, self.history, self.history_wd, self.tokenizer, self.model_checkpoint, self.args)
            out_text = self.tokenizer.decode(out_ids, skip_special_tokens=True,
                                             clean_up_tokenization_spaces=(self.args.eval_type != 'f1'))
            reply = {'text': out_text}

            # Integrate the LSM metric into sampling with the F1 eval type
            # (reduces the number of separate validation commands needed).
            # For every response to a given dialogue history we calculate an LSM score.
            # Each score is appended to a running list and averaged at the end.
            # The model response and the label response are scored separately,
            # which yields 2 LSM scores.
            response = reply['text']
            dialogue_hist = self.history_wd

            table = str.maketrans(dict.fromkeys(string.punctuation))

            # split history into history of speaker 2 and speaker 1
            speaker2_hist_string, speaker1_chatbot_hist_list = split_history(dialogue_hist, self.tokenizer)

            # use deepcopy so that appending below goes to two independent lists
            label_c_hist = deepcopy(speaker1_chatbot_hist_list)

            # add response generated by model
            speaker1_chatbot_hist_list.append(response)

            # add label to chatbot hist
            label_c_hist.append(self.tokenizer.decode(self.labels))

            # convert to strings
            pred_c_hist_string = (' ' + ' '.join(speaker1_chatbot_hist_list).lower() + ' ').replace("'", ' ').translate(
                table)
            label_c_hist_string = (' ' + ' '.join(label_c_hist).lower() + ' ').replace("'", ' ').translate(table)

            # results in two vectors containing the function word usage percentage for each category
            # we score both the prediction and the label, so we end up with 2 LSM scores at the end of evaluation
            _, p1_model, p2_model = calc_fw_perc_diffs(self.d, pred_c_hist_string, speaker2_hist_string)
            _, p1_human, p2_human = calc_fw_perc_diffs(self.d, label_c_hist_string, speaker2_hist_string)

            # calculate LSM score for the model response and calculate LSM score for the label response
            LSMs_model = torch.tensor([1 - (abs(p1 - p2_model[i]) / (p1 + p2_model[i] + 1e-8))
                                       for i, p1 in enumerate(p1_model)]).cuda()
            LSMs_human = torch.tensor([1 - (abs(p1 - p2_human[i]) / (p1 + p2_human[i] + 1e-8))
                                       for i, p1 in enumerate(p1_human)]).cuda()

            lsm_model_list.append(torch.mean(LSMs_model))
            lsm_human_list.append(torch.mean(LSMs_human))

        else:
            # We are in interactive evaluation mode => just sample
            with torch.no_grad():
                out_ids = sample_sequence(self.persona, self.history, self.history_wd, self.tokenizer,
                                          self.model_checkpoint, self.args)
            out_text = self.tokenizer.decode(out_ids, skip_special_tokens=True,
                                             clean_up_tokenization_spaces=(self.args.eval_type != 'f1'))
            reply = {'text': out_text}


        return reply
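
The LSM branch above relies on helpers (split_history, calc_fw_perc_diffs) defined elsewhere, but the score itself reduces to a simple per-category comparison. Below is a minimal sketch of that formula, using the same 1e-8 smoothing as above; lsm_score and the sample percentages are illustrative assumptions, not code from the repository.

import torch

def lsm_score(p1, p2, eps=1e-8):
    # p1, p2: function-word usage percentages per category (shape: [num_categories]).
    # Per-category matching: 1 - |p1 - p2| / (p1 + p2 + eps),
    # then averaged over categories, as for LSMs_model / LSMs_human above.
    per_category = 1 - (p1 - p2).abs() / (p1 + p2 + eps)
    return per_category.mean()

# Illustrative usage with made-up percentages for three categories:
speaker1 = torch.tensor([0.12, 0.05, 0.30])
speaker2 = torch.tensor([0.10, 0.07, 0.28])
print(lsm_score(speaker1, speaker2))  # close to 1.0 => strong style matching
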
    def act(self):
        reply = {}

        if self.args.eval_type == "hits@1" and len(self.candidates) > 0:
            instances = defaultdict(list)
            for candidate, _ in self.candidates:
                instance, _ = build_input_from_segments(self.persona, self.history, candidate, self.tokenizer)
                for input_name, input_array in instance.items():
                    instances[input_name].append(input_array)

            inputs = pad_dataset(instances, padding=self.special_tokens_ids[-1])

            tensor_inputs = {}
            for input_name in ["input_ids", "mc_token_ids", "token_type_ids"]:
                tensor = torch.tensor(inputs[input_name], device=self.args.device)
                tensor = tensor.view((-1, len(self.candidates)) + tensor.shape[1:])
                tensor_inputs[input_name] = tensor

            with torch.no_grad():
                _, mc_logits = self.model_checkpoint(**tensor_inputs)

            val, ind = torch.sort(mc_logits[0], descending=True)

            ypred = self.candidates[ind[0].item()][1] # match
            tc = []
            for j in range(len(self.candidates)):
                tc.append(self.candidates[ind[j].item()][1])
            reply = {'text': ypred, 'text_candidates': tc}
        else:
            # We are in interactive or f1 evaluation mode => just sample
            with torch.no_grad():
                out_ids = sample_sequence(self.persona, self.history, self.tokenizer, self.model_checkpoint, self.args)   # YW: TODO: out_ids, _?
            # Get a generated response
            out_text = self.tokenizer.decode(out_ids, skip_special_tokens=True,
                                             clean_up_tokenization_spaces=(self.args.eval_type != 'f1'))
            out_text_org = out_text
            out_text = out_text.replace(' \' ', '\'')   # TODO: tbd
            out_text = out_text.replace(' \'', '\'')
            # persona NLI
            profiles = []
            for profile in self.persona:
                profile_text = self.tokenizer.decode(profile, skip_special_tokens=True, clean_up_tokenization_spaces=False)
                profile_text = profile_text.replace(' \' ', '\'')   # TODO: tbd
                profile_text = profile_text.replace(' \'', '\'')
                profiles.append(profile_text)
            nli_score, reward_score, c_score, current_con_en = nli_engine(out_text, profiles, self.nli_tokenizer, self.nli_model, eval=True)
            self.nli_scores += nli_score   # persona NLI
            self.reward_scores += reward_score   # reward function
            self.c_scores += c_score   # C score
            self.sample_num += 1
            self.con_en += current_con_en   # if this persona contains a contradicted/entail profile or not (not applied)

            # internal repetition
            response_tok = out_text_org.split()
            intrep_1gram = intrep_frac(response_tok)
            # if 2-gram or 3-gram are going to be used:
            '''
            # intrep_2gram
            response_tok_2gram = get_ngrams(out_text, 2)
            intrep_2gram = intrep_frac(response_tok_2gram)
            # intrep_3gram
            response_tok_3gram = get_ngrams(out_text, 3)
            intrep_3gram = intrep_frac(response_tok_3gram)
            '''
            intern_rep_reward = intrep_1gram
            self.intrep_scores += intern_rep_reward

            # bleu
            label_text = self.tokenizer.decode(self.labels, skip_special_tokens=True, clean_up_tokenization_spaces=False)
            current_bleu = bleu_rewarder(out_text_org, label_text)
            self.bleu_scores += current_bleu

            # fine-tuned GPT-based language model
            lm_tokenize_input = self.lm_tokenizer.tokenize(out_text)
            # lm_tensor_input = torch.tensor([lm_tokenizer.convert_tokens_to_ids(lm_tokenize_input)]).to(args.device)
            lm_tensor_input = torch.tensor([[self.special_tokens_ids[0]] + self.lm_tokenizer.convert_tokens_to_ids(lm_tokenize_input) + [self.special_tokens_ids[-1]]]).to(self.args.device)
            lm_loss = self.lm_model(lm_tensor_input, lm_labels=lm_tensor_input)
            lm_ppl = math.exp(lm_loss.item())
            self.lm_ppl_scores += lm_ppl

            print('out_text:', out_text)
            print('current nli:', self.nli_scores)
            print('current score:', self.reward_scores / self.sample_num)
            print('current c_score_macro:', self.c_scores / self.sample_num)
            current_c_score_micro = (self.nli_scores[1] - self.nli_scores[0]) / sum(self.nli_scores)
            cn_res = nli_score[1] - nli_score[0]   # cn: C_new (persona level)
            # C_new calculation
            if cn_res > 0:
                current_cn = 1
            elif cn_res < 0:
                current_cn = -1
            else:
                current_cn = 0
            self.cnm += current_cn
            print('current c_new:', self.cnm / self.sample_num)
            print('current c_score_micro:', current_c_score_micro)
            print('current con_en:', self.con_en)
            print('current intrep score:', self.intrep_scores / self.sample_num)
            print('current BLEU:', self.bleu_scores / self.sample_num)
            print('current PPL:', self.lm_ppl_scores / self.sample_num)
            reply = {'text': out_text}

        return reply
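
intrep_frac, bleu_rewarder, and nli_engine above are assumed to be defined elsewhere in the repository. As an illustration of the internal-repetition metric only, the sketch below shows one plausible reading of intrep_frac (the fraction of tokens that repeat an earlier token in the same response); the actual implementation may differ.

def intrep_frac(tokens):
    # Hypothetical sketch: fraction of items in `tokens` that already occurred
    # earlier in the list, i.e. internal repetition within one response.
    if not tokens:
        return 0.0
    seen = set()
    repeats = 0
    for tok in tokens:
        if tok in seen:
            repeats += 1
        else:
            seen.add(tok)
    return repeats / len(tokens)

# Example: "i like cats and i like dogs" -> 2 of 7 tokens are repeats.
print(intrep_frac("i like cats and i like dogs".split()))  # ~0.286
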