Example #1
def main(argv):
  parser = argparse.ArgumentParser(description='Evaluate the system outputs.')

  parser.add_argument('--dataset', dest='dataset', action='store', metavar='DATASET', choices=['train', 'val', 'test'],
                      default='train', help='The dataset to analyze')
  parser.add_argument('--dataroot', dest='dataroot', action='store', metavar='PATH', default='data_modify/add_stop',
                      help='Will look for corpus in <dataroot>/<dataset>/...')
  parser.add_argument("--knowledge_file", type=str, default="knowledge.json",
                      help="knowledge file name.")

  args = parser.parse_args()

  # data = DatasetWalker(dataroot=args.dataroot, dataset=args.dataset, labels=False)
  knowledge_reader = KnowledgeReader(dataroot=args.dataroot, knowledge_file=args.knowledge_file)
  # beam_size = len(output[0]['beam_outputs'])

  with open(os.path.join(args.dataroot, args.dataset, 'logs.json'), 'r') as f:
    logs = json.load(f)
  with open(os.path.join(args.dataroot, args.dataset, 'labels.json'), 'r') as f:
    labels = json.load(f)

  count_1 = 0
  new_logs = []
  new_labels_pre = []
  new_labels_post = []
  for log, label in zip(logs, labels):
    if label['target']:
      response = label['response']
      ref_text = knowledge_reader.get_doc(**label['knowledge'][0])['doc']['body']
      candidate_text_list = splitSentence(response)
      if len(candidate_text_list) > 1:
        candidate_text_list = [' '.join(candidate_text_list[:i]) for i in range(1, len(candidate_text_list))]
        candidate_text_list_med = [Levenshtein.distance(ref_text, candidate_text) for candidate_text in candidate_text_list]
        candidate_text_after = candidate_text_list[int(np.argmin(candidate_text_list_med))]
        post_txt = response[len(candidate_text_after) + 1:].strip()
        pre_txt = candidate_text_after
        pre_label = label.copy()
        pre_label['response'] = pre_txt
        post_label = label.copy()
        post_label['response'] = post_txt
        new_logs.append(log)
        new_labels_pre.append(pre_label)
        new_labels_post.append(post_label)

  pre_path = os.path.join(args.dataroot, 'pre_response', args.dataset)
  post_path = os.path.join(args.dataroot, 'post_response', args.dataset)
  if not os.path.exists(pre_path):
    os.makedirs(pre_path)
  if not os.path.exists(post_path):
    os.makedirs(post_path)

  with open(os.path.join(pre_path, 'labels.json'), 'w') as fout:
    json.dump(new_labels_pre, fout, indent=2)
  with open(os.path.join(post_path, 'labels.json'), 'w') as fout:
    json.dump(new_labels_post, fout, indent=2)

  with open(os.path.join(pre_path, 'logs.json'), 'w') as fout:
    json.dump(new_logs, fout, indent=2)
  with open(os.path.join(post_path, 'logs.json'), 'w') as fout:
    json.dump(new_logs, fout, indent=2)
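
The pre/post split above is easier to see on a concrete case. Below is a minimal, self-contained sketch of the same idea, assuming splitSentence() simply splits on sentence boundaries; the splitter and the example strings are stand-ins for illustration, not taken from this repository.

import re

import Levenshtein
import numpy as np

def splitSentence(text):
    # naive sentence splitter, used only for this illustration
    return [s for s in re.split(r'(?<=[.!?])\s+', text.strip()) if s]

ref_text = "Yes, the hotel offers free parking for all guests."
response = "Yes, free parking is available for all guests. Anything else I can help with?"

candidates = splitSentence(response)
prefixes = [' '.join(candidates[:i]) for i in range(1, len(candidates))]
distances = [Levenshtein.distance(ref_text, p) for p in prefixes]
pre_txt = prefixes[int(np.argmin(distances))]    # prefix closest to the knowledge snippet
post_txt = response[len(pre_txt) + 1:].strip()   # remaining follow-up part
print(pre_txt)   # Yes, free parking is available for all guests.
print(post_txt)  # Anything else I can help with?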
Example #2
    def __init__(self,
                 args,
                 tokenizer,
                 split_type,
                 labels=True,
                 labels_file=None):
        self.args = args
        self.dataroot = args.dataroot
        self.tokenizer = tokenizer
        self.split_type = split_type

        self.SPECIAL_TOKENS = SPECIAL_TOKENS
        self.SPECIAL_TOKENS_VALUES = SPECIAL_TOKENS_VALUES
        self.bos = self.tokenizer.convert_tokens_to_ids(
            self.SPECIAL_TOKENS["bos_token"])
        self.eos = self.tokenizer.convert_tokens_to_ids(
            self.SPECIAL_TOKENS["eos_token"])
        self.pad = self.tokenizer.convert_tokens_to_ids(
            self.SPECIAL_TOKENS["pad_token"])
        self.speaker1, self.speaker2, self.knowledge_sep, self.knowledge_tag = self.tokenizer.convert_tokens_to_ids(
            self.SPECIAL_TOKENS["additional_special_tokens"])
        self.knowledge_sep_token = self.SPECIAL_TOKENS[
            "additional_special_tokens"][2]

        self.dataset_walker = DatasetWalker(split_type,
                                            labels=labels,
                                            dataroot=self.dataroot,
                                            labels_file=labels_file)
        self.dialogs = self._prepare_conversations()

        self.knowledge_reader = KnowledgeReader(self.dataroot,
                                                args.knowledge_file)
        self.knowledge, self.snippets = self._prepare_knowledge()

        self._create_examples()
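
convert_tokens_to_ids() only yields single, stable ids for these markers if the special tokens were registered with the tokenizer beforehand. Here is a minimal sketch of that setup; the exact token strings and the SPECIAL_TOKENS layout are assumptions in the spirit of the DSTC9 baseline, not taken from this repository.

from transformers import GPT2Tokenizer

SPECIAL_TOKENS = {
    "bos_token": "<bos>",
    "eos_token": "<eos>",
    "pad_token": "<pad>",
    "additional_special_tokens": ["<speaker1>", "<speaker2>",
                                  "<knowledge_sep>", "<knowledge_tag>"],
}

tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
tokenizer.add_special_tokens(SPECIAL_TOKENS)  # each special token gets its own new id

bos = tokenizer.convert_tokens_to_ids(SPECIAL_TOKENS["bos_token"])
speaker1, speaker2, knowledge_sep, knowledge_tag = tokenizer.convert_tokens_to_ids(
    SPECIAL_TOKENS["additional_special_tokens"])

When tokens are added this way, the model's embedding matrix also has to be resized (model.resize_token_embeddings(len(tokenizer))) before the new ids are used.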
Example #3
def read_knowledge():
    knowledge_reader = KnowledgeReader(args.kw_path, 'knowledge.json')
    knowledge = knowledge_reader.knowledge
    knowledge_docs = knowledge_reader.get_doc_list()
    snippets = dict()
    for snippet in knowledge_docs:
        key = "{}__{}__{}".format(snippet["domain"], str(snippet["entity_id"]) or "", snippet["doc_id"])
        knowledge = snippet["doc"]["body"]
        snippets[key] = knowledge
    return snippets
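
For reference, a hypothetical use of read_knowledge() and the key layout it produces (the concrete keys depend on knowledge.json):

snippets = read_knowledge()
# keys have the form "<domain>__<entity_id>__<doc_id>", e.g. "hotel__1__0"
for key, body in list(snippets.items())[:3]:
    print(key, "->", body[:60])

One detail worth noting: if entity_id is ever None, str(None) is the non-empty string "None", so the fallback to an empty string in the key construction never actually triggers; such keys contain the literal text None instead.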
Example #4
    def __init__(self,
                 args,
                 tokenizer,
                 split_type,
                 labels=True,
                 labels_file=None):
        # when training the model, labels == True
        self.args = args
        self.dataroot = args.dataroot
        self.tokenizer = tokenizer
        self.split_type = split_type

        self.SPECIAL_TOKENS = SPECIAL_TOKENS
        self.SPECIAL_TOKENS_VALUES = SPECIAL_TOKENS_VALUES

        self.bos = self.tokenizer.convert_tokens_to_ids(
            self.SPECIAL_TOKENS["bos_token"])
        self.eos = self.tokenizer.convert_tokens_to_ids(
            self.SPECIAL_TOKENS["eos_token"])
        self.pad = self.tokenizer.convert_tokens_to_ids(
            self.SPECIAL_TOKENS["pad_token"])
        self.cls = self.tokenizer.convert_tokens_to_ids(
            self.SPECIAL_TOKENS['cls_token'])
        self.sep = self.tokenizer.convert_tokens_to_ids(
            self.SPECIAL_TOKENS['sep_token'])
        #         self.unk= self.tokenizer.convert_tokens_to_ids(self.SPECIAL_TOKENS['UNK_token'])

        self.speaker1, self.speaker2, self.knowledge_sep, self.knowledge_tag = self.tokenizer.convert_tokens_to_ids(
            self.SPECIAL_TOKENS["additional_special_tokens"])
        self.knowledge_sep_token = self.SPECIAL_TOKENS[
            "additional_special_tokens"][2]

        self.dataset_walker = DatasetWalker(split_type,
                                            labels=labels,
                                            dataroot=self.dataroot,
                                            labels_file=labels_file)
        self.dialogs = self._prepare_conversations(
        )  # get the parsed dialog data from dataset_walker
        # print("dialogs: ",self.dialogs[0])
        '''e.g.
          [{'id': 0, 'log': [{'speaker': 'U', 'text': "I'd really like to take my client out to a nice restaurant that serves indian food."}], 'label': None}, 
          {'id': 1, 'log': [{'speaker': 'U', 'text': "I'd really like to take my client out to a nice restaurant that serves indian food."}, {'speaker': 'S', 'text': 'I show many restaurants that serve Indian food in that price range. What area would you like to travel to?'}, {'speaker': 'U', 'text': 'Indian food is usually vegetarian friendly, right?'}], 'label': None}]
          '''
        self.knowledge_reader = KnowledgeReader(self.dataroot,
                                                args.knowledge_file)
        self.knowledge, self.snippets = self._prepare_knowledge()

        self._create_examples()
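
Besides the GPT-2 style bos/eos/pad tokens, this variant also resolves cls and sep ids, which points to BERT-style sentence-pair inputs. A minimal sketch of how such ids are typically combined for knowledge selection; this helper is an assumption for illustration, not code from this repository.

def build_pair_input(cls_id, sep_id, history_ids, knowledge_ids):
    # [CLS] history [SEP] knowledge [SEP], with segment ids 0 / 1
    input_ids = [cls_id] + history_ids + [sep_id] + knowledge_ids + [sep_id]
    token_type_ids = [0] * (len(history_ids) + 2) + [1] * (len(knowledge_ids) + 1)
    return input_ids, token_type_ids

print(build_pair_input(101, 102, [7592, 999], [2293, 2017]))
# ([101, 7592, 999, 102, 2293, 2017, 102], [0, 0, 0, 0, 1, 1, 1])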
Example #5
class BaseDataset(torch.utils.data.Dataset):
    def __init__(self,
                 args,
                 tokenizer,
                 split_type,
                 labels=True,
                 labels_file=None):
        self.args = args
        self.dataroot = args.dataroot
        self.tokenizer = tokenizer
        self.split_type = split_type

        self.SPECIAL_TOKENS = SPECIAL_TOKENS
        self.SPECIAL_TOKENS_VALUES = SPECIAL_TOKENS_VALUES
        self.bos = self.tokenizer.convert_tokens_to_ids(
            self.SPECIAL_TOKENS["bos_token"])
        self.eos = self.tokenizer.convert_tokens_to_ids(
            self.SPECIAL_TOKENS["eos_token"])
        self.pad = self.tokenizer.convert_tokens_to_ids(
            self.SPECIAL_TOKENS["pad_token"])
        self.speaker1, self.speaker2, self.knowledge_sep, self.knowledge_tag = self.tokenizer.convert_tokens_to_ids(
            self.SPECIAL_TOKENS["additional_special_tokens"])
        self.knowledge_sep_token = self.SPECIAL_TOKENS[
            "additional_special_tokens"][2]

        self.all_response_tokenized = []
        self.dataset_walker = DatasetWalker(split_type,
                                            labels=labels,
                                            dataroot=self.dataroot,
                                            labels_file=labels_file)
        self.dialogs = self._prepare_conversations()
        # deduplicate the tokenized responses (str round-trip since lists are unhashable)
        self.all_response_tokenized = list(
            map(eval, set(map(str, self.all_response_tokenized))))

        self.knowledge_reader = KnowledgeReader(self.dataroot,
                                                args.knowledge_file)
        self.knowledge, self.snippets = self._prepare_knowledge()

        self._create_examples()

    def _prepare_conversations(self):
        logger.info("Tokenize and encode the dialog data")
        tokenized_dialogs = []
        for i, (log, label) in enumerate(
                tqdm(self.dataset_walker,
                     disable=self.args.local_rank
                     not in [-1, 0])):  # only show progress bar in one process
            dialog = {}
            dialog["id"] = i
            dialog["log"] = log
            if label is not None:
                if "response" in label:
                    label[
                        "response_tokenized"] = self.tokenizer.convert_tokens_to_ids(
                            self.tokenizer.tokenize(label["response"]))
                    self.all_response_tokenized.append(
                        label["response_tokenized"])
            dialog["label"] = label
            tokenized_dialogs.append(dialog)
        return tokenized_dialogs

    def _prepare_knowledge(self):
        knowledge = self.knowledge_reader.knowledge
        self.knowledge_docs = self.knowledge_reader.get_doc_list()

        tokenized_snippets = dict()
        for snippet in self.knowledge_docs:
            key = "{}__{}__{}".format(snippet["domain"],
                                      str(snippet["entity_id"]) or "",
                                      snippet["doc_id"])
            knowledge = self._knowledge_to_string(snippet["doc"],
                                                  name=snippet["entity_name"]
                                                  or "")
            tokenized_knowledge = self.tokenizer.convert_tokens_to_ids(
                self.tokenizer.tokenize(knowledge))
            tokenized_snippets[key] = tokenized_knowledge[:self.args.
                                                          knowledge_max_tokens]
        return knowledge, tokenized_snippets

    def _knowledge_to_string(self, doc, name=""):
        return doc["body"]

    def _create_examples(self):
        logger.info("Creating examples")
        self.examples = []
        for dialog in tqdm(self.dialogs,
                           disable=self.args.local_rank not in [-1, 0]):
            dialog_id = dialog["id"]
            label = dialog["label"]
            dialog = dialog["log"]
            if label is None:
                # This will only happen when running knowledge-seeking turn detection on test data
                # So we create dummy target here
                label = {"target": False}

            target = label["target"]

            if not target and self.args.task != "detection":
                # we only care about non-knowledge-seeking turns in turn detection task
                continue

            history = [
                self.tokenizer.convert_tokens_to_ids(
                    self.tokenizer.tokenize(turn["text"])) for turn in dialog
            ]
            gt_resp = label.get("response", "")
            tokenized_gt_resp = self.tokenizer.convert_tokens_to_ids(
                self.tokenizer.tokenize(gt_resp))

            # apply history threshold at an utterance-level (a large value can be used to nullify its effect)
            truncated_history = history[-self.args.history_max_utterances:]

            # perform token-level truncation of history from the left
            truncated_history = truncate_sequences(
                truncated_history, self.args.history_max_tokens)

            if target:
                if "knowledge" not in label:
                    # when the labels.json is from knowledge-seeking turn detection,
                    # there will be no ground truth knowledge
                    # so we just use a dummy snippet here
                    if not self.args.eval_all_snippets:
                        raise ValueError(
                            "eval_all_snippets is required to be true when taking output from knowledge-seeking turn detection"
                        )
                    label["knowledge"] = [self.knowledge_docs[0]]

                knowledge = label["knowledge"][0]
                knowledge_key = "{}__{}__{}".format(knowledge["domain"],
                                                    knowledge["entity_id"],
                                                    knowledge["doc_id"])
                # find snippets with same entity as candidates
                prefix = "{}__{}".format(knowledge["domain"],
                                         knowledge["entity_id"])
                knowledge_candidates = [
                    cand for cand in self.snippets.keys()
                    if cand.startswith(prefix)
                ]
                if self.split_type == "train" and self.args.negative_sample_method == "oracle":
                    # if there's not enough candidates during training, we just skip this example
                    if len(knowledge_candidates) < self.args.n_candidates:
                        continue
                used_knowledge = self.snippets[knowledge_key]
                used_knowledge = used_knowledge[:self.args.
                                                knowledge_max_tokens]
            else:
                knowledge_candidates = None
                used_knowledge = []

            if target and self.args.__dict__.get("n_response_candidates",
                                                 1) > 1:
                response_candidates = self.all_response_tokenized
            else:
                response_candidates = None

            self.examples.append({
                "history": truncated_history,
                "knowledge": used_knowledge,
                "candidates": knowledge_candidates,
                "response": tokenized_gt_resp,
                "response_candidates": response_candidates,
                "response_text": gt_resp,
                "label": label,
                "knowledge_seeking": target,
                "dialog_id": dialog_id
            })

    def build_input_from_segments(self,
                                  knowledge,
                                  history,
                                  response,
                                  with_eos=True):
        """ Build a sequence of input from 3 segments: knowledge, history and last reply """
        instance = {}

        sequence = [[self.bos] + knowledge] + history + [
            response + ([self.eos] if with_eos else [])
        ]
        sequence_with_speaker = [
            [self.speaker1 if
             (len(sequence) - i) % 2 == 0 else self.speaker2] + s
            for i, s in enumerate(sequence[1:])
        ]
        sequence = [sequence[0]] + sequence_with_speaker
        instance["input_ids"] = list(chain(*sequence))
        instance["token_type_ids"] = [
            self.speaker2 if i % 2 else self.speaker1
            for i, s in enumerate(sequence) for _ in s
        ]
        instance["mc_token_ids"] = len(instance["input_ids"]) - 1
        instance["lm_labels"] = (
            [-100] * sum(len(s)
                         for s in sequence[:-1])) + [-100] + sequence[-1][1:]

        return instance, sequence

    def __getitem__(self, index):
        raise NotImplementedError

    def __len__(self):
        return len(self.examples)
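
To make the layout produced by build_input_from_segments() concrete, here is a self-contained re-run of its core logic with made-up token ids (bos=1, eos=2, speaker1=3, speaker2=4); the ids are placeholders, not real tokenizer output.

from itertools import chain

bos, eos, speaker1, speaker2 = 1, 2, 3, 4
knowledge = [10, 11]        # tokenized knowledge snippet
history = [[20, 21], [30]]  # two dialog turns
response = [40, 41]         # tokenized ground-truth response

sequence = [[bos] + knowledge] + history + [response + [eos]]
sequence_with_speaker = [
    [speaker1 if (len(sequence) - i) % 2 == 0 else speaker2] + s
    for i, s in enumerate(sequence[1:])
]
sequence = [sequence[0]] + sequence_with_speaker
input_ids = list(chain(*sequence))
lm_labels = [-100] * sum(len(s) for s in sequence[:-1]) + [-100] + sequence[-1][1:]

print(input_ids)  # [1, 10, 11, 3, 20, 21, 4, 30, 3, 40, 41, 2]
print(lm_labels)  # nine -100 entries followed by [40, 41, 2]

Only the response tokens after their speaker token are supervised; the knowledge and history positions are masked out with -100.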
Example #6
    def __init__(self,
                 args,
                 tokenizer,
                 split_type,
                 labels=True,
                 labels_file=None):
        self.args = args
        self.dataroot = args.dataroot
        self.tokenizer = tokenizer
        self.split_type = split_type

        self.SPECIAL_TOKENS = SPECIAL_TOKENS
        self.SPECIAL_TOKENS_VALUES = SPECIAL_TOKENS_VALUES

        # Bert special tokens
        self.cls = self.tokenizer.convert_tokens_to_ids(
            self.SPECIAL_TOKENS["cls_token"])
        self.sep = self.tokenizer.convert_tokens_to_ids(
            self.SPECIAL_TOKENS['sep_token'])

        self.bos = self.tokenizer.convert_tokens_to_ids(
            self.SPECIAL_TOKENS["bos_token"])
        self.eos = self.tokenizer.convert_tokens_to_ids(
            self.SPECIAL_TOKENS["eos_token"])

        # PAD modified
        self.pad = self.tokenizer.convert_tokens_to_ids(
            self.SPECIAL_TOKENS["pad_token"])
        self.speaker1, self.speaker2, self.knowledge_sep, self.knowledge_tag = self.tokenizer.convert_tokens_to_ids(
            self.SPECIAL_TOKENS["additional_special_tokens"])
        self.knowledge_sep_token = self.SPECIAL_TOKENS[
            "additional_special_tokens"][2]

        # dataset_walker.py
        #   self.logs: logs.json
        #   self.labels: labels.json

        ## if labels_file is passed in, use the output of task 1 (baseline.ktd.json),
        ## which only has target: True / False
        self.dataset_walker = DatasetWalker(split_type,
                                            labels=labels,
                                            dataroot=self.dataroot,
                                            labels_file=labels_file)

        # self.dialogs: list of dictionaries
        #   for train_baseline:
        #       format: [{'id': xx, 'log': [{'speaker': xx, 'text': xx}, {...}], 'label': {'target': xx, 'knowledge': [{'domain': xx, 'entity_id': xx}]}},
        #                {...},
        #                {...}]
        #       e.g. self.dialogs[0] = {'id': 0, 'log': [{'speaker': 'U', 'text': 'Looking for a place to eat in the city center.'}], 'label': {'target': False}}

        ##  for run_baseline: 'label' only has 'target'
        ##      format: [{'id': int, 'log': [{'speaker': string, 'text': string}, {...}, {...}], 'label': {'target': True/False}},
        ##               {...},
        ##               {...}]
        self.dialogs = self._prepare_conversations()

        # knowledge_reader.py
        #   self.knowledge: knowledge.json
        self.knowledge_reader = KnowledgeReader(self.dataroot,
                                                args.knowledge_file)

        # self.snippets: dictionary
        #   format: {key: value}
        #   key: 'domain__entity_id'
        #   value: list, tokenized knowledge, str(self.knowledge_sep_token).join([domain, name]), up to self.args.knowledge_max_tokens
        self.knowledge, self.snippets = self._prepare_knowledge()
        print("# of snippets = ", len(self.snippets.keys()))
        print('\n\n')

        self._create_examples()
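
The comments above describe entity-level snippet keys of the form 'domain__entity_id' whose value is the domain and entity name joined by the knowledge separator. A small sketch of what such an entity-level string function might look like; it is inferred from the comment, not taken from this repository.

KNOWLEDGE_SEP = "<knowledge_sep>"  # placeholder for self.knowledge_sep_token

def knowledge_to_string(domain, name=""):
    return (" %s " % KNOWLEDGE_SEP).join([domain, name])

key = "{}__{}".format("hotel", 1)
value = knowledge_to_string("hotel", "A and B Guest House")
print(key, "->", value)  # hotel__1 -> hotel <knowledge_sep> A and B Guest House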
Example #7
class Bert(torch.utils.data.Dataset):
    def __init__(self,
                 args,
                 tokenizer,
                 split_type,
                 labels=True,
                 labels_file=None):
        self.args = args
        self.dataroot = args.dataroot
        self.tokenizer = tokenizer
        self.split_type = split_type

        self.SPECIAL_TOKENS = SPECIAL_TOKENS
        self.SPECIAL_TOKENS_VALUES = SPECIAL_TOKENS_VALUES

        # Bert special tokens
        self.cls = self.tokenizer.convert_tokens_to_ids(
            self.SPECIAL_TOKENS["cls_token"])
        self.sep = self.tokenizer.convert_tokens_to_ids(
            self.SPECIAL_TOKENS['sep_token'])

        self.bos = self.tokenizer.convert_tokens_to_ids(
            self.SPECIAL_TOKENS["bos_token"])
        self.eos = self.tokenizer.convert_tokens_to_ids(
            self.SPECIAL_TOKENS["eos_token"])

        # PAD modified
        self.pad = self.tokenizer.convert_tokens_to_ids(
            self.SPECIAL_TOKENS["pad_token"])
        self.speaker1, self.speaker2, self.knowledge_sep, self.knowledge_tag = self.tokenizer.convert_tokens_to_ids(
            self.SPECIAL_TOKENS["additional_special_tokens"])
        self.knowledge_sep_token = self.SPECIAL_TOKENS[
            "additional_special_tokens"][2]

        # dataset_walker.py
        #   self.logs: logs.json
        #   self.labels: labels.json

        ## if labels_file is passed in, use the output of task 1 (baseline.ktd.json),
        ## which only has target: True / False
        self.dataset_walker = DatasetWalker(split_type,
                                            labels=labels,
                                            dataroot=self.dataroot,
                                            labels_file=labels_file)

        # self.dialogs: list of dictionaries
        #   for train_baseline:
        #       format: [{'id': xx, 'log': [{'speaker': xx, 'text': xx}, {...}], 'label': {'target': xx, 'knowledge': [{'domain': xx, 'entity_id': xx}]}},
        #                {...},
        #                {...}]
        #       e.g. self.dialogs[0] = {'id': 0, 'log': [{'speaker': 'U', 'text': 'Looking for a place to eat in the city center.'}], 'label': {'target': False}}

        ##  for run_baseline: 'label' only has 'target'
        ##      format: [{'id': int, 'log': [{'speaker': string, 'text': string}, {...}, {...}], 'label': {'target': True/False}},
        ##               {...},
        ##               {...}]
        self.dialogs = self._prepare_conversations()

        # knowledge_reader.py
        #   self.knowledge: knowledge.json
        self.knowledge_reader = KnowledgeReader(self.dataroot,
                                                args.knowledge_file)

        # self.snippets: dictionary
        #   format: {key: value}
        #   key: 'domain__entity_id'
        #   value: list, tokenized knowledge, str(self.knowledge_sep_token).join([domain, name]), up to self.args.knowledge_max_tokens
        self.knowledge, self.snippets = self._prepare_knowledge()
        print("# of snippets = ", len(self.snippets.keys()))
        print('\n\n')

        self._create_examples()

    def _prepare_conversations(self):
        logger.info("Tokenize and encode the dialog data")
        tokenized_dialogs = []
        status = 0
        for i, (log, label) in enumerate(
                tqdm(self.dataset_walker,
                     disable=self.args.local_rank
                     not in [-1, 0])):  # only show progress bar in one process
            dialog = {}
            dialog["id"] = i
            dialog["log"] = log
            if label is not None:
                # should not have this part!!
                if "response" in label:
                    status = 1
                    label[
                        "response_tokenized"] = self.tokenizer.convert_tokens_to_ids(
                            self.tokenizer.tokenize(label["response"]))
            dialog["label"] = label
            tokenized_dialogs.append(dialog)
        print("dialog length = ", len(tokenized_dialogs))
        if status: print("Wrong!! There is a response in labels.json!\n")
        else: print("No response in labels.json\n")
        return tokenized_dialogs

    def _prepare_knowledge(self):
        knowledge = self.knowledge_reader.knowledge
        # self.knowledge_docs: list of dictionaries
        # self.knowledge_docs = self.knowledge_reader.get_doc_list()
        self.knowledge_docs = self.knowledge_reader.get_domain_entity_list()

        tokenized_snippets = dict()
        for snippet in self.knowledge_docs:
            key = "{}__{}".format(snippet["domain"],
                                  str(snippet["entity_id"]) or "")
            knowledge = self._knowledge_to_string(snippet["domain"],
                                                  name=snippet["entity_name"]
                                                  or "")
            tokenized_knowledge = self.tokenizer.convert_tokens_to_ids(
                self.tokenizer.tokenize(knowledge))
            tokenized_snippets[key] = tokenized_knowledge[:self.args.
                                                          knowledge_max_tokens]
        print("knowledge length = ",
              len(tokenized_snippets))  # 145 = 33 + 110 + 1 + 1
        return knowledge, tokenized_snippets

    def _knowledge_to_string(self, doc, name=""):
        return doc["body"]

    def _create_examples(self):
        logger.info("Creating examples")

        # self.examples: list of dictionary
        self.examples = []

        for dialog in tqdm(self.dialogs,
                           disable=self.args.local_rank not in [-1, 0]):
            dialog_id = dialog["id"]
            label = dialog["label"]
            dialog = dialog["log"]
            if label is None:
                # This will only happen when running knowledge-seeking turn detection on test data
                # So we create dummy target here
                label = {"target": False}

            target = label["target"]  # True or False

            # target == false, for task2 & 3, ignore
            if not target and self.args.task != "detection":
                # we only care about non-knowledge-seeking turns in turn detection task
                continue

            # history: 2d list of one dialog, tokenized dialog text (no speaker info., later will be added manually)
            #   format: [[1st tokenized text], [2nd tokenized text], ...]
            history = [
                self.tokenizer.convert_tokens_to_ids(
                    self.tokenizer.tokenize(turn["text"])) for turn in dialog
            ]

            # get response from label if exists
            ## no response for run_baseline (baseline.ktd.json)
            gt_resp = label.get("response", "")
            # tokenize response
            tokenized_gt_resp = self.tokenizer.convert_tokens_to_ids(
                self.tokenizer.tokenize(gt_resp))

            # apply history threshold at an utterance-level (a large value can be used to nullify its effect)
            truncated_history = history[
                -self.args.history_max_utterances:]  # max num of utterance

            # data.py
            # perform token-level truncation of history from the left
            truncated_history = truncate_sequences(
                truncated_history,
                self.args.history_max_tokens)  # max num of tokens

            if target:  # for task2 & 3
                if "knowledge" not in label:
                    # when the labels.json is from knowledge-seeking turn detection,
                    # there will be no ground truth knowledge
                    # so we just use a dummy snippet here
                    if not self.args.eval_all_snippets:
                        raise ValueError(
                            "eval_all_snippets is required to be true when taking output from knowledge-seeking turn detection"
                        )
                    ## for run_baseline (and possibly full validation-set evaluation):
                    ## the dummy knowledge is just the 1st knowledge snippet in knowledge.json,
                    ## so this label carries no real knowledge information
                    label["knowledge"] = [self.knowledge_docs[0]]

                # knowledge: 1st knowledge snippet in labels.json or a dummy knowledge
                knowledge = label["knowledge"][0]

                knowledge_key = "{}__{}".format(knowledge["domain"],
                                                knowledge["entity_id"])
                # find snippets with same entity as candidates
                prefix = "{}".format(knowledge["domain"])

                # knowledge_candidates: list of strings, the keys in self.snippets that share the same domain prefix as knowledge_key
                #   format: [key, key, ...]
                #   Fixed one problem: with a plain startswith() check, a knowledge_key of 'hotel__1'
                #   would also match 'hotel__10', 'hotel__11', ... in addition to the snippets of hotel entity 1.
                # knowledge_candidates = [cand for cand in self.snippets.keys() if cand.startswith(prefix)]
                knowledge_candidates = [
                    cand for cand in self.snippets.keys()
                    if "__".join(cand.split("__")[:-1]) == prefix
                ]
                if self.split_type == "train" and self.args.negative_sample_method == "oracle":
                    # if there's not enough candidates during training, we just skip this example
                    if len(knowledge_candidates) < self.args.n_candidates:
                        continue

                ## for run_baseline: dummy knowledge, 1st knowledge snippet
                used_knowledge = self.snippets[knowledge_key]  # used knowledge
                used_knowledge = used_knowledge[:self.args.
                                                knowledge_max_tokens]  # tokenized used knowledge
            else:  # no need to do task2 & 3
                knowledge_candidates = None
                used_knowledge = []

            self.examples.append({
                "history":
                truncated_history,  # 2d list, list of tokenized texts
                "knowledge":
                used_knowledge,  # tokenized used knowledge ## dummy knowledge for run_baseline
                "candidates":
                knowledge_candidates,  # list of snippet keys used as negative-sampling candidates
                "response": tokenized_gt_resp,
                "response_text": gt_resp,
                "label": label,
                "knowledge_seeking": target,
                "dialog_id": dialog_id
            })

    def build_input_from_segments(self,
                                  knowledge,
                                  history,
                                  response,
                                  with_eos=True):
        """ Build a sequence of input from 3 segments: knowledge, history and last reply """
        instance = {}

        sequence = [[self.bos] + knowledge] + history + [
            response + ([self.eos] if with_eos else [])
        ]
        sequence_with_speaker = [
            [self.speaker1 if
             (len(sequence) - i) % 2 == 0 else self.speaker2] + s
            for i, s in enumerate(sequence[1:])
        ]
        sequence = [sequence[0]] + sequence_with_speaker
        instance["input_ids"] = list(chain(*sequence))
        instance["token_type_ids"] = [
            self.speaker2 if i % 2 else self.speaker1
            for i, s in enumerate(sequence) for _ in s
        ]
        instance["mc_token_ids"] = len(instance["input_ids"]) - 1
        instance["lm_labels"] = (
            [-100] * sum(len(s)
                         for s in sequence[:-1])) + [-100] + sequence[-1][1:]

        return instance, sequence

    def __getitem__(self, index):
        raise NotImplementedError

    def __len__(self):
        return len(self.examples)
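
The candidate-filter fix noted in _create_examples above is easiest to see on document-level keys, where a plain startswith() test (as used in Example #5) lets 'hotel__1' also match entities 10, 11, ..., while comparing the key with its last '__' component stripped does not. A small illustration with made-up keys:

snippet_keys = ["hotel__1__0", "hotel__1__1", "hotel__10__0", "taxi__1__0"]
prefix = "hotel__1"

by_startswith = [k for k in snippet_keys if k.startswith(prefix)]
by_components = [k for k in snippet_keys
                 if "__".join(k.split("__")[:-1]) == prefix]

print(by_startswith)  # ['hotel__1__0', 'hotel__1__1', 'hotel__10__0']  <- entity 10 slips in
print(by_components)  # ['hotel__1__0', 'hotel__1__1']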
Example #8
def main(argv):
    parser = argparse.ArgumentParser(
        description='Evaluate the system outputs.')

    parser.add_argument('--dataset',
                        dest='dataset',
                        action='store',
                        metavar='DATASET',
                        choices=['train', 'val', 'test'],
                        required=True,
                        help='The dataset to analyze')
    parser.add_argument(
        '--dataroot',
        dest='dataroot',
        action='store',
        metavar='PATH',
        required=True,
        help='Will look for corpus in <dataroot>/<dataset>/...')
    parser.add_argument("--knowledge_file",
                        type=str,
                        default="knowledge.json",
                        help="knowledge file name.")
    parser.add_argument("--sub_beam_size",
                        type=int,
                        default=2,
                        help="sub_beam_size")
    parser.add_argument("--group_size", type=int, default=4, help="group_size")
    parser.add_argument('--outfile',
                        dest='outfile',
                        action='store',
                        metavar='JSON_FILE',
                        required=True,
                        help='File containing output JSON')
    parser.add_argument('--get_response_version', type=str, default='new')
    parser.add_argument('--from_combine', action='store_true')
    parser.add_argument('--postfile', type=str, default='')

    args = parser.parse_args()

    with open(args.outfile, 'r') as f:
        output = json.load(f)
    if args.from_combine:
        postfile = args.postfile or re.sub(
            r'att_(\d+)_(\d+)', lambda m: f'att{m.group(2)}',
            args.outfile).replace('combine', 'post')
        with open(postfile, 'r') as f:
            post_output = json.load(f)

    knowledge_reader = KnowledgeReader(dataroot=args.dataroot,
                                       knowledge_file=args.knowledge_file)
    beam_size = args.sub_beam_size * args.group_size
    version = args.get_response_version  # note: the parser defines --get_response_version, not --version

    get_response_and_score = partial(get_response_and_score_meta,
                                     ver=args.get_response_version)

    med_radio_list = []
    med_score_list = []
    whole_knowledge_list = []

    metric = Metric()
    for pid, pred in enumerate(output):
        if pred['target']:
            front_txt = []
            post_txt = []
            lm_scores = []
            ref_text = knowledge_reader.get_doc(
                **pred['knowledge'][0])['doc']['body']
            whole_knowledge_list.append(ref_text)
            p_response = pred['response']
            p_response_list = splitSentence(p_response)
            if len(p_response_list) > 1:
                p_response_list = [
                    ' '.join(p_response_list[:i])
                    for i in range(1, len(p_response_list))
                ]
                p_response_list_med = [
                    Levenshtein.distance(ref_text, candidate_text)
                    for candidate_text in p_response_list
                ]
                p_response_front = p_response_list[int(
                    np.argmin(p_response_list_med))]
                p_response_post = p_response[len(p_response_front) +
                                             1:].strip()
            for _id in range(beam_size):
                candidate = pred['beam_outputs'][f'id_{_id}']
                candidate_text, lm_score = get_response_and_score(candidate)
                candidate_text_list = splitSentence(candidate_text)
                if not args.from_combine:
                    lm_scores.append(lm_score)
                else:
                    post_candidate = post_output[pid]['beam_outputs'][
                        f'id_{_id}']
                    _post_t, post_score = get_response_and_score(post_candidate)
                    lm_scores.append(post_score)
                if len(candidate_text_list) > 1:
                    candidate_text_list = [
                        ' '.join(candidate_text_list[:i])
                        for i in range(1, len(candidate_text_list))
                    ]
                    candidate_text_list_med = [
                        Levenshtein.distance(ref_text, candidate_text)
                        for candidate_text in candidate_text_list
                    ]
                    candidate_text_after = candidate_text_list[int(
                        np.argmin(candidate_text_list_med))]
                    front_txt.append(candidate_text_after)
                    if args.from_combine:
                        post_txt.append(_post_t)
                    else:
                        post_txt.append(
                            candidate_text[len(candidate_text_after) +
                                           1:].strip())
                    candidate_text = candidate_text_after
                else:
                    front_txt.append(candidate_text)
                    post_txt.append(candidate_text)
                dis_func = Levenshtein.jaro_winkler
                med_radio_list.append(dis_func(candidate_text, ref_text))
                metric.update(ref_text, candidate_text, lm_score)

    scores = metric.score_list

    metric.cal_bertscore()
    bert_score = metric.bertscore
    lm_score = metric.lm_score

    bert_score = bert_score[2].reshape((-1, beam_size))
    lm_score = torch.tensor(lm_score).reshape((-1, beam_size))

    med_radio_score = torch.tensor(med_radio_list).reshape((-1, beam_size))
    lm_score = (lm_score - lm_score.min()) / (lm_score.max() - lm_score.min())
    set_zeros_lm_score(lm_score, args.sub_beam_size, args.group_size)
    bert_score -= bert_score.min(dim=-1, keepdim=True)[0]
    bert_score /= bert_score.max(dim=-1, keepdim=True)[0]
    med_part = torch.where(med_radio_score > 0.9, med_radio_score,
                           torch.zeros_like(med_radio_score)) * 0.5
    final_score = bert_score + lm_score - med_part
    print(med_radio_score[0])
    print(bert_score[0], lm_score[0], med_part[0])

    select = final_score.argmax(dim=-1)

    item_id = 0
    for pred in output:
        if pred['target']:
            candidate_text, _ = get_response_and_score(
                pred['beam_outputs'][f'id_{select[item_id].item()}'])
            pred['response'] = candidate_text
            item_id += 1

    with open(os.path.join(args.outfile[:-5] + f'_rerank{version}.json'),
              'w') as fout:
        json.dump(output, fout, indent=2)
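
The reranking at the end of main() boils down to a per-dialog fusion of three signals: min-max normalised LM scores, per-row normalised BERTScore, and a penalty for candidates that copy the knowledge snippet almost verbatim. A toy sketch with random tensors (the project-specific set_zeros_lm_score step is omitted):

import torch

beam_size = 4
bert_score = torch.rand(2, beam_size)        # stand-in for metric.bertscore[2], reshaped
lm_score = torch.rand(2, beam_size)          # stand-in for the per-candidate LM scores
med_radio_score = torch.rand(2, beam_size)   # Jaro-Winkler similarity to the snippet

lm_score = (lm_score - lm_score.min()) / (lm_score.max() - lm_score.min())
bert_score = bert_score - bert_score.min(dim=-1, keepdim=True)[0]
bert_score = bert_score / bert_score.max(dim=-1, keepdim=True)[0]

# penalise near-verbatim copies of the knowledge snippet (similarity > 0.9)
med_part = torch.where(med_radio_score > 0.9, med_radio_score,
                       torch.zeros_like(med_radio_score)) * 0.5

final_score = bert_score + lm_score - med_part
select = final_score.argmax(dim=-1)          # chosen beam index per dialog
print(select)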
Example #9
class BaseDataset_Bert(torch.utils.data.Dataset):
    def __init__(self,
                 args,
                 tokenizer,
                 split_type,
                 labels=True,
                 labels_file=None):
        # when training the model, labels == True
        self.args = args
        self.dataroot = args.dataroot
        self.tokenizer = tokenizer
        self.split_type = split_type

        self.SPECIAL_TOKENS = SPECIAL_TOKENS
        self.SPECIAL_TOKENS_VALUES = SPECIAL_TOKENS_VALUES

        self.bos = self.tokenizer.convert_tokens_to_ids(
            self.SPECIAL_TOKENS["bos_token"])
        self.eos = self.tokenizer.convert_tokens_to_ids(
            self.SPECIAL_TOKENS["eos_token"])
        self.pad = self.tokenizer.convert_tokens_to_ids(
            self.SPECIAL_TOKENS["pad_token"])
        self.cls = self.tokenizer.convert_tokens_to_ids(
            self.SPECIAL_TOKENS['cls_token'])
        self.sep = self.tokenizer.convert_tokens_to_ids(
            self.SPECIAL_TOKENS['sep_token'])
        #         self.unk= self.tokenizer.convert_tokens_to_ids(self.SPECIAL_TOKENS['UNK_token'])

        self.speaker1, self.speaker2, self.knowledge_sep, self.knowledge_tag = self.tokenizer.convert_tokens_to_ids(
            self.SPECIAL_TOKENS["additional_special_tokens"])
        self.knowledge_sep_token = self.SPECIAL_TOKENS[
            "additional_special_tokens"][2]

        self.dataset_walker = DatasetWalker(split_type,
                                            labels=labels,
                                            dataroot=self.dataroot,
                                            labels_file=labels_file)
        self.dialogs = self._prepare_conversations(
        )  # get the parsed dialog data from dataset_walker
        # print("dialogs: ",self.dialogs[0])
        '''e.g.
          [{'id': 0, 'log': [{'speaker': 'U', 'text': "I'd really like to take my client out to a nice restaurant that serves indian food."}], 'label': None}, 
          {'id': 1, 'log': [{'speaker': 'U', 'text': "I'd really like to take my client out to a nice restaurant that serves indian food."}, {'speaker': 'S', 'text': 'I show many restaurants that serve Indian food in that price range. What area would you like to travel to?'}, {'speaker': 'U', 'text': 'Indian food is usually vegetarian friendly, right?'}], 'label': None}]
          '''
        self.knowledge_reader = KnowledgeReader(self.dataroot,
                                                args.knowledge_file)
        self.knowledge, self.snippets = self._prepare_knowledge()

        self._create_examples()

    def _prepare_conversations(self):  ## tokenize the dialog data
        logger.info("Tokenize and encode the dialog data")
        tokenized_dialogs = []
        for i, (log, label) in enumerate(
                tqdm(self.dataset_walker,
                     disable=self.args.local_rank
                     not in [-1, 0])):  # only show progress bar in one process
            dialog = {}
            dialog["id"] = i
            dialog["log"] = log
            if label is not None:
                if "response" in label:  #this is for task3: generate the response
                    label[
                        "response_tokenized"] = self.tokenizer.convert_tokens_to_ids(
                            self.tokenizer.tokenize(label["response"]))
            dialog["label"] = label
            tokenized_dialogs.append(dialog)
        return tokenized_dialogs

    def _prepare_knowledge(self):  ## prepare knowledge snippet
        knowledge = self.knowledge_reader.knowledge
        self.knowledge_docs = self.knowledge_reader.get_doc_list()

        tokenized_snippets = dict()
        for snippet in self.knowledge_docs:
            key = "{}__{}__{}".format(snippet["domain"],
                                      str(snippet["entity_id"]) or "",
                                      snippet["doc_id"])
            knowledge = self._knowledge_to_string(snippet["doc"],
                                                  name=snippet["entity_name"]
                                                  or "")
            tokenized_knowledge = self.tokenizer.convert_tokens_to_ids(
                self.tokenizer.tokenize(knowledge))
            tokenized_snippets[key] = tokenized_knowledge[:self.args.
                                                          knowledge_max_tokens]
        return knowledge, tokenized_snippets

    def _knowledge_to_string(self,
                             doc,
                             name=""):  ## return the string form of the knowledge
        return doc["body"]

    def _create_examples(self):
        logger.info("Creating examples")
        self.examples = []
        for dialog in tqdm(self.dialogs,
                           disable=self.args.local_rank not in [-1, 0]):
            dialog_id = dialog["id"]
            label = dialog["label"]
            dialog = dialog["log"]
            if label is None:  ## label is None only during the evaluation phase
                # This will only happen when running knowledge-seeking turn detection on test data (evaluation phase)
                # So we create a dummy target here
                label = {"target": False}

            target = label["target"]

            if not target and self.args.task != "detection":
                # we only care about non-knowledge-seeking turns in turn detection task
                continue
            # we only go further if target is True or the task is detection

            history = [
                self.tokenizer.convert_tokens_to_ids(
                    self.tokenizer.tokenize(turn["text"])) for turn in dialog
            ]  # dialog is one conversation; each turn's text is tokenized here

            gt_resp = label.get("response", "")
            tokenized_gt_resp = self.tokenizer.convert_tokens_to_ids(
                self.tokenizer.tokenize(gt_resp))

            # apply history threshold at an utterance-level (a large value can be used to nullify its effect)
            truncated_history = history[-self.args.history_max_utterances:]

            # perform token-level truncation of history from the left
            truncated_history = truncate_sequences(
                truncated_history, self.args.history_max_tokens
            )  # keep the total number of history tokens within history_max_tokens

            if target:  # target == True: knowledge-seeking turn (needed for tasks 2 & 3)
                if "knowledge" not in label:
                    # when the labels.json is from knowledge-seeking turn detection,
                    # there will be no ground truth knowledge
                    # so we just use a dummy snippet here
                    if not self.args.eval_all_snippets:
                        raise ValueError(
                            "eval_all_snippets is required to be true when taking output from knowledge-seeking turn detection"
                        )
                    label["knowledge"] = [self.knowledge_docs[0]]

                knowledge = label["knowledge"][0]
                knowledge_key = "{}__{}__{}".format(knowledge["domain"],
                                                    knowledge["entity_id"],
                                                    knowledge["doc_id"])
                # find snippets with same entity as candidates
                prefix = "{}__{}".format(knowledge["domain"],
                                         knowledge["entity_id"])
                knowledge_candidates = [
                    cand for cand in self.snippets.keys()
                    if cand.startswith(prefix)
                ]

                if self.split_type == "train" and self.args.negative_sample_method == "oracle":
                    # if there's not enough candidates during training, we just skip this example
                    if len(knowledge_candidates) < self.args.n_candidates:
                        continue
                used_knowledge = self.snippets[knowledge_key]
                used_knowledge = used_knowledge[:self.args.
                                                knowledge_max_tokens]  # truncate to the first knowledge_max_tokens tokens
            else:  # target == False: the turn is not knowledge-seeking (only kept for the detection task)
                knowledge_candidates = None
                used_knowledge = []

            self.examples.append({
                "history": truncated_history,  #tokenized history
                "knowledge": used_knowledge,  #it is none if target==false
                "candidates": knowledge_candidates,
                "response": tokenized_gt_resp,
                "response_text": gt_resp,
                "label": label,
                "knowledge_seeking": target,
                "dialog_id": dialog_id
            })

    def build_input_from_segments(self,
                                  knowledge,
                                  history,
                                  response,
                                  with_eos=True):
        """ Build a sequence of input from 3 segments: knowledge, history and last reply """
        instance = {}

        sequence = [[self.bos] + knowledge] + history + [
            response + ([self.eos] if with_eos else [])
        ]  # bos is the start token and eos is the last token
        sequence_with_speaker = [
            [self.speaker1 if
             (len(sequence) - i) % 2 == 0 else self.speaker2] +
            s  # get the list [[speaker i, sequence]...]
            for i, s in enumerate(sequence[1:])
        ]
        sequence = [sequence[0]] + sequence_with_speaker
        instance["input_ids"] = list(chain(*sequence))
        instance["token_type_ids"] = [
            self.speaker2 if i % 2 else self.speaker1
            for i, s in enumerate(sequence) for _ in s
        ]
        instance["mc_token_ids"] = len(instance["input_ids"]) - 1
        instance["lm_labels"] = (
            [-100] * sum(len(s)
                         for s in sequence[:-1])) + [-100] + sequence[-1][1:]

        return instance, sequence

    def __getitem__(self, index):
        raise NotImplementedError

    def __len__(self):
        return len(self.examples)
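
All of the dataset classes above leave __getitem__ to subclasses. A minimal subclass sketch of how a response-generation dataset would typically fill it in; this class is an assumption for illustration, not code from this repository.

class ResponseGenerationDataset(BaseDataset_Bert):
    def __getitem__(self, index):
        example = self.examples[index]
        instance, _ = self.build_input_from_segments(example["knowledge"],
                                                     example["history"],
                                                     example["response"])
        instance["dialog_id"] = example["dialog_id"]
        return instance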