Code Example #1
File: corpora.py  Project: zixufang/multiwoz-mdrg
    def _to_id_corpus(self, name, data):
        # Convert every utterance, goal, and outcome to vocabulary ids.
        results = []
        for dlg in data:
            if len(dlg.dlg) < 1:
                continue
            id_dlg = []
            for turn in dlg.dlg:
                id_turn = Pack(utt=self._sent2id(turn.utt),
                               speaker=turn.speaker)
                id_dlg.append(id_turn)
            id_goal = self._goal2id(dlg.goal)
            id_out = self._outcome2id(dlg.out)
            results.append(Pack(dlg=id_dlg, goal=id_goal, out=id_out))
        return results
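Every snippet in this listing relies on a small Pack container from the project's utilities. A minimal sketch of how it behaves, assuming it is essentially a dict whose keys double as attributes (the real class may carry extra helpers):

class Pack(dict):
    # Minimal stand-in: keys are readable as attributes, and copy() keeps the
    # type so later code can still mix turn.utt and turn['utt'].
    def __getattr__(self, name):
        try:
            return self[name]
        except KeyError:
            raise AttributeError(name)

    def copy(self):
        return Pack(**self)


turn = Pack(utt=['<s>', 'hello', '</s>'], speaker='usr')
turn['utt'] = turn.utt[:2]          # item assignment and attribute access both work
print(turn.speaker, turn['utt'])    # usr ['<s>', 'hello']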
Code Example #2
File: corpora.py  Project: zixufang/multiwoz-mdrg
    def _to_id_corpus(self, name, data):
        # Same as above, but keeps the per-turn belief-state (bs) and database
        # pointer (db) vectors and the dialogue key instead of an outcome.
        results = []
        for dlg in data:
            if len(dlg.dlg) < 1:
                continue
            id_dlg = []
            for turn in dlg.dlg:
                id_turn = Pack(utt=self._sent2id(turn.utt),
                               speaker=turn.speaker,
                               db=turn.db,
                               bs=turn.bs)
                id_dlg.append(id_turn)
            id_goal = self._goal2id(dlg.goal)
            results.append(Pack(dlg=id_dlg, goal=id_goal, key=dlg.key))
        return results
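Both _to_id_corpus variants defer to self._sent2id (and _goal2id / _outcome2id), which the listing does not show. A plausible sketch of the token-to-id step, assuming a plain vocabulary lookup with an unknown-word fallback (class and attribute names here are illustrative only, not the project's code):

class IdMapperSketch:
    def __init__(self, vocab, unk='<unk>'):
        # Map each vocabulary token to its index once, up front.
        self.vocab_dict = {tok: i for i, tok in enumerate(vocab)}
        self.unk_id = self.vocab_dict[unk]

    def _sent2id(self, sent):
        # Unknown tokens fall back to the <unk> id.
        return [self.vocab_dict.get(tok, self.unk_id) for tok in sent]


mapper = IdMapperSketch(['<pad>', '<unk>', '<s>', '</s>', 'hello'])
print(mapper._sent2id(['<s>', 'hello', 'world', '</s>']))  # [2, 4, 1, 3]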
Code Example #3
File: corpora.py  Project: zixufang/multiwoz-mdrg
    def _process_dialogue(self, data):
        new_dlgs = []
        all_sent_lens = []
        all_dlg_lens = []

        for key, raw_dlg in data.items():
            norm_dlg = [
                Pack(speaker=USR,
                     utt=[BOS, BOD, EOS],
                     bs=[0.0] * self.bs_size,
                     db=[0.0] * self.db_size)
            ]
            for t_id in range(len(raw_dlg['db'])):
                usr_utt = [BOS] + self.tokenize(raw_dlg['usr'][t_id]) + [EOS]
                sys_utt = [BOS] + self.tokenize(raw_dlg['sys'][t_id]) + [EOS]
                norm_dlg.append(
                    Pack(speaker=USR,
                         utt=usr_utt,
                         db=raw_dlg['db'][t_id],
                         bs=raw_dlg['bs'][t_id]))
                norm_dlg.append(
                    Pack(speaker=SYS,
                         utt=sys_utt,
                         db=raw_dlg['db'][t_id],
                         bs=raw_dlg['bs'][t_id]))
                all_sent_lens.extend([len(usr_utt), len(sys_utt)])
            # To stop dialog
            norm_dlg.append(
                Pack(speaker=USR,
                     utt=[BOS, EOD, EOS],
                     bs=[0.0] * self.bs_size,
                     db=[0.0] * self.db_size))
            # if self.config.to_learn == 'usr':
            #     norm_dlg.append(Pack(speaker=USR, utt=[BOS, EOD, EOS], bs=[0.0]*self.bs_size, db=[0.0]*self.db_size))
            all_dlg_lens.append(len(raw_dlg['db']))
            processed_goal = self._process_goal(raw_dlg['goal'])
            new_dlgs.append(Pack(dlg=norm_dlg, goal=processed_goal, key=key))

        self.logger.info(
            'Max utt len = %d, mean utt len = %.2f' %
            (np.max(all_sent_lens), float(np.mean(all_sent_lens))))
        self.logger.info('Max dlg len = %d, mean dlg len = %.2f' %
                         (np.max(all_dlg_lens), float(np.mean(all_dlg_lens))))
        return new_dlgs
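For reference, _process_dialogue expects each raw entry to carry aligned per-turn 'usr', 'sys', 'bs', and 'db' lists plus a 'goal' annotation. An illustrative, hypothetical entry; the key name and utterances are invented, and the 94/30 vector sizes are assumptions taken from the comments in the batching code further below:

raw_data = {
    'SNG0073.json': {
        'usr': ['i need a taxi to the station .'],        # user utterance, turn t
        'sys': ['what time would you like to leave ?'],   # system utterance, turn t
        'bs': [[0.0] * 94],   # one belief-state vector per turn (size assumed)
        'db': [[0.0] * 30],   # one database-pointer vector per turn (size assumed)
        'goal': {},           # goal annotation consumed by _process_goal
    },
}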
Code Example #4
    def _prepare_batch(self, selected_index):
        rows = [self.data[idx] for idx in selected_index]

        ctx_utts, ctx_lens = [], []
        out_utts, out_lens = [], []
        goals, goal_lens = [], []

        for row in rows:
            in_row, out_row, goal_row = row.context, row.response, row.goal

            # source context
            batch_ctx = []
            for turn in in_row:
                batch_ctx.append(
                    self.pad_to(self.max_utt_len, turn.utt, do_pad=True))
            ctx_utts.append(batch_ctx)
            ctx_lens.append(len(batch_ctx))

            # target response
            out_utt = [t for idx, t in enumerate(out_row.utt)]
            out_utts.append(out_utt)
            out_lens.append(len(out_utt))

            # goal
            goals.append(goal_row)
            goal_lens.append(len(goal_row))

        vec_ctx_lens = np.array(ctx_lens)  # (batch_size, ), number of turns
        max_ctx_len = np.max(vec_ctx_lens)
        vec_ctx_utts = np.zeros(
            (self.batch_size, max_ctx_len, self.max_utt_len), dtype=np.int32)
        # confs is used to add some hand-crafted features
        vec_ctx_confs = np.ones((self.batch_size, max_ctx_len),
                                dtype=np.float32)
        vec_out_lens = np.array(out_lens)  # (batch_size, ), number of tokens
        max_out_len = np.max(vec_out_lens)
        vec_out_utts = np.zeros((self.batch_size, max_out_len), dtype=np.int32)

        max_goal_len, min_goal_len = max(goal_lens), min(goal_lens)
        if max_goal_len != min_goal_len or max_goal_len != 6:
            print('FATAL ERROR!')
            exit(-1)
        self.goal_len = max_goal_len
        vec_goals = np.zeros((self.batch_size, self.goal_len), dtype=np.int32)

        for b_id in range(self.batch_size):
            vec_ctx_utts[b_id, :vec_ctx_lens[b_id], :] = ctx_utts[b_id]
            vec_out_utts[b_id, :vec_out_lens[b_id]] = out_utts[b_id]
            vec_goals[b_id, :] = goals[b_id]

        return Pack(context_lens=vec_ctx_lens,
                    contexts=vec_ctx_utts,
                    context_confs=vec_ctx_confs,
                    output_lens=vec_out_lens,
                    outputs=vec_out_utts,
                    goals=vec_goals)
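_prepare_batch calls self.pad_to to clip each context utterance to max_utt_len and zero-pad it, while the flatten_dialog examples further below call it with do_pad=False to truncate only. A possible sketch inferred from those call sites (an assumption, not the project's actual helper):

def pad_to(max_len, tokens, do_pad=True):
    # Truncate sequences longer than max_len; zero-pad shorter ones only
    # when do_pad is True, otherwise return them unchanged.
    if len(tokens) >= max_len:
        return tokens[:max_len]
    if do_pad:
        return tokens + [0] * (max_len - len(tokens))
    return tokens


print(pad_to(5, [7, 8, 9]))                 # [7, 8, 9, 0, 0]
print(pad_to(5, [7, 8, 9], do_pad=False))   # [7, 8, 9]
print(pad_to(2, [7, 8, 9, 10]))             # [7, 8]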
Code Example #5
File: corpora.py  Project: zixufang/multiwoz-mdrg
        def transform(token_list):
            usr, sys = [], []
            ptr = 0
            while ptr < len(token_list):
                turn_ptr = ptr
                turn_list = []
                while True:
                    cur_token = token_list[turn_ptr]
                    turn_list.append(cur_token)
                    turn_ptr += 1
                    if cur_token == EOS:
                        ptr = turn_ptr
                        break
                all_sent_lens.append(len(turn_list))
                if turn_list[0] == USR:
                    usr.append(Pack(utt=turn_list, speaker=USR))
                elif turn_list[0] == SYS:
                    sys.append(Pack(utt=turn_list, speaker=SYS))
                else:
                    raise ValueError('Invalid speaker')

            all_dlg_lens.append(len(usr) + len(sys))
            return usr, sys
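The inner while-loop simply cuts the flat token stream after every EOS marker and routes each turn by its leading speaker tag. A standalone, equivalent sketch for well-formed input (every turn ends in EOS), using assumed marker values since the project's USR / SYS / EOS constants are not shown in this listing:

USR, SYS, EOS = 'YOU:', 'THEM:', '<eos>'  # assumed values for illustration

def split_turns(token_list):
    # Group a flat token stream into turns, cutting after each EOS.
    turns, turn = [], []
    for tok in token_list:
        turn.append(tok)
        if tok == EOS:
            turns.append(turn)
            turn = []
    return turns


print(split_turns([USR, 'hi', EOS, SYS, 'hello', 'there', EOS]))
# [['YOU:', 'hi', '<eos>'], ['THEM:', 'hello', 'there', '<eos>']]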
Code Example #6
    def flatten_dialog(self, data, backward_size):
        # Turn each dialogue into (context, response) pairs: every system turn
        # becomes a response, with up to backward_size preceding turns as context.
        results = []
        for dlg in data:
            goal = dlg.goal
            for i in range(1, len(dlg.dlg)):
                if dlg.dlg[i].speaker == USR:
                    continue
                e_idx = i
                s_idx = max(0, e_idx - backward_size)
                response = dlg.dlg[i].copy()
                response['utt'] = self.pad_to(self.max_utt_len,
                                              response.utt,
                                              do_pad=False)
                context = []
                for turn in dlg.dlg[s_idx:e_idx]:
                    turn['utt'] = self.pad_to(self.max_utt_len,
                                              turn.utt,
                                              do_pad=False)
                    context.append(turn)
                results.append(
                    Pack(context=context, response=response, goal=goal))
        return results
Code Example #7
    def flatten_dialog(self, data, backward_size):
        # As above, but also records the dialogue key, a flat index for every
        # (context, response) pair, and the per-dialogue groups of those indices.
        results = []
        indexes = []
        batch_indexes = []
        resp_set = set()
        for dlg in data:
            goal = dlg.goal
            key = dlg.key
            batch_index = []
            for i in range(1, len(dlg.dlg)):
                if dlg.dlg[i].speaker == USR:
                    continue
                e_idx = i
                s_idx = max(0, e_idx - backward_size)
                response = dlg.dlg[i].copy()
                response['utt'] = self.pad_to(self.max_utt_len,
                                              response.utt,
                                              do_pad=False)
                resp_set.add(json.dumps(response.utt))
                context = []
                for turn in dlg.dlg[s_idx:e_idx]:
                    turn['utt'] = self.pad_to(self.max_utt_len,
                                              turn.utt,
                                              do_pad=False)
                    context.append(turn)
                results.append(
                    Pack(context=context,
                         response=response,
                         goal=goal,
                         key=key))
                indexes.append(len(indexes))
                batch_index.append(indexes[-1])
            if len(batch_index) > 0:
                batch_indexes.append(batch_index)
        print("Unique resp {}".format(len(resp_set)))
        return results, indexes, batch_indexes
Code Example #8
File: corpora.py  Project: zixufang/multiwoz-mdrg
    def _process_dialogue(self, data):
        def transform(token_list):
            usr, sys = [], []
            ptr = 0
            while ptr < len(token_list):
                turn_ptr = ptr
                turn_list = []
                while True:
                    cur_token = token_list[turn_ptr]
                    turn_list.append(cur_token)
                    turn_ptr += 1
                    if cur_token == EOS:
                        ptr = turn_ptr
                        break
                all_sent_lens.append(len(turn_list))
                if turn_list[0] == USR:
                    usr.append(Pack(utt=turn_list, speaker=USR))
                elif turn_list[0] == SYS:
                    sys.append(Pack(utt=turn_list, speaker=SYS))
                else:
                    raise ValueError('Invalid speaker')

            all_dlg_lens.append(len(usr) + len(sys))
            return usr, sys

        new_dlg = []
        all_sent_lens = []
        all_dlg_lens = []
        for raw_dlg in data:
            raw_words = raw_dlg.split()

            # process dialogue text
            cur_dlg = []
            words = raw_words[raw_words.index('<dialogue>') +
                              1:raw_words.index('</dialogue>')]
            words += [EOS]
            usr_first = True
            if words[0] == SYS:
                words = [USR, BOD, EOS] + words
                usr_first = True
            elif words[0] == USR:
                words = [SYS, BOD, EOS] + words
                usr_first = False
            else:
                print('FATAL ERROR!!! ({})'.format(words))
                exit(-1)
            usr_utts, sys_utts = transform(words)
            for usr_turn, sys_turn in zip(usr_utts, sys_utts):
                if usr_first:
                    cur_dlg.append(usr_turn)
                    cur_dlg.append(sys_turn)
                else:
                    cur_dlg.append(sys_turn)
                    cur_dlg.append(usr_turn)
            if len(usr_utts) - len(sys_utts) == 1:
                cur_dlg.append(usr_utts[-1])
            elif len(sys_utts) - len(usr_utts) == 1:
                cur_dlg.append(sys_utts[-1])

            # process goal (6 digits)
            # FIXME FATAL ERROR HERE !!!
            cur_goal = raw_words[raw_words.index('<partner_input>') +
                                 1:raw_words.index('</partner_input>')]
            # cur_goal = raw_words[raw_words.index('<input>')+1: raw_words.index('</input>')]
            if len(cur_goal) != 6:
                print('FATAL ERROR!!! ({})'.format(cur_goal))
                exit(-1)

            # process outcome (6 tokens)
            cur_out = raw_words[raw_words.index('<output>') +
                                1:raw_words.index('</output>')]
            if len(cur_out) != 6:
                print('FATAL ERROR!!! ({})'.format(cur_out))
                exit(-1)

            new_dlg.append(Pack(dlg=cur_dlg, goal=cur_goal, out=cur_out))

        print('Max utt len = %d, mean utt len = %.2f' %
              (np.max(all_sent_lens), float(np.mean(all_sent_lens))))
        print('Max dlg len = %d, mean dlg len = %.2f' %
              (np.max(all_dlg_lens), float(np.mean(all_dlg_lens))))
        return new_dlg
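This parser expects Deal-or-No-Deal style lines with <dialogue>, <output>, and <partner_input> sections. A hypothetical input line and the goal/outcome extraction it drives; the speaker tags, <eos> marker, and all concrete values are assumptions based only on the markers the code above searches for:

raw_dlg = ("<input> 1 4 4 1 1 2 </input> "
           "<dialogue> THEM: i want the hats <eos> YOU: deal <eos> "
           "THEM: <selection> </dialogue> "
           "<output> item0=0 item1=4 item2=0 item0=1 item1=0 item2=1 </output> "
           "<partner_input> 1 0 4 2 1 2 </partner_input>")
raw_words = raw_dlg.split()
cur_goal = raw_words[raw_words.index('<partner_input>') + 1:
                     raw_words.index('</partner_input>')]
cur_out = raw_words[raw_words.index('<output>') + 1:raw_words.index('</output>')]
print(cur_goal)  # ['1', '0', '4', '2', '1', '2']  -- the 6 goal tokens checked above
print(cur_out)   # six item assignments, also length-checked above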
Code Example #9
    def _prepare_batch(self, selected_index):
        rows = [self.data[idx] for idx in selected_index]

        ctx_utts, ctx_lens = [], []
        out_utts, out_lens = [], []

        out_bs, out_db = [], []
        goals, goal_lens = [], [[] for _ in range(len(self.domains))]
        keys = []

        for row in rows:
            in_row, out_row, goal_row = row.context, row.response, row.goal

            # source context
            keys.append(row.key)
            batch_ctx = []
            for turn in in_row:
                batch_ctx.append(
                    self.pad_to(self.max_utt_len, turn.utt, do_pad=True))
            ctx_utts.append(batch_ctx)
            ctx_lens.append(len(batch_ctx))

            # target response
            out_utt = [t for idx, t in enumerate(out_row.utt)]
            out_utts.append(out_utt)
            out_lens.append(len(out_utt))

            out_bs.append(out_row.bs)
            out_db.append(out_row.db)

            # goal
            goals.append(goal_row)
            for i, d in enumerate(self.domains):
                goal_lens[i].append(len(goal_row[d]))

        batch_size = len(ctx_lens)
        vec_ctx_lens = np.array(ctx_lens)  # (batch_size, ), number of turns
        max_ctx_len = np.max(vec_ctx_lens)
        vec_ctx_utts = np.zeros((batch_size, max_ctx_len, self.max_utt_len),
                                dtype=np.int32)
        vec_out_bs = np.array(out_bs)  # (batch_size, 94)
        vec_out_db = np.array(out_db)  # (batch_size, 30)
        vec_out_lens = np.array(out_lens)  # (batch_size, ), number of tokens
        max_out_len = np.max(vec_out_lens)
        vec_out_utts = np.zeros((batch_size, max_out_len), dtype=np.int32)

        max_goal_lens = [max(ls) for ls in goal_lens]
        min_goal_lens = [min(ls) for ls in goal_lens]
        if max_goal_lens != min_goal_lens:
            print('Fatal Error!')
            exit(-1)
        self.goal_lens = max_goal_lens
        vec_goals_list = [
            np.zeros((batch_size, l), dtype=np.float32) for l in self.goal_lens
        ]

        for b_id in range(batch_size):
            vec_ctx_utts[b_id, :vec_ctx_lens[b_id], :] = ctx_utts[b_id]
            vec_out_utts[b_id, :vec_out_lens[b_id]] = out_utts[b_id]
            for i, d in enumerate(self.domains):
                vec_goals_list[i][b_id, :] = goals[b_id][d]

        return Pack(
            context_lens=vec_ctx_lens,  # (batch_size, )
            contexts=vec_ctx_utts,  # (batch_size, max_ctx_len, max_utt_len)
            output_lens=vec_out_lens,  # (batch_size, )
            outputs=vec_out_utts,  # (batch_size, max_out_len)
            bs=vec_out_bs,  # (batch_size, 94)
            db=vec_out_db,  # (batch_size, 30)
            goals_list=vec_goals_list,  # 7 * (batch_size, bow_len); bow_len differs per domain
            keys=keys)