def _to_id_corpus(self, name, data):
    results = []
    for dlg in data:
        if len(dlg.dlg) < 1:
            continue
        id_dlg = []
        for turn in dlg.dlg:
            id_turn = Pack(utt=self._sent2id(turn.utt),
                           speaker=turn.speaker)
            id_dlg.append(id_turn)
        id_goal = self._goal2id(dlg.goal)
        id_out = self._outcome2id(dlg.out)
        results.append(Pack(dlg=id_dlg, goal=id_goal, out=id_out))
    return results

def _to_id_corpus(self, name, data):
    results = []
    for dlg in data:
        if len(dlg.dlg) < 1:
            continue
        id_dlg = []
        for turn in dlg.dlg:
            id_turn = Pack(utt=self._sent2id(turn.utt),
                           speaker=turn.speaker,
                           db=turn.db, bs=turn.bs)
            id_dlg.append(id_turn)
        id_goal = self._goal2id(dlg.goal)
        results.append(Pack(dlg=id_dlg, goal=id_goal, key=dlg.key))
    return results

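# NOTE: Pack is the small dict-based container used throughout these loaders.
# The sketch below is an assumption inferred from how Pack is used in this file
# (keyword construction, attribute reads such as turn.utt, item writes such as
# turn['utt'] = ..., and .copy()); the upstream class may differ in detail.
class Pack(dict):
    """A dict whose keys can also be read as attributes."""

    def __getattr__(self, name):
        try:
            return self[name]
        except KeyError:
            raise AttributeError(name)

    def copy(self):
        # Return a Pack (not a plain dict), shallow-copying list values so a
        # caller can overwrite e.g. response['utt'] on the copy without
        # touching the original turn.
        new_pack = Pack()
        for k, v in self.items():
            new_pack[k] = list(v) if isinstance(v, list) else v
        return new_pack
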
def _process_dialogue(self, data):
    new_dlgs = []
    all_sent_lens = []
    all_dlg_lens = []
    for key, raw_dlg in data.items():
        norm_dlg = [Pack(speaker=USR, utt=[BOS, BOD, EOS],
                         bs=[0.0] * self.bs_size,
                         db=[0.0] * self.db_size)]
        for t_id in range(len(raw_dlg['db'])):
            usr_utt = [BOS] + self.tokenize(raw_dlg['usr'][t_id]) + [EOS]
            sys_utt = [BOS] + self.tokenize(raw_dlg['sys'][t_id]) + [EOS]
            norm_dlg.append(Pack(speaker=USR, utt=usr_utt,
                                 db=raw_dlg['db'][t_id],
                                 bs=raw_dlg['bs'][t_id]))
            norm_dlg.append(Pack(speaker=SYS, utt=sys_utt,
                                 db=raw_dlg['db'][t_id],
                                 bs=raw_dlg['bs'][t_id]))
            all_sent_lens.extend([len(usr_utt), len(sys_utt)])
        # To stop dialog
        norm_dlg.append(Pack(speaker=USR, utt=[BOS, EOD, EOS],
                             bs=[0.0] * self.bs_size,
                             db=[0.0] * self.db_size))
        # if self.config.to_learn == 'usr':
        #     norm_dlg.append(Pack(speaker=USR, utt=[BOS, EOD, EOS], bs=[0.0]*self.bs_size, db=[0.0]*self.db_size))
        all_dlg_lens.append(len(raw_dlg['db']))
        processed_goal = self._process_goal(raw_dlg['goal'])
        new_dlgs.append(Pack(dlg=norm_dlg, goal=processed_goal, key=key))

    self.logger.info('Max utt len = %d, mean utt len = %.2f' % (
        np.max(all_sent_lens), float(np.mean(all_sent_lens))))
    self.logger.info('Max dlg len = %d, mean dlg len = %.2f' % (
        np.max(all_dlg_lens), float(np.mean(all_dlg_lens))))
    return new_dlgs

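# Illustrative only: _process_dialogue above assumes `data` maps a dialogue key
# to parallel per-turn lists. The field names come from the indexing in the code
# ('usr', 'sys', 'db', 'bs', 'goal'); the vector sizes follow the
# (batch_size, 94) / (batch_size, 30) comments in _prepare_batch further below,
# and the concrete values here are made up.
_example_raw_dlg = {
    'SNG0073.json': {
        'usr': ['i need a taxi to the station .'],       # user utterance per turn
        'sys': ['what time would you like to leave ?'],  # system utterance per turn
        'db':  [[0.0] * 30],                             # db pointer vector per turn
        'bs':  [[0.0] * 94],                             # belief state vector per turn
        'goal': {},                                      # raw goal, consumed by _process_goal
    }
}
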
def _prepare_batch(self, selected_index):
    rows = [self.data[idx] for idx in selected_index]
    ctx_utts, ctx_lens = [], []
    out_utts, out_lens = [], []
    goals, goal_lens = [], []
    for row in rows:
        in_row, out_row, goal_row = row.context, row.response, row.goal
        # source context
        batch_ctx = []
        for turn in in_row:
            batch_ctx.append(self.pad_to(self.max_utt_len, turn.utt, do_pad=True))
        ctx_utts.append(batch_ctx)
        ctx_lens.append(len(batch_ctx))
        # target response
        out_utt = [t for idx, t in enumerate(out_row.utt)]
        out_utts.append(out_utt)
        out_lens.append(len(out_utt))
        # goal
        goals.append(goal_row)
        goal_lens.append(len(goal_row))

    vec_ctx_lens = np.array(ctx_lens)  # (batch_size, ), number of turns
    max_ctx_len = np.max(vec_ctx_lens)
    vec_ctx_utts = np.zeros((self.batch_size, max_ctx_len, self.max_utt_len), dtype=np.int32)
    # confs is used to add some hand-crafted features
    vec_ctx_confs = np.ones((self.batch_size, max_ctx_len), dtype=np.float32)
    vec_out_lens = np.array(out_lens)  # (batch_size, ), number of tokens
    max_out_len = np.max(vec_out_lens)
    vec_out_utts = np.zeros((self.batch_size, max_out_len), dtype=np.int32)

    max_goal_len, min_goal_len = max(goal_lens), min(goal_lens)
    if max_goal_len != min_goal_len or max_goal_len != 6:
        print('FATAL ERROR!')
        exit(-1)
    self.goal_len = max_goal_len
    vec_goals = np.zeros((self.batch_size, self.goal_len), dtype=np.int32)

    for b_id in range(self.batch_size):
        vec_ctx_utts[b_id, :vec_ctx_lens[b_id], :] = ctx_utts[b_id]
        vec_out_utts[b_id, :vec_out_lens[b_id]] = out_utts[b_id]
        vec_goals[b_id, :] = goals[b_id]

    return Pack(context_lens=vec_ctx_lens,
                contexts=vec_ctx_utts,
                context_confs=vec_ctx_confs,
                output_lens=vec_out_lens,
                outputs=vec_out_utts,
                goals=vec_goals)

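# pad_to() is not shown in this section. Below is a minimal sketch consistent
# with how it is called here (truncate to max_len; right-pad with the 0 / PAD id
# only when do_pad=True); the real helper may handle the final token differently.
def pad_to(self, max_len, tokens, do_pad=True):
    if len(tokens) >= max_len:
        # keep the last token (e.g. EOS) when truncating
        return tokens[0:max_len - 1] + [tokens[-1]]
    elif do_pad:
        return tokens + [0] * (max_len - len(tokens))
    else:
        return tokens
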
def flatten_dialog(self, data, backward_size):
    results = []
    for dlg in data:
        goal = dlg.goal
        for i in range(1, len(dlg.dlg)):
            if dlg.dlg[i].speaker == USR:
                continue
            e_idx = i
            s_idx = max(0, e_idx - backward_size)
            response = dlg.dlg[i].copy()
            response['utt'] = self.pad_to(self.max_utt_len, response.utt, do_pad=False)
            context = []
            for turn in dlg.dlg[s_idx:e_idx]:
                turn['utt'] = self.pad_to(self.max_utt_len, turn.utt, do_pad=False)
                context.append(turn)
            results.append(Pack(context=context, response=response, goal=goal))
    return results

def flatten_dialog(self, data, backward_size):
    results = []
    indexes = []
    batch_indexes = []
    resp_set = set()
    for dlg in data:
        goal = dlg.goal
        key = dlg.key
        batch_index = []
        for i in range(1, len(dlg.dlg)):
            if dlg.dlg[i].speaker == USR:
                continue
            e_idx = i
            s_idx = max(0, e_idx - backward_size)
            response = dlg.dlg[i].copy()
            response['utt'] = self.pad_to(self.max_utt_len, response.utt, do_pad=False)
            resp_set.add(json.dumps(response.utt))
            context = []
            for turn in dlg.dlg[s_idx:e_idx]:
                turn['utt'] = self.pad_to(self.max_utt_len, turn.utt, do_pad=False)
                context.append(turn)
            results.append(Pack(context=context, response=response, goal=goal, key=key))
            indexes.append(len(indexes))
            batch_index.append(indexes[-1])
        if len(batch_index) > 0:
            batch_indexes.append(batch_index)
    print("Unique resp {}".format(len(resp_set)))
    return results, indexes, batch_indexes

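# Reading of the return values above (added note, not part of the original code):
# - results[i] is one (context, response) training example,
# - indexes[i] == i simply enumerates those examples for random batching,
# - batch_indexes[j] lists the example indexes belonging to dialogue j, so a
#   caller can alternatively keep a whole dialogue inside one batch.
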
def _process_dialogue(self, data):
    def transform(token_list):
        usr, sys = [], []
        ptr = 0
        while ptr < len(token_list):
            turn_ptr = ptr
            turn_list = []
            while True:
                cur_token = token_list[turn_ptr]
                turn_list.append(cur_token)
                turn_ptr += 1
                if cur_token == EOS:
                    ptr = turn_ptr
                    break
            all_sent_lens.append(len(turn_list))
            if turn_list[0] == USR:
                usr.append(Pack(utt=turn_list, speaker=USR))
            elif turn_list[0] == SYS:
                sys.append(Pack(utt=turn_list, speaker=SYS))
            else:
                raise ValueError('Invalid speaker')
        all_dlg_lens.append(len(usr) + len(sys))
        return usr, sys

    new_dlg = []
    all_sent_lens = []
    all_dlg_lens = []
    for raw_dlg in data:
        raw_words = raw_dlg.split()

        # process dialogue text
        cur_dlg = []
        words = raw_words[raw_words.index('<dialogue>') + 1:raw_words.index('</dialogue>')]
        words += [EOS]
        usr_first = True
        if words[0] == SYS:
            words = [USR, BOD, EOS] + words
            usr_first = True
        elif words[0] == USR:
            words = [SYS, BOD, EOS] + words
            usr_first = False
        else:
            print('FATAL ERROR!!! ({})'.format(words))
            exit(-1)
        usr_utts, sys_utts = transform(words)
        for usr_turn, sys_turn in zip(usr_utts, sys_utts):
            if usr_first:
                cur_dlg.append(usr_turn)
                cur_dlg.append(sys_turn)
            else:
                cur_dlg.append(sys_turn)
                cur_dlg.append(usr_turn)
        if len(usr_utts) - len(sys_utts) == 1:
            cur_dlg.append(usr_utts[-1])
        elif len(sys_utts) - len(usr_utts) == 1:
            cur_dlg.append(sys_utts[-1])

        # process goal (6 digits)
        # FIXME FATAL ERROR HERE !!!
        cur_goal = raw_words[raw_words.index('<partner_input>') + 1:raw_words.index('</partner_input>')]
        # cur_goal = raw_words[raw_words.index('<input>')+1: raw_words.index('</input>')]
        if len(cur_goal) != 6:
            print('FATAL ERROR!!! ({})'.format(cur_goal))
            exit(-1)

        # process outcome (6 tokens)
        cur_out = raw_words[raw_words.index('<output>') + 1:raw_words.index('</output>')]
        if len(cur_out) != 6:
            print('FATAL ERROR!!! ({})'.format(cur_out))
            exit(-1)

        new_dlg.append(Pack(dlg=cur_dlg, goal=cur_goal, out=cur_out))

    print('Max utt len = %d, mean utt len = %.2f' % (
        np.max(all_sent_lens), float(np.mean(all_sent_lens))))
    print('Max dlg len = %d, mean dlg len = %.2f' % (
        np.max(all_dlg_lens), float(np.mean(all_dlg_lens))))
    return new_dlg

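# Illustrative only: the raw lines parsed by _process_dialogue above are expected
# to contain the tag pairs the code indexes (<dialogue>...</dialogue>,
# <partner_input>...</partner_input>, <output>...</output>), with each turn
# starting with the USR or SYS speaker token and ending with EOS, and with six
# tokens between the goal/outcome tags. The tokens below are placeholders, not
# real data.
_example_raw_line = (
    '<input> 1 4 4 1 1 2 </input> '
    '<dialogue> USR_TOKEN ... EOS SYS_TOKEN ... EOS </dialogue> '
    '<output> o1 o2 o3 o4 o5 o6 </output> '
    '<partner_input> 1 0 4 2 1 2 </partner_input>'
)
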
def _prepare_batch(self, selected_index):
    rows = [self.data[idx] for idx in selected_index]
    ctx_utts, ctx_lens = [], []
    out_utts, out_lens = [], []
    out_bs, out_db = [], []
    goals, goal_lens = [], [[] for _ in range(len(self.domains))]
    keys = []
    for row in rows:
        in_row, out_row, goal_row = row.context, row.response, row.goal
        # source context
        keys.append(row.key)
        batch_ctx = []
        for turn in in_row:
            batch_ctx.append(self.pad_to(self.max_utt_len, turn.utt, do_pad=True))
        ctx_utts.append(batch_ctx)
        ctx_lens.append(len(batch_ctx))
        # target response
        out_utt = [t for idx, t in enumerate(out_row.utt)]
        out_utts.append(out_utt)
        out_lens.append(len(out_utt))
        out_bs.append(out_row.bs)
        out_db.append(out_row.db)
        # goal
        goals.append(goal_row)
        for i, d in enumerate(self.domains):
            goal_lens[i].append(len(goal_row[d]))

    batch_size = len(ctx_lens)
    vec_ctx_lens = np.array(ctx_lens)  # (batch_size, ), number of turns
    max_ctx_len = np.max(vec_ctx_lens)
    vec_ctx_utts = np.zeros((batch_size, max_ctx_len, self.max_utt_len), dtype=np.int32)
    vec_out_bs = np.array(out_bs)  # (batch_size, 94)
    vec_out_db = np.array(out_db)  # (batch_size, 30)
    vec_out_lens = np.array(out_lens)  # (batch_size, ), number of tokens
    max_out_len = np.max(vec_out_lens)
    vec_out_utts = np.zeros((batch_size, max_out_len), dtype=np.int32)

    max_goal_lens, min_goal_lens = [max(ls) for ls in goal_lens], [min(ls) for ls in goal_lens]
    if max_goal_lens != min_goal_lens:
        print('Fatal Error!')
        exit(-1)
    self.goal_lens = max_goal_lens
    vec_goals_list = [np.zeros((batch_size, l), dtype=np.float32) for l in self.goal_lens]

    for b_id in range(batch_size):
        vec_ctx_utts[b_id, :vec_ctx_lens[b_id], :] = ctx_utts[b_id]
        vec_out_utts[b_id, :vec_out_lens[b_id]] = out_utts[b_id]
        for i, d in enumerate(self.domains):
            vec_goals_list[i][b_id, :] = goals[b_id][d]

    return Pack(context_lens=vec_ctx_lens,    # (batch_size, )
                contexts=vec_ctx_utts,        # (batch_size, max_ctx_len, max_utt_len)
                output_lens=vec_out_lens,     # (batch_size, )
                outputs=vec_out_utts,         # (batch_size, max_out_len)
                bs=vec_out_bs,                # (batch_size, 94)
                db=vec_out_db,                # (batch_size, 30)
                goals_list=vec_goals_list,    # 7*(batch_size, bow_len), bow_len differs w.r.t. domain
                keys=keys)