def expert_generator(self, target_size=10000,
                     save_path='./data/expert_dialog_art.pt'):
    """Collect turns of successful policy roll-outs and save them to disk.

    Dialogs are simulated in ``self.env_list[0]`` with randomly drawn seeds.
    Every turn is stored as ``(s_vec, a, next_s_vec, r, done)``.  A dialog's
    turns are kept only when the reward of its last simulated turn is
    positive.  Sampling stops once at least ``target_size`` turns have been
    kept, and the list is written with ``torch.save``.

    :param target_size: minimum number of stored turns before stopping
    :param save_path: destination file passed to ``torch.save``
    :return: None
    """
    env = self.env_list[0]
    traj_len = 40
    success_dialog = []
    while len(success_dialog) < target_size:
        seed = np.random.randint(2000000)
        s = env.reset(seed)
        print('seed', seed)
        print('goal', env.goal.domain_goals)
        print('usr', s['user_action'])

        dialog_turn = []
        for _ in range(traj_len):
            s_vec = torch.Tensor(
                state_vectorize(s, env.cfg, env.db)).to(device=DEVICE)
            # mode with policy during evaluation
            a = self.policy.select_action(s_vec, False)
            next_s, done = env.step(s, a.cpu())
            next_s_vec = torch.Tensor(
                state_vectorize(next_s, env.cfg, env.db)).to(device=DEVICE)
            # NOTE(review): the reward is computed from the state *before*
            # the step (``s``), not from ``next_s`` -- confirm this is intended.
            r = self.reward_human(s, done)
            # Deep-copy so the stored turn is independent of later changes.
            dialog_turn.append(copy.deepcopy((s_vec, a, next_s_vec, r, done)))
            s = next_s
            if done:
                break

        # ``r`` is the reward of the last simulated turn of this dialog.
        if r > 0:
            success_dialog += dialog_turn
            logging.info("success dialog: {}".format(len(success_dialog)))
    torch.save(success_dialog, save_path)
def create_dataset(self, part, file_dir, cfg, db):
    """Vectorize one data split and save it with ``torch.save``.

    Only records whose turn index is odd are used.  For each of them the
    state, action, domain and action-sequence vectors are stored, together
    with a next-state vector: the record two positions later for a
    non-terminal turn, or a synthesized closing record for a terminal turn.

    :param part: key selecting the split in ``self.data`` / ``self.goal``
    :param file_dir: output path handed to ``torch.save``
    :param cfg: configuration object forwarded to the vectorizers
    :param db: database object forwarded to ``state_vectorize``
    :return: None; saves ``(s, a, next_s, d_m, a_seq)``
    """
    records = self.data[part]
    session_goals = self.goal[part]
    states, actions, successors = [], [], []
    domains, action_seqs = [], []

    def to_state(record):
        return torch.Tensor(state_vectorize(record, cfg, db, True))

    for position, record in enumerate(records):
        meta = record['others']
        if meta['turn'] % 2 == 0:
            continue

        # The goal of the session is attached to the record in place.
        record['user_goal'] = session_goals[meta['session_id']]
        states.append(to_state(record))
        actions.append(torch.Tensor(action_vectorize(record, cfg)))
        domains.append(torch.Tensor(domain_vectorize(record, cfg)))
        action_seqs.append(torch.Tensor(action_seq(record, cfg)))

        if int(meta['terminal']):
            # No later record exists for this dialog: build the closing
            # state from a copy of the current record.
            closing = deepcopy(record)
            closing['others']['turn'] = -1
            closing['user_action'] = {}
            closing['last_sys_action'] = closing['sys_action']
            closing['sys_action'] = {}
            closing['belief_state'] = closing['next_belief_state']
            successors.append(to_state(closing))
        else:
            successors.append(to_state(records[position + 2]))

    torch.save((states, actions, successors, domains, action_seqs), file_dir)
def evaluate_with_agenda(self, env, N):
    """
    Evaluate ``self.policy_sys`` inside a caller-supplied environment.

    Unlike the evaluation that uses the internal environment list, the
    dialogs here are rolled out in the ``env`` passed in.  ``N`` dialogs are
    run and the average turn count, match rate, inform recall/F1 and success
    rate are logged.

    :param env: environment instance used for the roll-outs
    :param N: number of dialogs to run
    :return: None
    """
    logging.info('eval: agenda 2 system')
    max_steps = 40
    turn_tot, inform_tot, match_tot, success_tot = [], [], [], []

    def system_action(state):
        vec = torch.Tensor(
            state_vectorize(state, env.cfg, env.db)).to(device=DEVICE)
        # mode with policy during evaluation
        return self.policy_sys.select_action(vec, False)

    for seed in range(N):
        # The seed is only printed; the environment is reset without it.
        s = env.reset()
        print('seed', seed)
        print('goal', env.goal.domain_goals)
        print('usr', s['user_action'])

        for _ in range(max_steps):
            # Presumably env.step already covers the user's reply and the
            # resulting state change (per the original author's note).
            s, done = env.step(s, system_action(s).cpu())
            print('sys', s['sys_action'])
            print('usr', s['user_action'])
            if done:
                break

        # One last system action after the loop, applied via the belief update.
        s = env.update_belief_sys(s, system_action(s).cpu())
        print('sys', s['sys_action'])

        assert (env.time_step % 2 == 0)
        turn_tot.append(env.time_step // 2)
        match_tot += env.evaluator.match_rate(aggregate=False)
        inform_tot.append(env.evaluator.inform_F1(aggregate=False))

        print('turn', env.time_step // 2)
        match_session = env.evaluator.match_rate()
        print('match', match_session)
        inform_session = env.evaluator.inform_F1()
        print('inform', inform_session)

        # Success: neither score contradicts it and at least one equals 1.
        if match_session == 1:
            succeeded = (inform_session[1] == 1
                         or inform_session[1] is None)
        elif match_session is None:
            succeeded = inform_session[1] == 1
        else:
            succeeded = False
        flag = 1 if succeeded else 0
        print('success', flag)
        success_tot.append(flag)

    logging.info('turn {}'.format(np.mean(turn_tot)))
    logging.info('match {}'.format(np.mean(match_tot)))
    TP, FP, FN = np.sum(inform_tot, 0)
    prec = TP / (TP + FP)
    rec = TP / (TP + FN)
    F1 = 2 * prec * rec / (prec + rec)
    logging.info('inform rec {}, F1 {}'.format(rec, F1))
    logging.info('success {}'.format(np.mean(success_tot)))
def create_dataset_global(self, part, file_dir, data_dir, cfg, db):
    """Build and save the joint user/system dataset with global rewards.

    Records with an even turn index go through the user branch and records
    with an odd index through the system branch.  For both sides the state
    vector and its successor are stored.  A reward and a terminal flag are
    stored once per system turn:

    * terminal system turn: ``20`` if ``evaluator.task_success()`` else ``-5``
    * other system turns: ``5`` if the current domain is set and
      ``evaluator.domain_success`` holds for it, else ``-1``

    :param part: key selecting the split in ``self.data`` / ``self.goal``
    :param file_dir: output path handed to ``torch.save``
    :param data_dir: directory passed to ``MultiWozEvaluator``
    :param cfg: configuration object forwarded to the vectorizers
    :param db: database object forwarded to ``state_vectorize``
    :return: None; saves
        ``(s_usr, s_sys, r_g, next_s_usr, next_s_sys, t)``
    """
    datas = self.data[part]
    goals = self.goal[part]
    s_usr, s_sys, r_g, next_s_usr, next_s_sys, t = [], [], [], [], [], []
    evaluator = MultiWozEvaluator(data_dir)

    for idx, turn_data in enumerate(datas):
        others = turn_data['others']
        turn_no = others['turn']

        if turn_no % 2 == 0:
            if turn_no == 0:
                # First turn of a session: register its goal.
                current_goal = goals[others['session_id']]
                evaluator.add_goal(current_goal)

            if others['change'] and evaluator.cur_domain:
                # Promote the 'final' constraints of the current domain
                # into the goal itself, then drop the 'final' entry.
                domain_goal = current_goal[evaluator.cur_domain]
                if 'final' in domain_goal:
                    for key, val in domain_goal['final'].items():
                        domain_goal[key] = val
                    del domain_goal['final']

            turn_data['user_goal'] = deepcopy(current_goal)
            usr_vec = torch.Tensor(
                state_vectorize_user(turn_data, cfg, evaluator.cur_domain))
            # The current state is the successor of the previous user turn.
            # (Appending s_usr[-1] before storing the current state would
            # pair each state with itself instead of with its successor.)
            if turn_no != 0:
                next_s_usr.append(usr_vec)
            s_usr.append(usr_vec)
            evaluator.add_usr_da(turn_data['trg_user_action'])

            if others['terminal']:
                # Synthesize the closing user state from the last exchange.
                next_turn_data = deepcopy(turn_data)
                next_turn_data['others']['turn'] = -1
                next_turn_data['user_action'] = turn_data['trg_user_action']
                next_turn_data['sys_action'] = datas[idx + 1]['trg_sys_action']
                next_turn_data['trg_user_action'] = {}
                next_turn_data['goal_state'] = \
                    datas[idx + 1]['final_goal_state']
                next_s_usr.append(torch.Tensor(
                    state_vectorize_user(next_turn_data, cfg,
                                         evaluator.cur_domain)))
        else:
            sys_vec = torch.Tensor(state_vectorize(turn_data, cfg, db, True))
            # Same alignment as on the user side: the current state closes
            # the transition opened by the previous system turn.
            if turn_no != 1:
                next_s_sys.append(sys_vec)
            s_sys.append(sys_vec)
            evaluator.add_sys_da(turn_data['trg_sys_action'])

            if others['terminal']:
                next_turn_data = deepcopy(turn_data)
                next_turn_data['others']['turn'] = -1
                next_turn_data['user_action'] = {}
                next_turn_data['sys_action'] = turn_data['trg_sys_action']
                next_turn_data['trg_sys_action'] = {}
                next_turn_data['belief_state'] = \
                    turn_data['final_belief_state']
                next_s_sys.append(torch.Tensor(
                    state_vectorize(next_turn_data, cfg, db, True)))
                r_g.append(20 if evaluator.task_success() else -5)
                t.append(1)
            else:
                domain_done = (evaluator.cur_domain
                               and evaluator.domain_success(
                                   evaluator.cur_domain))
                r_g.append(5 if domain_done else -1)
                t.append(0)

    torch.save((s_usr, s_sys, r_g, next_s_usr, next_s_sys, t), file_dir)
def create_dataset_sys(self, part, file_dir, data_dir, cfg, db):
    """Build and save the system-side dataset ``(s, a, r, next_s, t)``.

    Records with an even turn index only feed the evaluator (goal on turn 0,
    user dialog act on every even turn).  Records with an odd index each
    produce one transition:

    * terminal turn: reward ``20`` if ``evaluator.task_success(False)``
      else ``-5``; the successor state is synthesized from
      ``final_belief_state``
    * other turns: ``-1`` for every ``'?'`` slot of the current domain's
      belief state that no inform/recommend/offerbook/offerbooked act in
      ``trg_sys_action`` addresses, and ``-5`` if ``trg_sys_action`` is empty

    :param part: key selecting the split in ``self.data`` / ``self.goal``
    :param file_dir: output path handed to ``torch.save``
    :param data_dir: directory passed to ``MultiWozEvaluator``
    :param cfg: configuration object forwarded to the vectorizers
    :param db: database object forwarded to ``state_vectorize``
    :return: None
    """
    answering_intents = ('inform', 'recommend', 'offerbook', 'offerbooked')
    datas = self.data[part]
    goals = self.goal[part]
    s, a, r, next_s, t = [], [], [], [], []
    evaluator = MultiWozEvaluator(data_dir)

    for turn_data in datas:
        others = turn_data['others']
        turn_no = others['turn']

        if turn_no % 2 == 0:
            if turn_no == 0:
                evaluator.add_goal(goals[others['session_id']])
            evaluator.add_usr_da(turn_data['trg_user_action'])
            continue

        state_vec = torch.Tensor(state_vectorize(turn_data, cfg, db, True))
        # The current state is the successor of the previous system turn.
        # (Appending s[-1] before storing the current state would pair each
        # state with itself instead of with its successor.)
        if turn_no != 1:
            next_s.append(state_vec)
        s.append(state_vec)
        a.append(torch.Tensor(
            action_vectorize(turn_data['trg_sys_action'], cfg)))
        evaluator.add_sys_da(turn_data['trg_sys_action'])

        if others['terminal']:
            next_turn_data = deepcopy(turn_data)
            next_turn_data['others']['turn'] = -1
            next_turn_data['user_action'] = {}
            next_turn_data['sys_action'] = turn_data['trg_sys_action']
            next_turn_data['trg_sys_action'] = {}
            next_turn_data['belief_state'] = turn_data['final_belief_state']
            next_s.append(torch.Tensor(
                state_vectorize(next_turn_data, cfg, db, True)))
            r.append(20 if evaluator.task_success(False) else -5)
            t.append(1)
            continue

        reward = 0
        if evaluator.cur_domain:
            domain_belief = turn_data['belief_state'][evaluator.cur_domain]
            for slot, value in domain_belief.items():
                if value != '?':
                    continue
                for da in turn_data['trg_sys_action']:
                    d, i, k, p = da.split('-')
                    if i in answering_intents and k == slot:
                        break
                else:
                    # not answer request
                    reward -= 1
        if not turn_data['trg_sys_action']:
            reward -= 5
        r.append(reward)
        t.append(0)

    torch.save((s, a, r, next_s, t), file_dir)
def evaluate(self, N):
    """Let the user and system policies talk for ``N`` dialogs and log metrics.

    Dialogs run in ``self.env_list[0]`` with seeds ``0 .. N-1``.  Each step
    takes one user action followed by one system action.  The average turn
    count, match rate, inform recall/F1 and success rate are logged.

    :param N: number of dialogs to run
    :return: None
    """
    logging.info('eval: user 2 system')
    env = self.env_list[0]
    max_steps = 40
    turn_tot, inform_tot, match_tot, success_tot = [], [], [], []

    for seed in range(N):
        s = env.reset(seed)
        print('seed', seed)
        print('origin goal', env.goal)
        print('goal', env.evaluator.goal)

        for _ in range(max_steps):
            usr_vec = torch.Tensor(state_vectorize_user(
                s, env.cfg, env.evaluator.cur_domain)).to(device=DEVICE)
            # mode with policy during evaluation
            usr_act = self.policy_usr.select_action(usr_vec, False)
            after_usr, done = env.step_usr(s, usr_act)

            sys_vec = torch.Tensor(state_vectorize(
                after_usr, env.cfg, env.db)).to(device=DEVICE)
            sys_act = self.policy_sys.select_action(sys_vec, False)
            s = env.step_sys(after_usr, sys_act)

            print('usr', s['user_action'])
            print('sys', s['sys_action'])
            if done:
                break

        turn_tot.append(env.time_step // 2)
        match_tot += env.evaluator.match_rate(aggregate=False)
        inform_tot.append(env.evaluator.inform_F1(aggregate=False))

        print('turn', env.time_step // 2)
        match_session = env.evaluator.match_rate()
        print('match', match_session)
        inform_session = env.evaluator.inform_F1()
        print('inform', inform_session)

        # Success: neither score contradicts it and at least one equals 1.
        if match_session == 1:
            succeeded = (inform_session[1] == 1
                         or inform_session[1] is None)
        elif match_session is None:
            succeeded = inform_session[1] == 1
        else:
            succeeded = False
        flag = 1 if succeeded else 0
        print('success', flag)
        success_tot.append(flag)

    logging.info('turn {}'.format(np.mean(turn_tot)))
    logging.info('match {}'.format(np.mean(match_tot)))
    TP, FP, FN = np.sum(inform_tot, 0)
    prec = TP / (TP + FP)
    rec = TP / (TP + FN)
    F1 = 2 * prec * rec / (prec + rec)
    logging.info('inform rec {}, F1 {}'.format(rec, F1))
    logging.info('success {}'.format(np.mean(success_tot)))
def sampler(pid, queue, evt, env, policy_usr, policy_sys, batchsz):
    """
    Sample dialog turns from the environment; meant to run in a worker process.

    Dialogs are generated by letting ``policy_usr`` and ``policy_sys``
    interact through ``env``.  The states, actions and rewards of every turn
    are pushed into a ``Memory`` buffer until at least ``batchsz`` turns have
    been collected.  The buffer is then put on ``queue`` and the function
    blocks on ``evt``.

    :param pid: process id
    :param queue: multiprocessing.Queue, to collect sampled data
    :param evt: multiprocessing.Event, to keep the process alive
    :param env: environment instance
    :param policy_usr: user policy network, selects the user action
    :param policy_sys: system policy network, selects the system action
    :param batchsz: minimum number of sampled turns
    :return: None
    """
    answering_intents = ('inform', 'recommend', 'offerbook', 'offerbooked')
    buff = Memory()

    # Whole trajectories are sampled, so the final number of sampled turns
    # may be larger than batchsz.
    sampled_num = 0  # number of dialog turns pushed so far
    traj_len = 40

    while sampled_num < batchsz:
        # for each trajectory, we reset the env and get initial state
        s = env.reset()

        for t in range(traj_len):
            # [s_dim_usr] => [a_dim_usr]
            s_vec = torch.Tensor(
                state_vectorize_user(s, env.cfg, env.evaluator.cur_domain))
            a = policy_usr.select_action(s_vec.to(device=DEVICE)).cpu()

            # interact with env, done is a flag indicates ending or not
            next_s, done = env.step_usr(s, a)

            # [s_dim] => [a_dim]
            next_s_vec = torch.Tensor(
                state_vectorize(next_s, env.cfg, env.db))
            next_a = policy_sys.select_action(
                next_s_vec.to(device=DEVICE)).cpu()

            # interact with env
            s = env.step_sys(next_s, next_a)

            # One-step roll-out to obtain the successor vectors for both
            # sides.  After the dialog has ended an all-zero user action is
            # used instead of sampling from the user policy.
            # NOTE(review): set_rollout presumably keeps this look-ahead from
            # altering the real dialog state -- confirm against the env.
            env.set_rollout(True)
            s_vec_next = torch.Tensor(
                state_vectorize_user(s, env.cfg, env.evaluator.cur_domain))
            if done:
                a_next = torch.zeros_like(a)
            else:
                a_next = policy_usr.select_action(
                    s_vec_next.to(device=DEVICE)).cpu()
            next_s_next, _ = env.step_usr(s, a_next)
            next_s_vec_next = torch.Tensor(
                state_vectorize(next_s_next, env.cfg, env.db))
            env.set_rollout(False)

            if done:
                informed = env.evaluator.inform_F1(ans_by_sys=False)[1] == 1.
                r_usr = 20 if informed else -5
                r_sys = 20 if env.evaluator.task_success(False) else -5
                r_global = 20 if env.evaluator.task_success() else -5
            else:
                r_usr = 0
                if not s['user_action']:
                    # the user produced no action in this turn
                    r_usr -= 5
                if env.evaluator.cur_domain:
                    for da in s['user_action']:
                        d, i, k = da.split('-')
                        if i != 'request':
                            continue
                        for slot, value in s['goal_state'][d].items():
                            if value != '?' and slot in s['user_goal'][d] \
                                    and s['user_goal'][d][slot] != '?':
                                # request before express constraint
                                r_usr -= 1

                r_sys = 0
                if not next_s['sys_action']:
                    # the system produced no action in this turn
                    r_sys -= 5
                if env.evaluator.cur_domain:
                    domain_belief = next_s['belief_state'][
                        env.evaluator.cur_domain]
                    for slot, value in domain_belief.items():
                        if value != '?':
                            continue
                        for da in next_s['sys_action']:
                            d, i, k, p = da.split('-')
                            if i in answering_intents and k == slot:
                                break
                        else:
                            # not answer request: every requested slot left
                            # unanswered costs one point
                            r_sys -= 1

                domain_done = (env.evaluator.cur_domain
                               and env.evaluator.domain_success(
                                   env.evaluator.cur_domain))
                r_global = 5 if domain_done else -1

            # save the turn into the Memory buffer
            buff.push(s_vec.numpy(), a.numpy(), r_usr, s_vec_next.numpy(),
                      next_s_vec.numpy(), next_a.numpy(), r_sys,
                      next_s_vec_next.numpy(), done, r_global)
            # Count every pushed turn.  Adding the last loop index ``t``
            # instead would miss one turn per trajectory and never advance
            # if dialogs ended on their first step.
            sampled_num += 1

            if done:
                break

    # when sampling is over, push all buff data into queue
    queue.put([pid, buff])
    evt.wait()
def sampler(pid, queue, evt, env, policy, batchsz, human_reward):
    """
    Sample transitions from the environment; meant to run in a worker process.

    Trajectories of at most 40 steps are rolled out with ``policy`` until at
    least ``batchsz`` transitions have been pushed into a ``Memory`` buffer.
    The buffer is then put on ``queue`` and the function blocks on ``evt``.

    :param pid: process id
    :param queue: multiprocessing.Queue, to collect sampled data
    :param evt: multiprocessing.Event, to keep the process alive
    :param env: environment instance
    :param policy: policy network, to generate action from current policy
    :param batchsz: minimum number of sampled items
    :param human_reward: object whose ``reward_human(next_state, done)``
        supplies the reward of each step
    :return: None
    """
    buff = Memory()

    # Whole trajectories are sampled, so the final number of sampled items
    # may be larger than batchsz.
    sampled_num = 0
    traj_len = 40

    while sampled_num < batchsz:
        # for each trajectory, we reset the env and get initial state
        s = env.reset()

        for t in range(traj_len):
            # [s_dim] => [a_dim]
            s_vec = torch.Tensor(state_vectorize(s, env.cfg, env.db))
            a = policy.select_action(s_vec.to(device=DEVICE)).cpu()
            d = torch.Tensor(domain_vectorize(s, env.cfg))

            # interact with env
            next_s, done = env.step(s, a)

            # a flag indicates ending or not
            mask = 0 if done else 1

            next_s_vec = torch.Tensor(
                state_vectorize(next_s, env.cfg, env.db))
            r = human_reward.reward_human(next_s, done)

            # save to the buffer
            buff.push(s_vec.numpy(), a.numpy(), mask, next_s_vec.numpy(), r,
                      d.numpy())
            # Count every pushed item.  Adding the last loop index ``t``
            # instead would miss one item per trajectory and never advance
            # if dialogs ended on their first step.
            sampled_num += 1

            # update per step
            s = next_s
            if done:
                break

    # when sampling is over, push all buff data into queue
    queue.put([pid, buff])
    evt.wait()
def evaluate(self, save_dialog=False):
    """Evaluate ``self.policy`` on 1000 seeded dialogs and log the metrics.

    The policy is switched to eval mode and run in ``self.env_list[0]`` for
    seeds ``0 .. 999``.  The mean reward, turn count, match rate, inform
    recall/F1 and success rate are logged.

    :param save_dialog: when true, the goals are read from
        ``./data/goal.json`` (indexed by seed) and the collected dialogs are
        handed to ``self.save_dialog`` at the end
    :return: None
    """
    self.policy.eval()
    saved_goal_list = None
    if save_dialog:
        with open('./data/goal.json', 'r') as f:
            saved_goal_list = json.load(f)

    collected_dialog = []
    env = self.env_list[0]
    traj_len = 40
    reward_tot, turn_tot, inform_tot, match_tot, success_tot = \
        [], [], [], [], []

    for seed in range(1000):
        saved_goal = saved_goal_list[seed] if save_dialog else None
        s = env.reset(seed, saved_goal=saved_goal)
        dialog_list = [s['user_action']]

        # Stays at traj_len when the dialog never reports ``done``.
        turn = traj_len
        reward = []
        for t in range(traj_len):
            s_vec = torch.Tensor(
                state_vectorize(s, env.cfg, env.db)).to(device=DEVICE)
            # mode with policy during evaluation
            a = self.policy.select_action(s_vec, False)
            s, done = env.step(s, a.cpu())
            reward.append(self.human_reward.reward_human(s, done))

            dialog_list.append(s['last_sys_action'])
            dialog_list.append(s['user_action'])
            if done:
                # one due to counting from 0, the one for the last turn
                turn = t + 2
                break

        reward_tot.append(np.mean(reward))
        turn_tot.append(turn)
        match_tot += self.evaluator.match_rate(s)
        inform_tot.append(self.evaluator.inform_F1(s))

        match_session = self.evaluator.match_rate(s, True)
        inform_session = self.evaluator.inform_F1(s, True)
        # Success: neither score contradicts it and at least one equals 1.
        if (match_session == 1 and inform_session[1] == 1) \
                or (match_session == 1 and inform_session[1] is None) \
                or (match_session is None and inform_session[1] == 1):
            success_tot.append(1)
        else:
            success_tot.append(0)

        collected_dialog.append({
            'goal id': seed,
            'goal': env.goal.domain_goals,
            'dialog': dialog_list,
            'turn': turn,
            'status': success_tot[-1],
        })

    logging.info('reward {}'.format(np.mean(reward_tot)))
    logging.info('turn {}'.format(np.mean(turn_tot)))
    logging.info('match {}'.format(np.mean(match_tot)))
    TP, FP, FN = np.sum(inform_tot, 0)
    prec = TP / (TP + FP)
    rec = TP / (TP + FN)
    F1 = 2 * prec * rec / (prec + rec)
    logging.info('inform rec {}, F1 {}'.format(rec, F1))
    logging.info('success {}'.format(np.mean(success_tot)))

    if save_dialog:
        self.save_dialog(self.save_dir, collected_dialog)
def create_dataset_global(self, part, file_dir, data_dir, cfg, db):
    """
    Build and save the global dataset: the states of both the user side and
    the system side, plus a shared reward and terminal flag per system turn.

    Records with an even turn index go through the user branch and records
    with an odd index through the system branch.  Rewards:

    * terminal system turn: ``20`` if ``evaluator.task_success()`` else ``-5``
    * other system turns: ``5`` if the current domain is set and
      ``evaluator.domain_success`` holds for it, else ``-1``

    :param part: key selecting the split in ``self.data`` / ``self.goal``
    :param file_dir: output path handed to ``torch.save``
    :param data_dir: directory passed to ``MultiWozEvaluator``
    :param cfg: configuration object; ``cfg.d`` is passed to the evaluator
    :param db: database object forwarded to ``state_vectorize``
    :return: None; saves
        ``(s_usr, s_sys, r_g, next_s_usr, next_s_sys, t)``
    """
    datas = self.data[part]
    goals = self.goal[part]
    s_usr, s_sys, r_g, next_s_usr, next_s_sys, t = [], [], [], [], [], []
    evaluator = MultiWozEvaluator(data_dir, cfg.d)

    for idx, turn_data in enumerate(datas):
        others = turn_data['others']
        turn_no = others['turn']

        if turn_no % 2 == 0:
            if turn_no == 0:
                # First turn of a session: register its goal.
                current_goal = goals[others['session_id']]
                evaluator.add_goal(current_goal)

            # Per the original note: when the user goal cannot be satisfied,
            # switch to the alternative ('final') goal of the domain.
            if others['change'] and evaluator.cur_domain:
                domain_goal = current_goal[evaluator.cur_domain]
                if 'final' in domain_goal:
                    for key, val in domain_goal['final'].items():
                        domain_goal[key] = val
                    del domain_goal['final']

            turn_data['user_goal'] = deepcopy(current_goal)
            usr_vec = torch.Tensor(
                state_vectorize_user(turn_data, cfg, evaluator.cur_domain))
            # The current state is the successor of the previous user turn.
            # (Appending s_usr[-1] before storing the current state would
            # pair each state with itself instead of with its successor.)
            if turn_no != 0:
                next_s_usr.append(usr_vec)
            s_usr.append(usr_vec)
            evaluator.add_usr_da(turn_data['trg_user_action'])

            if others['terminal']:
                # Synthesize the closing user state from the last exchange.
                next_turn_data = deepcopy(turn_data)
                next_turn_data['others']['turn'] = -1
                next_turn_data['user_action'] = turn_data['trg_user_action']
                next_turn_data['sys_action'] = datas[idx + 1]['trg_sys_action']
                next_turn_data['trg_user_action'] = {}
                next_turn_data['goal_state'] = \
                    datas[idx + 1]['final_goal_state']
                next_s_usr.append(torch.Tensor(
                    state_vectorize_user(next_turn_data, cfg,
                                         evaluator.cur_domain)))
        else:
            sys_vec = torch.Tensor(state_vectorize(turn_data, cfg, db, True))
            # Same alignment as on the user side.
            if turn_no != 1:
                next_s_sys.append(sys_vec)
            s_sys.append(sys_vec)
            evaluator.add_sys_da(turn_data['trg_sys_action'])

            if others['terminal']:
                next_turn_data = deepcopy(turn_data)
                next_turn_data['others']['turn'] = -1
                next_turn_data['user_action'] = {}
                next_turn_data['sys_action'] = turn_data['trg_sys_action']
                next_turn_data['trg_sys_action'] = {}
                next_turn_data['belief_state'] = \
                    turn_data['final_belief_state']
                next_s_sys.append(torch.Tensor(
                    state_vectorize(next_turn_data, cfg, db, True)))
                # The terminal flag is read on the system turn, so the
                # overall task outcome is scored here.
                r_g.append(20 if evaluator.task_success() else -5)
                t.append(1)
            else:
                # Bonus when the current domain is completed, otherwise a
                # small per-turn penalty (intended to shorten dialogs).
                # TODO: clarify what domain_success checks exactly.
                domain_done = (evaluator.cur_domain
                               and evaluator.domain_success(
                                   evaluator.cur_domain))
                r_g.append(5 if domain_done else -1)
                t.append(0)

    torch.save((s_usr, s_sys, r_g, next_s_usr, next_s_sys, t), file_dir)
def create_dataset_sys(self, part, file_dir, data_dir, cfg, db):
    """
    Build and save the training data of the system side.

    Stored per system turn: state vector, action vector, reward, successor
    state vector and terminal flag.  Rewards:

    * terminal turn: ``20`` if ``evaluator.task_success(False)`` else ``-5``
    * other turns: ``-1`` for every ``'?'`` slot of the current domain's
      belief state that no inform/recommend/offerbook/offerbooked act in
      ``trg_sys_action`` addresses, and ``-5`` if ``trg_sys_action`` is empty

    :param part: key selecting the split in ``self.data`` / ``self.goal``
    :param file_dir: output path handed to ``torch.save``
    :param data_dir: directory passed to ``MultiWozEvaluator``
    :param cfg: configuration object; ``cfg.d`` is passed to the evaluator
    :param db: database object forwarded to ``state_vectorize``
    :return: None; saves ``(s, a, r, next_s, t)``
    """
    answering_intents = ('inform', 'recommend', 'offerbook', 'offerbooked')
    datas = self.data[part]
    goals = self.goal[part]
    # system state, system action, reward, successor state, terminal flag
    s, a, r, next_s, t = [], [], [], [], []
    evaluator = MultiWozEvaluator(data_dir, cfg.d)

    for turn_data in datas:
        others = turn_data['others']
        turn_no = others['turn']

        # Even turn index: only the goal is loaded, on the first turn.
        if turn_no % 2 == 0:
            if turn_no == 0:
                evaluator.add_goal(goals[others['session_id']])
            # NOTE(review): the user dialog act is not passed to the
            # evaluator here (the add_usr_da call is disabled) -- confirm
            # that this is intended.
            continue

        state_vec = torch.Tensor(state_vectorize(turn_data, cfg, db, True))
        # The current state is the successor of the previous system turn.
        # (Appending s[-1] before storing the current state would pair each
        # state with itself instead of with its successor.)
        if turn_no != 1:
            next_s.append(state_vec)
        s.append(state_vec)
        a.append(torch.Tensor(
            action_vectorize(turn_data['trg_sys_action'], cfg)))
        evaluator.add_sys_da(turn_data['trg_sys_action'])

        if others['terminal']:
            # Closing turn: synthesize the successor state.
            next_turn_data = deepcopy(turn_data)
            next_turn_data['others']['turn'] = -1
            next_turn_data['user_action'] = {}
            next_turn_data['sys_action'] = turn_data['trg_sys_action']
            next_turn_data['trg_sys_action'] = {}
            next_turn_data['belief_state'] = turn_data['final_belief_state']
            next_s.append(torch.Tensor(
                state_vectorize(next_turn_data, cfg, db, True)))
            # The final reward depends on whether the evaluator reports the
            # task as completed.
            r.append(20 if evaluator.task_success(False) else -5)
            t.append(1)
            continue

        reward = 0
        if evaluator.cur_domain:
            domain_belief = turn_data['belief_state'][evaluator.cur_domain]
            for slot, value in domain_belief.items():
                if value != '?':
                    continue
                for da in turn_data['trg_sys_action']:
                    d, i, k, p = da.split('-')
                    if i in answering_intents and k == slot:
                        break
                else:
                    # not answer request: a requested slot in the belief
                    # state was left unanswered
                    reward -= 1
        if not turn_data['trg_sys_action']:
            # no system reply at all in this turn
            reward -= 5
        r.append(reward)
        t.append(0)

    torch.save((s, a, r, next_s, t), file_dir)
def evaluate(self):
    """Evaluate ``self.policy`` on 1000 seeded dialogs and log the metrics.

    Dialogs run in ``self.env_list[0]`` for seeds ``0 .. 999``.  Every step
    is scored with ``self.rewarder.estimate`` and ``self.value``.  The
    per-dialog value target from ``self.est_adv`` is printed.  The mean
    reward, turn count, match rate, inform recall/F1 and success rate are
    logged.

    :return: None
    """
    env = self.env_list[0]
    max_steps = 40
    reward_tot, turn_tot, inform_tot, match_tot, success_tot = \
        [], [], [], [], []

    def vectorize(state):
        return torch.Tensor(
            state_vectorize(state, env.cfg, env.db)).to(device=DEVICE)

    for seed in range(1000):
        s = env.reset(seed)
        print('seed', seed)
        print('goal', env.goal.domain_goals)
        print('usr', s['user_action'])

        # Stays at max_steps when the dialog never reports ``done``.
        turn = max_steps
        rewards, values, masks = [], [], []
        for step in range(max_steps):
            s_vec = vectorize(s)
            # mode with policy during evaluation
            a = self.policy.select_action(s_vec, False)
            next_s, done = env.step(s, a.cpu())
            next_s_vec = vectorize(next_s)

            log_pi = self.policy.get_log_prob(s_vec, a)
            r = self.rewarder.estimate(s_vec, a, next_s_vec, log_pi)
            v = self.value(s_vec).squeeze(-1)
            rewards.append(r.item())
            values.append(v.item())

            s = next_s
            print('sys', s['last_sys_action'])
            print('usr', s['user_action'])

            masks.append(0 if done else 1)
            if done:
                # one due to counting from 0, the one for the last turn
                turn = step + 2
                break

        reward_tot.append(np.mean(rewards))
        turn_tot.append(turn)
        match_tot += self.evaluator.match_rate(s)
        inform_tot.append(self.evaluator.inform_F1(s))

        _, v_target = self.est_adv(torch.Tensor(rewards),
                                   torch.Tensor(values),
                                   torch.LongTensor(masks))
        print('turn', turn)
        print('reward', v_target[0].item())

        match_session = self.evaluator.match_rate(s, True)
        print('match', match_session)
        inform_session = self.evaluator.inform_F1(s, True)
        print('inform', inform_session)

        # Success: neither score contradicts it and at least one equals 1.
        if match_session == 1:
            succeeded = (inform_session[1] == 1
                         or inform_session[1] is None)
        elif match_session is None:
            succeeded = inform_session[1] == 1
        else:
            succeeded = False
        flag = 1 if succeeded else 0
        print('success', flag)
        success_tot.append(flag)

    logging.info('reward {}'.format(np.mean(reward_tot)))
    logging.info('turn {}'.format(np.mean(turn_tot)))
    logging.info('match {}'.format(np.mean(match_tot)))
    TP, FP, FN = np.sum(inform_tot, 0)
    prec = TP / (TP + FP)
    rec = TP / (TP + FN)
    F1 = 2 * prec * rec / (prec + rec)
    logging.info('inform rec {}, F1 {}'.format(rec, F1))
    logging.info('success {}'.format(np.mean(success_tot)))