def eval_simulator_performance(data, goal_type=None):
    """Score the rule simulator against annotated dialogs and print metrics.

    Args:
        data: mapping task_id -> dialog item with 'type', 'goal', 'messages'.
        goal_type: when given, only dialogs whose 'type' equals it are scored.
    """
    first_turn_pairs = []   # predicted vs. golden DA for the opening user turn
    later_turn_pairs = []   # predicted vs. golden DA for every later user turn
    state_pairs = []        # predicted vs. golden user state after each update
    sim = Simulator()
    for _, dialog in data.items():
        if goal_type and dialog['type'] != goal_type:
            continue
        messages = dialog['messages']
        for idx, turn in enumerate(messages):
            if turn['role'] != 'usr':
                continue
            if idx == 0:
                # Opening turn: the simulator starts from the raw goal.
                sim.init_session(goal=dialog['goal'])
                first_turn_pairs.append({
                    'predict': sim.begin_da(),
                    'golden': turn['dialog_act']
                })
                continue
            # Later turns: restart from the previous user state, then feed
            # the last user/system dialog acts before predicting.
            prev_usr = messages[idx - 2]
            prev_sys = messages[idx - 1]
            sim.init_session(goal=dialog['goal'],
                             state=deepcopy(prev_usr['user_state']))
            sim.state_update(prev_user_da=prev_usr['dialog_act'],
                             prev_sys_da=prev_sys['dialog_act'])
            predicted_da = sim.state_predict()
            later_turn_pairs.append({
                'predict': predicted_da,
                'golden': turn['dialog_act']
            })
            state_pairs.append({
                'predict': deepcopy(sim.state),
                'golden': turn['user_state']
            })
    print('begin da', calculateF1(first_turn_pairs))
    print('state da', calculateF1(later_turn_pairs))
    print('all da', calculateF1(first_turn_pairs + later_turn_pairs))
    print('joint state', calculateJointState(state_pairs))
    print('slot state', calculateSlotState(state_pairs))
def eval_state_predict(data):
    """Debug helper: print predicted vs. golden state updates and dialog acts.

    For every user turn after the first, the simulator is re-initialized from
    the previous user state, updated with the last user/system dialog acts,
    and its predicted state and dialog act are printed next to the gold
    annotations for manual inspection.

    Args:
        data: mapping task_id -> dialog item with 'goal' and 'messages'.
    """
    def state_update(prev_state, cur_state):
        # Diff the two states element-wise and report the id of the last
        # state entry whose final flag is truthy (1 when none is flagged).
        # Renamed from `id` to avoid shadowing the builtin.
        changed = [cur_ele for prev_ele, cur_ele in zip(prev_state, cur_state)
                   if cur_ele != prev_ele]
        last_active_id = 1
        for ele in reversed(cur_state):  # reversed(): no [::-1] copy needed
            if ele[-1]:
                last_active_id = ele[0]
                break
        return changed, last_active_id

    simulator = Simulator()
    for task_id, item in data.items():
        for i, turn in enumerate(item['messages']):
            if turn['role'] == 'usr' and i > 0:
                last_turn = item['messages'][i - 2]
                usr_da = last_turn['dialog_act']  # reuse binding instead of re-indexing
                sys_da = item['messages'][i - 1]['dialog_act']
                simulator.init_session(goal=item['goal'],
                                       state=deepcopy(last_turn['user_state']))
                simulator.state_update(prev_user_da=usr_da, prev_sys_da=sys_da)
                cur_da = simulator.state_predict()
                new_state = simulator.state
                # print('old state:')
                # pprint(last_turn['user_state'])
                # if 'NoOffer' in [x[0] for x in item['messages'][i-1]['dialog_act']]:
                print(item['messages'][i - 2]['content'])
                print(item['messages'][i - 1]['content'])
                print(turn['content'])
                print('usr da')
                pprint(usr_da)
                print('sys da')
                pprint(sys_da)
                print('predict state update:')
                pprint(state_update(last_turn['user_state'], new_state))
                print('golden state:')
                pprint(state_update(last_turn['user_state'], turn['user_state']))
                print('predict usr da')
                pprint(cur_da)
                print('golden usr da')
                pprint(turn['dialog_act'])
                print('-' * 100)
def end2end_evaluate_simulation(policy):
    """Run end-to-end (NLU + policy + NLG) self-play sessions and print
    per-goal-type task-success rates.

    Simulates `simulate_sess_num * repeat` sessions per goal type between a
    rule-based user simulator and the given system `policy`, exchanging
    natural-language utterances, and prints running success statistics.

    Args:
        policy: system dialog policy plugged into the system PipelineAgent.
    """
    nlu = BERTNLU('all', 'crosswoz_all_context.json', None)
    nlg_usr = TemplateNLG(is_user=True, mode='auto_manual')
    nlg_sys = TemplateNLG(is_user=False, mode='auto_manual')
    # nlg_usr = SCLSTM(is_user=True, use_cuda=False)
    # nlg_sys = SCLSTM(is_user=False, use_cuda=False)
    usr_policy = Simulator()
    usr_agent = PipelineAgent(nlu, None, usr_policy, nlg_usr, name='usr')
    sys_policy = policy
    sys_dst = RuleDST()
    sys_agent = PipelineAgent(nlu, sys_dst, sys_policy, nlg_sys, name='sys')
    sess = BiSession(sys_agent=sys_agent, user_agent=usr_agent)
    # Success counters keyed by goal type (keys are the simulator's
    # Chinese goal-type labels; 'All' aggregates every session).
    task_success = {
        'All': list(),
        '单领域': list(),
        '独立多领域': list(),
        '独立多领域+交通': list(),
        '不独立多领域': list(),
        '不独立多领域+交通': list()
    }
    simulate_sess_num = 100  # sessions per goal type per repeat
    repeat = 5               # independent repeats per goal type
    random_seed = 2019
    random.seed(random_seed)
    np.random.seed(random_seed)
    torch.manual_seed(random_seed)
    # Pre-draw one seed per attempted session so each session is
    # reproducible; oversized because sessions of already-full goal
    # types are discarded (the `continue` below) but still consume a seed.
    random_seeds = [
        random.randint(1, 2**32 - 1)
        for _ in range(simulate_sess_num * repeat * 10000)
    ]
    while True:
        sys_response = ''
        # Re-seed all RNGs before each session for reproducibility.
        random_seed = random_seeds[0]
        random.seed(random_seed)
        np.random.seed(random_seed)
        torch.manual_seed(random_seed)
        random_seeds.pop(0)
        sess.init_session()
        # print(usr_policy.goal_type)
        # Skip (but keep the consumed seed for) sessions whose sampled
        # goal type already has its quota of results.
        if len(task_success[
                usr_policy.goal_type]) == simulate_sess_num * repeat:
            continue
        # At most 15 exchanges per session.
        for i in range(15):
            sys_response, user_response, session_over, reward = sess.next_turn(
                sys_response)
            # print('user:', user_response)
            # print('sys:', sys_response)
            # print(session_over, reward)
            # print()
            if session_over is True:
                task_success['All'].append(1)
                task_success[usr_policy.goal_type].append(1)
                break
        else:
            # Loop exhausted without session_over: count as failure.
            task_success['All'].append(0)
            task_success[usr_policy.goal_type].append(0)
        print([len(x) for x in task_success.values()])
        # print(min([len(x) for x in task_success.values()]))
        # Periodic progress report every 100 finished sessions.
        if len(task_success['All']) % 100 == 0:
            for k, v in task_success.items():
                print(k)
                for i in range(repeat):
                    samples = v[i * simulate_sess_num:(i + 1) * simulate_sess_num]
                    print(sum(samples), len(samples),
                          (sum(samples) / len(samples)) if len(samples) else 0)
                print('avg', (sum(v) / len(v)) if len(v) else 0)
        # Stop once every goal type has reached its quota.
        if min([len(x) for x in task_success.values()]) == simulate_sess_num * repeat:
            break
    # pprint(usr_policy.original_goal)
    # pprint(task_success)
    # Final per-repeat and average success-rate report.
    print('task_success')
    for k, v in task_success.items():
        print(k)
        for i in range(repeat):
            samples = v[i * simulate_sess_num:(i + 1) * simulate_sess_num]
            print(sum(samples), len(samples),
                  (sum(samples) / len(samples)) if len(samples) else 0)
        print('avg', (sum(v) / len(v)) if len(v) else 0)
def da_evaluate_simulation(policy):
    """Run dialog-act-level self-play sessions (no NLU/NLG) and print
    per-goal-type task-success rates.

    Same protocol as end-to-end evaluation, but agents exchange dialog
    acts directly: both PipelineAgents are built without NLU and NLG, so
    `sys_response` starts as an empty act list rather than a string.

    Args:
        policy: system dialog policy plugged into the system PipelineAgent.
    """
    usr_policy = Simulator()
    usr_agent = PipelineAgent(None, None, usr_policy, None, name='usr')
    sys_policy = policy
    sys_dst = RuleDST()
    sys_agent = PipelineAgent(None, sys_dst, sys_policy, None, name='sys')
    sess = BiSession(sys_agent=sys_agent, user_agent=usr_agent)
    # Success counters keyed by goal type (keys are the simulator's
    # Chinese goal-type labels; 'All' aggregates every session).
    task_success = {
        'All': list(),
        '单领域': list(),
        '独立多领域': list(),
        '独立多领域+交通': list(),
        '不独立多领域': list(),
        '不独立多领域+交通': list()
    }
    simulate_sess_num = 100  # sessions per goal type per repeat
    repeat = 5               # independent repeats per goal type
    random_seed = 2019
    random.seed(random_seed)
    np.random.seed(random_seed)
    torch.manual_seed(random_seed)
    # Pre-draw one seed per attempted session; oversized because sessions
    # of already-full goal types are discarded but still consume a seed.
    random_seeds = [
        random.randint(1, 2**32 - 1)
        for _ in range(simulate_sess_num * repeat * 10000)
    ]
    while True:
        sys_response = []  # DA-level: empty act list, not an empty string
        # Re-seed all RNGs before each session for reproducibility.
        random_seed = random_seeds[0]
        random.seed(random_seed)
        np.random.seed(random_seed)
        torch.manual_seed(random_seed)
        random_seeds.pop(0)
        sess.init_session()
        # print(usr_policy.goal_type)
        # Skip sessions whose sampled goal type already has its quota.
        if len(task_success[
                usr_policy.goal_type]) == simulate_sess_num * repeat:
            continue
        # At most 15 exchanges per session.
        for i in range(15):
            sys_response, user_response, session_over, reward = sess.next_turn(
                sys_response)
            # print('user:', user_response)
            # print('sys:', sys_response)
            # print(session_over, reward)
            # print()
            if session_over is True:
                # pprint(sys_agent.tracker.state)
                task_success['All'].append(1)
                task_success[usr_policy.goal_type].append(1)
                break
        else:
            # Loop exhausted without session_over: count as failure.
            task_success['All'].append(0)
            task_success[usr_policy.goal_type].append(0)
        print([len(x) for x in task_success.values()])
        # print(min([len(x) for x in task_success.values()]))
        # Periodic progress report every 100 finished sessions.
        if len(task_success['All']) % 100 == 0:
            for k, v in task_success.items():
                print(k)
                for i in range(repeat):
                    samples = v[i * simulate_sess_num:(i + 1) * simulate_sess_num]
                    print(sum(samples), len(samples),
                          (sum(samples) / len(samples)) if len(samples) else 0)
                print('avg', (sum(v) / len(v)) if len(v) else 0)
        # Stop once every goal type has reached its quota.
        if min([len(x) for x in task_success.values()]) == simulate_sess_num * repeat:
            break
    # pprint(usr_policy.original_goal)
    # pprint(task_success)
    # Final per-repeat and average success-rate report.
    print('task_success')
    for k, v in task_success.items():
        print(k)
        for i in range(repeat):
            samples = v[i * simulate_sess_num:(i + 1) * simulate_sess_num]
            print(sum(samples), len(samples),
                  (sum(samples) / len(samples)) if len(samples) else 0)
        print('avg', (sum(v) / len(v)) if len(v) else 0)