Example #1
cdir = "sessions/" + str(uid) + '_' + datetime.datetime.now().strftime(
    '%Y-%m-%d_%H-%M-%S') + "/"
if not os.path.exists(cdir):
    os.makedirs(cdir)
with open(os.path.join(cdir, 'credentials'), 'w') as f:
    f.write(uname)
try:
    for i in range(N):
        print "-" * 200 + "\n第{0}次对话:".format(i)
        dia = []
        curr_agent = agent
        dia.append(curr_agent)
        dialog_manager = DialogManager(curr_agent,
                                       user_sim,
                                       db_full,
                                       db_inc,
                                       movie_kb,
                                       verbose=False)
        utt = dialog_manager.initialize_episode()
        dia.append(copy.deepcopy(utt))
        total_reward = 0
        while (True):
            episode_over, reward, utt, agact = dialog_manager.next_turn()
            dia.append(agact)
            dia.append(copy.deepcopy(utt))
            total_reward += reward
            if episode_over:
                break
        pkl.dump(dia, open(cdir + str(i) + ".p", 'w'))
except KeyboardInterrupt:
    sys.exit()
Example #2
user_sim.set_nlg_model(nlg_model)

################################################################################
# load trained NLU model
################################################################################
nlu_model_path = params['nlu_model_path']
nlu_model = nlu()
nlu_model.load_nlu_model(nlu_model_path)

agent.set_nlu_model(nlu_model)
user_sim.set_nlu_model(nlu_model)

################################################################################
# Dialog Manager
################################################################################
dialog_manager = DialogManager(agent, user_sim, act_set, slot_set, kb)

################################################################################
#   Run num_episodes Conversation Simulations
################################################################################
status = {'successes': 0, 'count': 0, 'cumulative_reward': 0}

simulation_epoch_size = params['simulation_epoch_size']
batch_size = params['batch_size']  # default = 16
warm_start = params['warm_start']
warm_start_epochs = params['warm_start_epochs']

success_rate_threshold = params['success_rate_threshold']
save_check_point = params['save_check_point']
""" Best Model and Performance Records """
best_model = {}
class AgentAdverserialA2C(Agent):
	def __init__(self, movie_dict=None, act_set=None, slot_set=None, params=None):

		## parameters associated with dialogue action and slot filling
		self.movie_dict = movie_dict
		self.act_set = act_set
		self.slot_set = slot_set
		self.act_cardinality = len(act_set.keys())
		self.slot_cardinality = len(slot_set.keys())

		self.feasible_actions = dialog_config.feasible_actions
		self.num_actions = len(self.feasible_actions)

		# rl specific parameters
		# epsilon:
		self.params = params
		self.epsilon = params['epsilon']
		#
		self.agent_run_mode = params['agent_run_mode']
		#
		self.agent_act_level = params['agent_act_level']
		# experience replay
		# self.experience_replay_pool_size = params.get('experience_replay_pool_size', 1000)
		# self.experience_replay_pool = [] #Replay_Memory(self.experience_replay_pool_size)
		self.hidden_size = params.get('dqn_hidden_size', 60)
		# gamma : discount factor
		self.gamma = params.get('gamma', 0.99)
		self.predict_mode = params.get('predict_mode', False)
		self.actor_lr = params.get('actor_lr', 0.0005)
		self.critic_lr = params.get('critic_lr', 0.001)
		self.gan_critic_lr = params.get('gan_critic_lr', 0.001)
		self.discriminator_lr = params.get('discriminator_lr', 0.0005)
		self.discriminator_batch_size = params.get('discriminator_batch_size', 1)
		self.expert_path = params["expert_path"]
		self.reg_cost = self.params.get('reg_cost', 1e-3)

		## warm start:
		## there is no warm start since there are is no experience replay
		# self.warm_start = params.get('warm_start', 0)

		self.max_turn = params['max_turn'] + 4
		self.state_dimension = 2 * self.act_cardinality + 7 * self.slot_cardinality + 3 + self.max_turn
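		## State vector: user/agent act one-hots (2*acts) + user inform/request, agent inform/request
		## and filled slots (5*slots) + KB count/binary reps (2*(slots+1)) + scaled turn (1) + turn one-hot (max_turn)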
		self.expert_weights = params['expert_weights']
		# Build models
		self.build_actor_model(self.actor_lr)
		self.build_critic_model(self.critic_lr)
		self.build_critic_model(self.gan_critic_lr, True)
		self.build_discriminator(self.gan_critic_lr)
		self.n = params.get('n', 50)

		## load a model if present
		if params['trained_model_path'] != None:
			self.load(params['trained_actor_model_path'], "actor")
			self.load(params['trained_critic_model_path'], "critic")
			self.load(params['trained_adversarial_critic_model_path'], "advesarial_critic")
			self.load(params['trained_discriminator_model_path'], "discriminator")
			self.predict_mode = True
			self.warm_start = 2
		#self.expert = DQN(self.state_dimension, self.hidden_size, self.hidden_size, self.num_actions)
		self.expert = self.build_expert_model()
		# self.clone_dqn = copy.deepcopy(self.expert)
		# self.clone_dqn = keras.models.clone_model(self.expert)
		self.cur_bellman_err = 0
			
		# Prediction Mode: load trained DQN model
		if params['expert_path'] != None:
			# self.dqn.model = model_from_json(params['expert_path'])
			# copy.deepcopy(self.load_trained_DQN(params['expert_path']))
			# self.dqn.model.load_weights(params['expert_weights'])
			self.predict_mode = True
			self.warm_start = 2
		user_sim = RuleSimulator(params['movie_dictionary'],
				params['act_set'],
				params['slot_set'],
				params['goal_set'],
				params['usersim_params'])
		self.dialog_manager = DialogManager(self.expert,
				user_sim,
				params['act_set'],
				params['slot_set'],
				params['movie_kb'])
		user_sim.set_nlg_model(params['nlg'])
		user_sim.set_nlu_model(params['nlu'])

	def load(self, name, model_name):
		if model_name == "actor":
			self.actor_model.load(name)
		elif model_name == "critic":
			self.critic_model.load(name)
		elif model_name == "advesarial_critic":
			self.adversarial_critic_model.load(name)
		elif model_name == "discriminator":
			self.discriminator.load(name)

	def save(self, name, model_name):
		if model_name == "actor":
			self.actor_model.save_weights(name)
		elif model_name == "critic":
			self.critic_model.save_weights(name)
		elif model_name == "advesarial_critic":
			self.adversarial_critic_model.save_weights(name)
		elif model_name == "discriminator":
			self.discriminator.save_weights(name)
			self.critic_model.save_weights(name)

	def load_actor_model(self, model_config_path, lr):
		with open(model_config_path, 'r') as f:
			model = keras.models.model_from_json(f.read())
		model.compile(loss='categorical_crossentropy', optimizer=Adam(lr=lr))
		self.actor_model = model

	def build_expert_model(self):
		model = Sequential()
		fc1 = Dense(self.hidden_size, input_shape=(self.state_dimension,), activation='relu',
					kernel_initializer=VarianceScaling(mode='fan_avg', distribution='normal'),
					kernel_regularizer=regularizers.l2(self.reg_cost))
		fc2 = Dense(self.hidden_size, activation='relu',
					kernel_initializer=VarianceScaling(mode='fan_avg', distribution='normal'),
					kernel_regularizer=regularizers.l2(self.reg_cost))

		fc3 = Dense(self.num_actions, activation='linear',
					kernel_initializer=VarianceScaling(mode='fan_avg', distribution='normal'),
					kernel_regularizer=regularizers.l2(self.reg_cost))
		model.add(fc1)
		model.add(fc2)
		model.add(fc3)
		# self.expert.model_from_json(self.expert_path)
		model.load_weights(self.expert_weights)
		return model

	def load_trained_DQN(self, path):
		""" Load the trained DQN weights from a file into the expert network """
		self.expert.load_weights(path)
		return self.expert
	
	def build_actor_model(self, actor_lr):
		model = Sequential()
		fc1 = Dense(50, input_shape=(self.state_dimension,), 
			activation='relu',
			kernel_initializer=VarianceScaling(mode='fan_avg',
			distribution='normal'), kernel_regularizer=regularizers.l2(0.01))
		fc2 = Dense(50, activation='relu',
			kernel_initializer=VarianceScaling(mode='fan_avg',
			distribution='normal'), kernel_regularizer=regularizers.l2(0.01))
		fc3 = Dense(self.num_actions, activation='softmax',
			kernel_initializer=VarianceScaling(mode='fan_avg',
			distribution='normal'), kernel_regularizer=regularizers.l2(0.01))
		model.add(fc1)
		model.add(fc2)
		model.add(fc3)
		model.compile(loss='mse', optimizer=Adam(lr=self.actor_lr))
		self.actor_model = model

	def build_critic_model(self, critic_lr, is_adverserial = False):
		model = Sequential()
		fc1 = Dense(50, input_shape=(self.state_dimension,), 
                        activation='relu',
			kernel_initializer=VarianceScaling(mode='fan_avg',
			distribution='normal'), 
                        kernel_regularizer=regularizers.l2(0.01))
		fc2 = Dense(50, activation='relu',
			kernel_initializer=VarianceScaling(mode='fan_avg',
			distribution='normal'), 
                        kernel_regularizer=regularizers.l2(0.01))
		fc3 = Dense(1, activation='relu',
			kernel_initializer=VarianceScaling(mode='fan_avg',
			distribution='normal'), 
                        kernel_regularizer=regularizers.l2(0.01))
		model.add(fc1)
		model.add(fc2)
		model.add(fc3)
		model.compile(loss='mse', optimizer=Adam(lr=critic_lr))
		if is_adverserial:
			self.adversarial_critic_model = model
		else:
			self.critic_model = model

	def build_discriminator(self, discriminator_lr):
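		## Discriminator over concatenated (state, one-hot action) pairs; the sigmoid output
		## estimates the probability that a pair came from the expert policy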
		model = Sequential()
		fc1 = Dense(50, input_shape=(self.state_dimension + self.num_actions ,), activation='relu',
					kernel_initializer=VarianceScaling(mode='fan_avg',
													   distribution='normal'))
		fc2 = Dense(50, activation='relu',
					kernel_initializer=VarianceScaling(mode='fan_avg',
													   distribution='normal'))
		fc3 = Dense(1, activation='sigmoid',
					kernel_initializer=VarianceScaling(mode='fan_avg',
													   distribution='normal'))
		model.add(fc1)
		model.add(fc2)
		model.add(fc3)
		model.compile(optimizer=Adam(lr=self.discriminator_lr) , loss='binary_crossentropy', metrics=['accuracy'])
		self.discriminator = model

	def initialize_episode(self):
		""" Initialize a new episode. This function is called every time a new episode is run. """

		self.current_slot_id = 0
		self.phase = 0
		self.request_set = ['moviename', 'starttime', 'city', 'date', 'theater', 'numberofpeople']

	def prepare_state_representation(self, state):
		""" Create the representation for each state """

		user_action = state['user_action']
		current_slots = state['current_slots']
		kb_results_dict = state['kb_results_dict']
		agent_last = state['agent_action']

		########################################################################
		#   Create one-hot of acts to represent the current user action
		########################################################################
		user_act_rep = np.zeros((1, self.act_cardinality))
		user_act_rep[0, self.act_set[user_action['diaact']]] = 1.0

		########################################################################
		#     Create bag of inform slots representation to represent the current user action
		########################################################################
		user_inform_slots_rep = np.zeros((1, self.slot_cardinality))
		for slot in user_action['inform_slots'].keys():
			user_inform_slots_rep[0, self.slot_set[slot]] = 1.0

		########################################################################
		#   Create bag of request slots representation to represent the current user action
		########################################################################
		user_request_slots_rep = np.zeros((1, self.slot_cardinality))
		for slot in user_action['request_slots'].keys():
			user_request_slots_rep[0, self.slot_set[slot]] = 1.0

		########################################################################
		#   Create bag of filled_in slots based on the current_slots
		########################################################################
		current_slots_rep = np.zeros((1, self.slot_cardinality))
		for slot in current_slots['inform_slots']:
			current_slots_rep[0, self.slot_set[slot]] = 1.0

		########################################################################
		#   Encode last agent act
		########################################################################
		agent_act_rep = np.zeros((1, self.act_cardinality))
		if agent_last:
			agent_act_rep[0, self.act_set[agent_last['diaact']]] = 1.0

		########################################################################
		#   Encode last agent inform slots
		########################################################################
		agent_inform_slots_rep = np.zeros((1, self.slot_cardinality))
		if agent_last:
			for slot in agent_last['inform_slots'].keys():
				agent_inform_slots_rep[0, self.slot_set[slot]] = 1.0

		########################################################################
		#   Encode last agent request slots
		########################################################################
		agent_request_slots_rep = np.zeros((1, self.slot_cardinality))
		if agent_last:
			for slot in agent_last['request_slots'].keys():
				agent_request_slots_rep[0, self.slot_set[slot]] = 1.0

		turn_rep = np.zeros((1, 1)) + state['turn'] / 10.

		########################################################################
		#  One-hot representation of the turn count?
		########################################################################
		turn_onehot_rep = np.zeros((1, self.max_turn))
		turn_onehot_rep[0, state['turn']] = 1.0

		########################################################################
		#   Representation of KB results (scaled counts)
		########################################################################
		kb_count_rep = np.zeros((1, self.slot_cardinality + 1)) + kb_results_dict['matching_all_constraints'] / 100.
		for slot in kb_results_dict:
			if slot in self.slot_set:
				kb_count_rep[0, self.slot_set[slot]] = kb_results_dict[slot] / 100.

		########################################################################
		#   Representation of KB results (binary)
		########################################################################
		kb_binary_rep = np.zeros((1, self.slot_cardinality + 1)) + np.sum(
			kb_results_dict['matching_all_constraints'] > 0.)
		for slot in kb_results_dict:
			if slot in self.slot_set:
				kb_binary_rep[0, self.slot_set[slot]] = np.sum(kb_results_dict[slot] > 0.)

		self.final_representation = np.hstack(
			[user_act_rep, user_inform_slots_rep, user_request_slots_rep, agent_act_rep, agent_inform_slots_rep,
			 agent_request_slots_rep, current_slots_rep, turn_rep, turn_onehot_rep, kb_binary_rep, kb_count_rep])
		self.final_representation = np.squeeze(self.final_representation)
		return self.final_representation

	def state_to_action(self, state):
		""" A2C: Input state, output action """
		## Dialogue manager calls this to fill the experience buffer ##
		## TODO: Fix this for A2C return multinomial output
		self.representation = self.prepare_state_representation(state)
		state_dim = self.representation.shape[0]
		# self.action = self.run_policy(self.representation)
		try:
			self.action = self.actor_model.predict_on_batch(
				self.representation.reshape(1, state_dim))
		except:
			ipdb.set_trace()
		self.action = self.action.squeeze(0)
		idx = np.random.choice(self.num_actions, 1, p=self.action)[0]
		act_slot_response = copy.deepcopy(self.feasible_actions[idx])
		return {'act_slot_response': act_slot_response, 'act_slot_value_response': None}, idx, self.action[idx]

	def rule_policy(self):
		""" Rule Policy """

		if self.current_slot_id < len(self.request_set):
			slot = self.request_set[self.current_slot_id]
			self.current_slot_id += 1

			act_slot_response = {}
			act_slot_response['diaact'] = "request"
			act_slot_response['inform_slots'] = {}
			act_slot_response['request_slots'] = {slot: "UNK"}
		elif self.phase == 0:
			act_slot_response = {'diaact': "inform", 'inform_slots': {'taskcomplete': "PLACEHOLDER"},
								 'request_slots': {}}
			self.phase += 1
		elif self.phase == 1:
			act_slot_response = {'diaact': "thanks", 'inform_slots': {}, 'request_slots': {}}

		return self.action_index(act_slot_response)

	def action_index(self, act_slot_response):
		""" Return the index of action """

		for (i, action) in enumerate(self.feasible_actions):
			if act_slot_response == action:
				return i
		print act_slot_response
		raise Exception("action index not found")

	def return_greedy_action(self, state_representation):
		# TODO: Fix this for A2C
		qvalues = self.actor_model.predict(np.asarray([state_representation]))
		return np.argmax(qvalues[0])

	def run_policy(self, representation):
		""" epsilon-greedy policy """
		# TODO: Remove this for A2C
		if random.random() < self.epsilon:
			return random.randint(0, self.num_actions - 1)
		else:
			if self.warm_start == 1:
				## if in training mode(not prediction) fill until you cant anymore
				if len(self.experience_replay_pool) > self.experience_replay_pool_size:
					self.warm_start = 2
				return self.rule_policy()
			else:
				# return self.expert.predict(representation, {}, predict_model=True)
				return self.return_greedy_action(representation)

	def get_advantage(self, states, rewards, is_adversary = False):
		T = len(rewards)
		v_end = np.zeros(T)
		gain = np.zeros(T)
		advantage = np.zeros(T)
		# states = [self.prepare_state_representation(x) for x in states]
		for t in reversed(range(len(rewards) - 1)):
			if t + self.n >= T:
				v_end[t] = 0
			else:
				if is_adversary:
					v_end[t] = self.adversarial_critic_model.predict(
						np.asarray([states[t + self.n]]))[0]
				else:
					v_end[t] = self.critic_model.predict(
						np.asarray([states[t + self.n]]))[0]
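			# n-step return: discounted rewards over the next n steps plus a bootstrapped value gamma^n * V(s_{t+n})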
			gain[t] = self.gamma ** self.n * v_end[t] + \
					  sum([(self.gamma ** k) * rewards[t + k] \
							   if t + k < T \
							   else self.gamma ** k * 0 \
						   for k in range(self.n)])
			if is_adversary:
				advantage[t] = gain[t] - self.adversarial_critic_model.predict(np.asarray(
					[states[t]]))[0]
			else:
				advantage[t] = gain[t] - self.critic_model.predict(np.asarray(
					[states[t]]))[0]
		return advantage, gain

	def generate_expert_episode(self):
		## TODO: Initialize expert policy as the policy that takes epsilon-greedy actions in the trained DQN agent
		states = []
		actions = []
		rewards = []
		done = False
		cumulative_reward = 0
		self.dialog_manager.initialize_episode()
		while not done:
			temp, idx, act = self.dialog_manager.next_turn()
			# temp bundles (episode_over, reward, ...) from the dialog manager
			episode_over, reward = temp[0], temp[1]
			cumulative_reward += reward
			done = episode_over
			if episode_over:
				if reward > 0:
					print "Expert episode: Success"
				else:
					print "Expert episode: Fail"
				# cumulative_turns += self.dialog_manager.state_tracker.turn_count
			states.append(self.dialog_manager.state)
			rewards.append(reward)
			actions.append(act)
		# indexes.append(idx)
		return states, actions, rewards

	def train(self, states, actions, rewards, indexes, gamma=0.99):
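		## A2C update on environment rewards, then a discriminator update on expert vs. simulated
		## (state, action) pairs, followed by a second actor-critic update driven by the discriminator-based reward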
		states = [self.prepare_state_representation(x) for x in states]
		advantage, gains = self.get_advantage(states, rewards)

		advantage = advantage.reshape(-1, 1)
		actions = np.asarray(actions)

		# L(\theta) from the handout
		# act_target[np.arange(len(states)), np.array(actions)] \
		#         = (np.array(discounted_rewards) - np.array(values))
		targets = advantage  # * actions
		act_target = np.zeros((len(states), self.num_actions))
		act_target[np.arange(len(states)), np.array(indexes)] \
			= targets.squeeze(1)
		states = np.asarray(states)
		# TODO: Check if we want to scale rewards
		rewards = np.asarray(rewards)
		tot_rewards = np.sum(rewards)

		self.actor_model.train_on_batch(states, act_target)
		self.critic_model.train_on_batch(states, gains)

		## sample from an expert episode and the current simulated episode
		## in Goodfellow's original paper, he does it k times
		## TODO - Done: adversarial expert episodes are generated by the best DQN policy
		expert_states, expert_actions, expert_rewards = self.generate_expert_episode()
		sampled_expert_index = np.random.randint(0, len(expert_states))
		one_hot_expert_action = np.zeros((1, self.num_actions))
		one_hot_expert_action[:, expert_actions[sampled_expert_index]] = 1
		sampled_expert_state = np.array(expert_states[sampled_expert_index])
		sampled_expert_state = np.expand_dims(sampled_expert_state, 0)
		sampled_expert_example = np.concatenate((sampled_expert_state, one_hot_expert_action), axis=1)
		sampled_simulated_index = np.random.randint(0, len(states))
		one_hot_simulated_action = np.zeros((1, self.num_actions))
		one_hot_simulated_action[:, actions[sampled_simulated_index]] = 1
		sampled_simulated_state = states[sampled_simulated_index]
		sampled_simulated_state = np.expand_dims(sampled_simulated_state, 0)
		sampled_simulated_example = np.concatenate((sampled_simulated_state, one_hot_simulated_action), axis=1)

		## train discriminator
		d_loss_real = self.discriminator.train_on_batch(sampled_expert_example,
														np.ones((self.discriminator_batch_size, 1)))
		d_loss_fake = self.discriminator.train_on_batch(sampled_simulated_example,
														np.zeros((self.discriminator_batch_size, 1)))
		d_loss = 0.5 * np.add(d_loss_real, d_loss_fake)

		## compute gan rewards
		## call predict on a batch of the current simulated  episodes to get the class value
		state_action_pairs = []
		for s, a in zip(states, actions):
			one_hot = np.zeros(self.num_actions)
			one_hot[a] = 1
			concat_s_a = np.concatenate((s, one_hot))
			state_action_pairs.append(concat_s_a)
		probability_simulation = self.discriminator.predict(np.array(state_action_pairs))
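		## adversarial reward -log(1 - D(s, a)) grows as the discriminator rates the simulated pair as more expert-like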
		gan_rewards = (-np.log(1 - probability_simulation)).flatten().tolist()

		''' Train gan actor-critic network '''
		gan_values = self.compute_baseline(states, isgan=True)
		gan_discounted_rewards = self.get_value_reward(states, gan_rewards, gan_values)
		gan_act_target = np.zeros((len(states), self.num_actions))
		gan_act_target[np.arange(len(states)), np.array(actions)] = (np.array(gan_discounted_rewards)
																	 - np.array(gan_values))
		gan_critic_target = np.array(gan_discounted_rewards)
		gan_actor_loss = self.actor_model.train_on_batch(states, gan_act_target)
		gan_critic_loss = self.adversarial_critic_model.train_on_batch(states, gan_critic_target)

		advantage, gains = self.get_advantage(states, rewards, True)
		targets = advantage #* actions
		act_target = np.zeros((len(states),self.num_actions))
		act_target[np.arange(len(states)), np.array(indexes)] \
			= targets.squeeze(1)

		self.actor_model.train_on_batch(states, act_target)
		self.critic_model.train_on_batch(states, gains)

		return tot_rewards

	def evaluate(self, env, episode, num_episodes=100, render=False):

		cumulative_rewards = []
		for e in range(num_episodes):
			state = env.reset()
			tot_reward = 0
			while True:
				action_probs = self.actor_model.predict(np.asarray([state]))
				action = np.random.choice(np.arange(
					len(action_probs[0])), p=action_probs[0])
				state, reward, done, _ = env.step(action)
				tot_reward += reward
				if done:
					break
			cumulative_rewards.append(tot_reward)
		mean_rewards = np.mean(cumulative_rewards)
		std_rewards = np.std(cumulative_rewards)
		return mean_rewards, std_rewards
Example #5
user_sim_planning.set_nlg_model(nlg_model)

################################################################################
# load trained NLU model
################################################################################
nlu_model_path = params['nlu_model_path']
nlu_model = nlu()
nlu_model.load_nlu_model(nlu_model_path)

agent.set_nlu_model(nlu_model)
user_sim.set_nlu_model(nlu_model)

################################################################################
# Dialog Manager
################################################################################
dialog_manager = DialogManager(agent, user_sim, user_sim_planning, act_set, slot_set, movie_kb, discriminator)

################################################################################
#   Run num_episodes Conversation Simulations
################################################################################
status = {'successes': 0, 'count': 0, 'cumulative_reward': 0}

simulation_epoch_size = params['simulation_epoch_size']
batch_size = params['batch_size'] # default = 16
warm_start = params['warm_start']
warm_start_epochs = params['warm_start_epochs']
planning_steps = params['planning_steps']

success_rate_threshold = params['success_rate_threshold']
save_check_point = params['save_check_point']
Example #6
File: run.py Project: zxsted/DDQ
################################################################################
# load trained NLU model
################################################################################
nlu_model_path = params['nlu_model_path']
nlu_model = nlu()
nlu_model.load_nlu_model(nlu_model_path)

agent.set_nlu_model(nlu_model)
user_sim.set_nlu_model(nlu_model)
world_model.set_nlu_model(nlu_model)

################################################################################
# Dialog Manager
################################################################################
dialog_manager = DialogManager(agent, user_sim, world_model, act_set, slot_set, movie_kb)

################################################################################
#   Run num_episodes Conversation Simulations
################################################################################
status = {'successes': 0, 'count': 0, 'cumulative_reward': 0}

simulation_epoch_size = params['simulation_epoch_size']
batch_size = params['batch_size']  # default = 16
warm_start = params['warm_start']
warm_start_epochs = params['warm_start_epochs']
planning_steps = params['planning_steps']

agent.planning_steps = planning_steps

success_rate_threshold = params['success_rate_threshold']
Example #7
            inputtype=params['input'], tr=params['tr'], ts=params['ts'], frac=params['frac'],
            max_req=params['max_req'], upd=params['upd'], name=params['model_name'])
elif agent_type=='nl-rule-hard':
    agent = AgentNLRuleHard(movie_kb, act_set, slot_set, db_inc, corpus_path,
            ts=params['ts'], frac=params['frac'],
            max_req=params['max_req'], upd=params['upd'])
elif agent_type=='nl-rule-soft':
    agent = AgentNLRuleSoft(movie_kb, act_set, slot_set, db_inc, corpus_path,
            tr=params['tr'], ts=params['ts'], frac=params['frac'],
            max_req=params['max_req'], upd=params['upd'])
else:
    agent = AgentNLRuleNoDB(movie_kb, act_set, slot_set, db_inc, corpus_path,
            ts=params['ts'], frac=params['frac'],
            max_req=params['max_req'], upd=params['upd'])

dialog_manager = DialogManager(agent, user_sim, db_full, db_inc, movie_kb, verbose=False)

all_rewards = np.zeros((N,))
all_success = np.zeros((N,))
all_turns = np.zeros((N,))
if save_path is not None: fs = io.open(save_path, 'w')
tst = time.time()

for i in range(N):
    current_reward = 0
    current_success = False
    ua = dialog_manager.initialize_episode()
    utt = ua['nl_sentence']
    if save_path is not None: fs.write(utt+'\n')
    t = 0
    while(True):
Example #8
################################################################################
# load trained NLG model
################################################################################
agent.set_nlg_model(nlg_model)
user_sim.set_nlg_model(nlg_model)

################################################################################
# load trained NLU model
################################################################################

agent.set_nlu_model(nlu_model)
user_sim.set_nlu_model(nlu_model)

################################################################################
# Dialog Manager
################################################################################
dialog_manager = DialogManager(agent, user_sim, act_set, slot_set, movie_kb,
                               params['is_a2c'])

################################################################################
#   Run num_episodes Conversation Simulations
################################################################################
status = {'successes': 0, 'count': 0, 'cumulative_reward': 0}

simulation_epoch_size = params['simulation_epoch_size']
batch_size = params['batch_size']  # default = 16
warm_start = params['warm_start']
warm_start_epochs = params['warm_start_epochs']

success_rate_threshold = params['success_rate_threshold']
save_check_point = params['save_check_point']
""" Best Model and Performance Records """
best_model = {}
Example #9
                                  lr=params['lr'],
                                  N=params['featN'],
                                  tr=params['tr'],
                                  ts=params['ts'],
                                  frac=params['frac'],
                                  max_req=params['max_req'],
                                  upd=params['upd'],
                                  name=params['model_name'],
                                  seq_max_len=params['seq_max_len'])
else:
    print "Invalid agent!"
    sys.exit()

dialog_manager = DialogManager(agent,
                               user_sim,
                               db_full,
                               db_inc,
                               movie_kb,
                               verbose=False)
dialog_manager_eval = DialogManager(agent_eval,
                                    user_sim,
                                    db_full,
                                    db_inc,
                                    movie_kb,
                                    verbose=False)


def eval_agent(ite, max_perf, best=False):
    '''
    Automatically run 2000 dialogs to evaluate the agent; the main metric is the average reward per dialog.
    :param ite: batch_size * 100 * N
    :param max_perf: the highest average reward on test data seen so far
Example #10
        else:
            curr_user_goals["all"] = []
            curr_user_goals["all"].extend(copy.deepcopy(train_user_goals))

        # create pretrain user simulator
        pretrain_user_sim = RuleSimulator(mock_dictionary, act_set, slot_set, copy.deepcopy(curr_user_goals), pretrain_usersim_params)
        # create not a pre-trained user simulator
        user_sim = RuleSimulator(mock_dictionary, act_set, slot_set, copy.deepcopy(curr_user_goals), usersim_params)

        # create the pre-trained agent
        pretrained_agent = AgentDQN(kb, act_set, slot_set, pretrained_agent_params)
        # create the agent from scratch
        agent = AgentDQN(kb, act_set, slot_set, agent_params)

        # create dialogue manager for pre-trained agent
        pretrain_dialog_manager = DialogManager(pretrained_agent, pretrain_user_sim, act_set, slot_set, kb)

        # create dialogue manager for not pre-trained agent
        dialog_manager = DialogManager(agent, user_sim, act_set, slot_set, kb)

        # the warmup success rate of the pre-trained model
        pretrain_warmup_succ_rate = 0
        # the warmup success rate of the scratch model
        warmup_succ_rate = 0


        # warm-start the pre-trained agent
        pretrain_warmup_res = warm_start_simulation(pretrain_dialog_manager, pretrained_agent, copy.deepcopy(curr_user_goals["all"]), 2, 8, "pretrain")


        # warm-start the agent from scratch
Example #11
uname = raw_input("Please Enter User Name: ").lower()
uid = hash(uname)

cdir = "sessions/"+str(uid)+'_'+datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S')+"/"
if not os.path.exists(cdir): os.makedirs(cdir)
f = open(os.path.join(cdir,'credentials'), 'w')
f.write(uname)
f.close()
try:
    for i in range(N):
        print "--------------------------------------------------------------------------------"
        print "Dialog %d" %i
        dia = []
        curr_agent = agent
        dia.append(curr_agent)
        dialog_manager = DialogManager(curr_agent, user_sim, db_full, db_inc, movie_kb, verbose=False)
        utt = dialog_manager.initialize_episode()
        dia.append(copy.deepcopy(utt))
        total_reward = 0
        while(True):
            episode_over, reward, utt, agact = dialog_manager.next_turn()
            dia.append(agact)
            dia.append(copy.deepcopy(utt))
            total_reward += reward
            if episode_over:
                break
        pkl.dump(dia, open(cdir+str(i)+".p",'w'))
except KeyboardInterrupt:
    sys.exit()