class DDPG_REC:

    def __init__(self, state_item_num, action_item_num, emb_dim, batch_size, tau,
                 actor_lr, critic_lr, gamma, buffer_size, item_space, summary_dir):
        self.state_item_num = state_item_num
        self.action_item_num = action_item_num
        self.emb_dim = emb_dim
        self.batch_size = batch_size
        self.tau = tau
        self.actor_lr = actor_lr
        self.critic_lr = critic_lr
        self.gamma = gamma
        self.buffer_size = buffer_size
        self.item_space = item_space
        self.summary_dir = summary_dir

        self.sess = tf.Session()

        # flattened state/action dimensions: one embedding per item slot
        self.s_dim = emb_dim * state_item_num
        self.a_dim = emb_dim * action_item_num
        self.actor = Actor(self.sess, state_item_num, action_item_num, emb_dim,
                           batch_size, tau, actor_lr)
        self.critic = Critic(self.sess, state_item_num, action_item_num, emb_dim,
                             self.actor.get_num_trainable_vars(), gamma, tau, critic_lr)
        self.exploration_noise = OUNoise(self.a_dim)

        # set up summary operators
        self.summary_ops, self.summary_vars = self.build_summaries()
        self.sess.run(tf.global_variables_initializer())
        self.writer = tf.summary.FileWriter(summary_dir, self.sess.graph)

        # initialize target network weights
        self.actor.hard_update_target_network()
        self.critic.hard_update_target_network()

        # initialize replay memory
        self.replay_buffer = ReplayBuffer(buffer_size)

    def gene_actions(self, weight_batch):
        """Use the actor network output to build a recommendation list per sample.

        Args:
            weight_batch: actor network outputs, one weight matrix per sample.

        Returns:
            A list of recommendation lists (item ids), one per sample.
        """
        item_ids = list(self.item_space.keys())
        item_weights = list(self.item_space.values())
        max_ids = list()
        for weight in weight_batch:
            # score every candidate item against each action slot, keep the best item per slot
            score = np.dot(item_weights, np.transpose(weight))
            idx = np.argmax(score, 0)
            max_ids.append([item_ids[_] for _ in idx])
        return max_ids

    @staticmethod
    def build_summaries():
        episode_reward = tf.Variable(0.)
        tf.summary.scalar("reward", episode_reward)
        episode_max_q = tf.Variable(0.)
        tf.summary.scalar("max_q_value", episode_max_q)
        critic_loss = tf.Variable(0.)
        tf.summary.scalar("critic_loss", critic_loss)
        summary_vars = [episode_reward, episode_max_q, critic_loss]
        summary_ops = tf.summary.merge_all()
        return summary_ops, summary_vars

    def _train(self):
        samples = self.replay_buffer.sample_batch(self.batch_size)
        state_batch = np.asarray([_[0] for _ in samples])
        action_batch = np.asarray([_[1] for _ in samples])
        reward_batch = np.asarray([_[2] for _ in samples])
        n_state_batch = np.asarray([_[3] for _ in samples])
        done_batch = np.asarray([_[4] for _ in samples])
        seq_len_batch = np.asarray([self.state_item_num] * self.batch_size)

        # calculate the predicted q value from the target networks
        action_weights = self.actor.predict_target(state_batch, seq_len_batch)
        n_action_batch = self.gene_actions(
            action_weights.reshape((-1, self.action_item_num, self.emb_dim)))
        n_action_emb_batch = get_item_emb(n_action_batch, item_ids_emb_dict)
        target_q_batch = self.critic.predict_target(
            n_state_batch.reshape((-1, self.s_dim)),
            n_action_emb_batch.reshape((-1, self.a_dim)),
            seq_len_batch)
        y_batch = []
        for i in range(self.batch_size):
            if done_batch[i]:
                y_batch.append(reward_batch[i])
            else:
                y_batch.append(reward_batch[i] + self.critic.gamma * target_q_batch[i])

        # train critic
        q_value, critic_loss, _ = self.critic.train(
            state_batch, action_batch,
            np.reshape(y_batch, (self.batch_size, 1)), seq_len_batch)

        # train actor
        action_weight_batch_for_gradients = self.actor.predict(state_batch, seq_len_batch)
        action_batch_for_gradients = self.gene_actions(action_weight_batch_for_gradients)
        action_emb_batch_for_gradients = get_item_emb(action_batch_for_gradients, item_ids_emb_dict)
        a_gradient_batch = self.critic.action_gradients(
            state_batch,
            action_emb_batch_for_gradients.reshape((-1, self.a_dim)),
            seq_len_batch)
        self.actor.train(state_batch, a_gradient_batch[0], seq_len_batch)

        # update target networks
        self.actor.update_target_network()
        self.critic.update_target_network()

        return np.amax(q_value), critic_loss

    def action(self, state):
        # actor output plus exploration noise, then mapped to concrete item ids
        weight = self.actor.predict(np.reshape(state, [1, self.s_dim]),
                                    np.array([self.state_item_num])) + \
            self.exploration_noise.noise().reshape(
                (1, self.action_item_num, int(self.a_dim / self.action_item_num)))
        action = self.gene_actions(weight)
        return np.array(action[0])

    def perceive_and_train(self, state, action, reward, n_state, done):
        action_emb = get_item_emb(action, item_ids_emb_dict)
        self.replay_buffer.add(list(state.reshape((self.s_dim,))),
                               list(action_emb.reshape((self.a_dim,))),
                               [reward],
                               list(n_state.reshape((self.s_dim,))),
                               [done])

        # store transitions until the buffer holds more than one batch, then start training
        ep_q_value_, critic_loss = 0, 0
        if self.replay_buffer.size() > self.batch_size:
            ep_q_value_, critic_loss = self._train()

        # re-initialize the random process when an episode ends
        if done:
            self.exploration_noise.reset()

        return ep_q_value_, critic_loss

    def write_summary(self, ep_reward, ep_q_value, loss, i):
        summary_str = self.sess.run(self.summary_ops,
                                    feed_dict={self.summary_vars[0]: ep_reward,
                                               self.summary_vars[1]: ep_q_value,
                                               self.summary_vars[2]: loss})
        self.writer.add_summary(summary_str, i)

    def save(self):
        self.writer.close()
        saver = tf.train.Saver()
        ckpt_path = os.path.join(os.path.dirname(__file__), "models")
        saver.save(self.sess, ckpt_path, write_meta_graph=False)
class Agent:

    def __init__(self, experiment, batch_size):
        self._dummy_env = gym.make(experiment)
        self._sess = tf.Session()
        self._sum_writer = tf.summary.FileWriter('logs/', self._sess.graph)

        # Hardcoded for now
        self._dim_state = 25
        self._dim_goal = 3
        self._dim_action = self._dummy_env.action_space.shape[0]
        self._dim_env = 1
        self._batch_size = batch_size

        # agent noise
        self._action_noise = OrnsteinUhlenbeckActionNoise(mu=np.zeros(self._dim_action))

        self._actor = Actor(self._sess, self._dim_state, self._dim_goal,
                            self._dim_action, self._dummy_env, TAU, LEARNING_RATE,
                            self._batch_size)
        self._critic = Critic(self._sess, self._dim_state, self._dim_goal,
                              self._dim_action, self._dim_env, self._dummy_env, TAU,
                              LEARNING_RATE, self._actor.get_num_trainable_vars(),
                              self._sum_writer)

        self._saver = tf.train.Saver(max_to_keep=None)
        self._sess.run(tf.global_variables_initializer())

        self._actor.initialize_target_network()
        self._critic.initialize_target_network()

        # training monitoring: a scalar summary fed from Python with the success rate
        self._success_rate = tf.Variable(0., name="success_rate")
        self._python_success_rate = tf.placeholder("float32", [])
        self._update_success_rate = self._success_rate.assign(self._python_success_rate)
        self._merged = tf.summary.scalar("successrate", self._update_success_rate)

    def get_dim_state(self):
        return self._dim_state

    def get_dim_action(self):
        return self._dim_action

    def get_dim_env(self):
        return self._dim_env

    def get_dim_goal(self):
        return self._dim_goal

    def evaluate_actor(self, actor_predict, obs, goal, history):
        assert history.shape[0] == MAX_STEPS, "history must be of size MAX_STEPS"

        obs = obs.reshape(1, self._dim_state)
        goal = goal.reshape(1, self._dim_goal)
        history = history.reshape(1, history.shape[0], history.shape[1])

        return actor_predict(obs, goal, history)

    def evaluate_actor_batch(self, actor_predict, obs, goal, history):
        return actor_predict(obs, goal, history)

    def evaluate_critic(self, critic_predict, obs, action, goal, history, env):
        obs = obs.reshape(1, self._dim_state)
        goal = goal.reshape(1, self._dim_goal)
        action = action.reshape(1, self._dim_action)
        history = history.reshape(1, history.shape[0], history.shape[1])
        env = env.reshape(1, self._dim_env)

        return critic_predict(env, obs, goal, action, history)

    def evaluate_critic_batch(self, critic_predict, obs, action, goal, history, env):
        return critic_predict(env, obs, goal, action, history)

    def train_critic(self, obs, action, goal, history, env, predicted_q_value):
        return self._critic.train(env, obs, goal, action, history, predicted_q_value)

    def train_actor(self, obs, goal, history, a_gradient):
        return self._actor.train(obs, goal, history, a_gradient)

    def action_gradients_critic(self, obs, action, goal, history, env):
        return self._critic.action_gradients(env, obs, goal, action, history)

    def update_target_actor(self):
        self._actor.update_target_network()

    def update_target_critic(self):
        self._critic.update_target_network()

    def action_noise(self):
        return self._action_noise()

    def update_success(self, success_rate, step):
        _, result = self._sess.run(
            [self._update_success_rate, self._merged],
            feed_dict={self._python_success_rate: success_rate})
        self._sum_writer.add_summary(result, step)

    def save_model(self, filename):
        self._saver.save(self._sess, filename)

    def load_model(self, filename):
        self._saver.restore(self._sess, filename)
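# The agent classes above depend on Actor/Critic implementations that expose
# update_target_network() (and hard_update_target_network() / initialize_target_network()),
# parameterised by a constant TAU. A common TF1-style way to build those ops is sketched
# below; the function names are illustrative, not the ones used by the actual Actor/Critic code.
import tensorflow as tf


def build_soft_update_ops(online_vars, target_vars, tau):
    """Ops that move each target variable a fraction tau towards its online counterpart."""
    return [target.assign(tau * online + (1.0 - tau) * target)
            for online, target in zip(online_vars, target_vars)]


def build_hard_update_ops(online_vars, target_vars):
    """Ops that copy the online weights into the target network verbatim."""
    return [target.assign(online) for online, target in zip(online_vars, target_vars)]

# Typical usage: run the hard-update ops once right after variable initialisation and the
# soft-update ops after every training step, which is what the agents above do through
# hard_update_target_network()/initialize_target_network() and update_target_network().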
class DDPG:

    def __init__(self, env, batch_size, mem_size, discount, actor_params, critic_params):
        self._batch_size = batch_size
        self._mem_size = mem_size
        self._discount = discount

        self._sess = tensorflow.Session()
        k_backend.set_session(self._sess)

        self._env = env
        self._state_dim = env.observation_space.shape[0]
        self._action_dim = env.action_space.shape[0]
        self._action_min = env.action_space.low
        self._action_max = env.action_space.high
        self._state_min = env.observation_space.low
        self._state_max = env.observation_space.high

        self._actor = Actor(self._sess, self._state_dim, self._action_dim,
                            self._action_min, self._action_max, actor_params)
        self._critic = Critic(self._sess, 0.5, self._state_dim, self._action_dim,
                              critic_params)
        self._memory = ReplayBuffer(mem_size)

    def get_action(self, state):
        return self._actor._model.predict(state)

    def train(self):
        '''
        No training takes place until the replay buffer contains
        at least batch_size experiences.
        '''
        if self._memory.size() > self._batch_size:
            self._train()

    def _train(self):
        states, actions, rewards, done, next_states = self._memory.sample(self._batch_size)
        self._train_critic(states, actions, rewards, done, next_states)
        action_gradients = self._critic.action_gradients(states, actions)
        self._actor.train(states, action_gradients)

    def q_estimate(self, state, action):
        # the critic model takes both the state and the action as inputs
        return self._critic._model.predict([state, action])

    def _get_q_targets(self, next_states, done, rewards):
        '''
        q = r                  if done
          = r + gamma * qnext  otherwise
        '''
        # use the actor network to pick the next action under the current policy,
        # then estimate its Q value with the critic network
        actions = self.get_action(next_states)
        qnext = self.q_estimate(next_states, actions)
        q_targets = [
            reward if end else reward + self._discount * next_q
            for (reward, next_q, end) in zip(rewards, qnext, done)
        ]
        return q_targets

    def _train_critic(self, states, actions, rewards, done, next_states):
        q_targets = self._get_q_targets(next_states, done, rewards)
        self._critic.train(states, actions, q_targets)

    def experience(self, state, action, reward, done, next_state):
        # store the transition in the replay buffer, then run a training step if possible
        self._memory.add(state, action, reward, done, next_state)
        self.train()
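# DDPG_REC and DDPG both store transitions in a ReplayBuffer whose implementation is not
# shown here. A minimal deque-based sketch is given below. Note that the two classes use
# it differently: DDPG_REC calls sample_batch() and unpacks the returned tuples itself,
# while DDPG calls sample() and expects five separate arrays, so the method name and
# return shape below are illustrative assumptions rather than the actual interface.
import random
from collections import deque


class ReplayBufferSketch:
    """Fixed-size FIFO buffer of transitions with uniform random sampling."""

    def __init__(self, buffer_size):
        self._buffer = deque(maxlen=buffer_size)

    def add(self, *transition):
        # oldest transitions are discarded automatically once buffer_size is reached
        self._buffer.append(transition)

    def size(self):
        return len(self._buffer)

    def sample_batch(self, batch_size):
        # uniform sampling without replacement; callers guard on size() > batch_size
        return random.sample(self._buffer, batch_size)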