def main():
    ''' Create the environment '''
    env = gym.make(ENV_NAME)

    # For tensorboard
    writer = tf.summary.FileWriter("./tensorboard")

    assert STATE_DIM == np.prod(np.array(env.observation_space.shape))
    assert ACTION_DIM == np.prod(np.array(env.action_space.shape))

    env.seed(0)
    np.random.seed(0)

    ''' Create the replay memory '''
    replay_memory = Memory(REPLAY_MEM_CAPACITY)

    # Tensorflow part starts here!
    tf.reset_default_graph()

    ''' Create placeholders '''
    state_placeholder = tf.placeholder(dtype=tf.float32,
                                       shape=[None, STATE_DIM],
                                       name='state_placeholder')
    action_placeholder = tf.placeholder(dtype=tf.float32,
                                        shape=[None, ACTION_DIM],
                                        name='action_placeholder')
    reward_placeholder = tf.placeholder(dtype=tf.float32,
                                        shape=[None],
                                        name='reward_placeholder')
    next_state_placeholder = tf.placeholder(dtype=tf.float32,
                                            shape=[None, STATE_DIM],
                                            name='next_state_placeholder')
    is_not_terminal_placeholder = tf.placeholder(
        dtype=tf.float32, shape=[None], name='is_not_terminal_placeholder')
    is_training_placeholder = tf.placeholder(dtype=tf.float32,
                                             shape=(),
                                             name='is_training_placeholder')

    ''' A counter to count the number of episodes '''
    episodes = tf.Variable(0.0, trainable=False, name='episodes')
    episode_incr_op = episodes.assign_add(1)

    '''
    Create the actor network inside the actor scope and calculate actions
    '''
    with tf.variable_scope('actor'):
        actor = ActorNetwork(STATE_DIM, ACTION_DIM, HIDDEN_1_ACTOR,
                             HIDDEN_2_ACTOR, HIDDEN_3_ACTOR, trainable=True)
        unscaled_actions = actor.call(state_placeholder)

        '''
        Scale the actions to fit within the bounds provided by the environment
        '''
        actions = scale_actions(unscaled_actions, env.action_space.low,
                                env.action_space.high)

    '''
    Create the target actor network inside the target_actor scope and
    calculate the target actions. Apply stop_gradient to the target actions
    so that their gradient is never computed.
    '''
    with tf.variable_scope('target_actor', reuse=False):
        target_actor = ActorNetwork(STATE_DIM, ACTION_DIM, HIDDEN_1_ACTOR,
                                    HIDDEN_2_ACTOR, HIDDEN_3_ACTOR,
                                    trainable=True)
        unscaled_target_actions = target_actor.call(next_state_placeholder)

        '''
        Scale the actions to fit within the bounds provided by the environment
        '''
        target_actions_temp = scale_actions(unscaled_target_actions,
                                            env.action_space.low,
                                            env.action_space.high)
        target_actions = tf.stop_gradient(target_actions_temp)

    '''
    Create the critic network inside the critic variable scope. Get the
    Q-values of the given actions and the Q-values of the actions suggested
    by the actor network.
    '''
    with tf.variable_scope('critic'):
        critic = CriticNetwork(STATE_DIM, ACTION_DIM, HIDDEN_1_CRITIC,
                               HIDDEN_2_CRITIC, HIDDEN_3_CRITIC,
                               trainable=True)
        q_values_of_given_actions = critic.call(state_placeholder,
                                                action_placeholder)
        q_values_of_suggested_actions = critic.call(state_placeholder, actions)

    '''
    Create the target critic network inside the target_critic variable scope.
    Calculate the target Q-values and apply stop_gradient to them.
    '''
    with tf.variable_scope('target_critic', reuse=False):
        target_critic = CriticNetwork(STATE_DIM, ACTION_DIM, HIDDEN_1_CRITIC,
                                      HIDDEN_2_CRITIC, HIDDEN_3_CRITIC,
                                      trainable=True)
        target_q_values_temp = target_critic.call(next_state_placeholder,
                                                  target_actions)
        target_q_values = tf.stop_gradient(target_q_values_temp)

    '''
    Calculate
    - the trainable variables in actor (weights of the actor network),
    - the weights of the target actor network,
    - the trainable variables in critic (weights of the critic network),
    - the weights of the target critic network.
    '''
    actor_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                   scope='actor')
    target_actor_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                          scope='target_actor')
    critic_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                    scope='critic')
    target_critic_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                           scope='target_critic')

    '''
    Get the operators for updating the target networks. The
    update_target_networks function defined in utils returns a list of
    operators to be run from a tf session in order to update the target
    networks using soft updates.
    '''
    update_targets_op = update_target_networks(TAU, target_actor_vars,
                                               actor_vars, target_critic_vars,
                                               critic_vars)

    '''
    Create the tf operation to train the critic network:
    - calculate the TD-target
    - calculate the TD-error = TD-target - q_values_of_given_actions
    - calculate the critic network's loss (mean squared error of the TD-errors)
    - add L2 regularization of the critic's weights to the loss
    - create a tf operation to train the critic network
    '''
    targets = tf.expand_dims(reward_placeholder, 1) + \
        tf.expand_dims(is_not_terminal_placeholder, 1) * GAMMA * \
        target_q_values
    td_errors = targets - q_values_of_given_actions
    critic_loss = tf.reduce_mean(tf.square(td_errors))

    # Add L2 regularization on the critic's weights (biases are excluded)
    for var in critic_vars:
        if 'bias' not in var.name:
            critic_loss += L2_REG_CRITIC * 0.5 * tf.nn.l2_loss(var)

    # Optimize critic
    critic_train_op = tf.train.AdamOptimizer(
        LEARNING_RATE_CRITIC * LR_DECAY**episodes).minimize(critic_loss)

    '''
    Create a tf operation to train the actor network:
    - calculate the actor network's loss
    - create the tf operation to train the actor network
    '''
    # Actor's loss
    actor_loss = -1 * tf.reduce_mean(q_values_of_suggested_actions)
    for var in actor_vars:
        if 'bias' not in var.name:
            actor_loss += L2_REG_ACTOR * 0.5 * tf.nn.l2_loss(var)

    # Optimize actor
    actor_train_op = tf.train.AdamOptimizer(
        LEARNING_RATE_ACTOR * LR_DECAY**episodes).minimize(
            actor_loss, var_list=actor_vars)

    # Init session
    sess = tf.Session()
    sess.run(tf.global_variables_initializer())
    writer.add_graph(sess.graph)

    # Training
    num_steps = 0
    for episode in range(NUM_EPISODES):
        total_reward = 0
        num_steps_in_episode = 0

        # Create noise
        noise = np.zeros(ACTION_DIM)
        noise_scale = (INITIAL_NOISE_SCALE * NOISE_DECAY ** episode) * \
            (env.action_space.high - env.action_space.low)

        # Initial state
        state = env.reset()

        for _ in range(MAX_STEPS_PER_EPISODE):
            action = sess.run(actions, feed_dict={
                state_placeholder: state[None],
                is_training_placeholder: False})

            # Add exploration noise to the action (Ornstein-Uhlenbeck process:
            # the previous noise value is kept and perturbed)
            noise += EXPLORATION_THETA * (EXPLORATION_MU - noise) + \
                EXPLORATION_SIGMA * np.random.randn(ACTION_DIM)
            action += noise_scale * noise

            # Take action on env
            next_state, reward, done, _info = env.step(action)
            next_state = np.squeeze(next_state)
            reward = np.squeeze(reward)
            action = action[0]

            total_reward += reward

            replay_memory.add_to_memory(
                (state, action, reward, next_state, 0.0 if done else 1.0))

            if num_steps % TRAIN_EVERY == 0 and \
                    replay_memory.size() >= MINI_BATCH_SIZE:
                batch = replay_memory.sample_from_memory(MINI_BATCH_SIZE)

                _, _ = sess.run([critic_train_op, actor_train_op],
                                feed_dict={
                    state_placeholder: np.asarray(
                        [elem[0] for elem in batch]),
                    action_placeholder: np.asarray(
                        [elem[1] for elem in batch]),
                    reward_placeholder: np.asarray(
                        [elem[2] for elem in batch]),
                    next_state_placeholder: np.asarray(
                        [elem[3] for elem in batch]),
                    is_not_terminal_placeholder: np.asarray(
                        [elem[4] for elem in batch]),
                    is_training_placeholder: True
                })
                _ = sess.run(update_targets_op)

            state = next_state
            num_steps += 1
            num_steps_in_episode += 1

            if done:
                _ = sess.run(episode_incr_op)
                break

        print(str((episode, total_reward, num_steps_in_episode, noise_scale)))

    env.close()
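# main() above relies on scale_actions and update_target_networks from a utils
# module that is not shown in this section. The sketch below is an assumption
# of what they might look like, not the author's implementation: scale_actions
# assumes the actor's unscaled output lies in [0, 1], and
# update_target_networks builds the soft-update ops described in the docstring
# above (target <- tau * online + (1 - tau) * target), one assign per variable.


def scale_actions(unscaled_actions, low, high):
    # Hypothetical helper: map an output in [0, 1] to the env's [low, high].
    return unscaled_actions * (high - low) + low


def update_target_networks(tau, target_actor_vars, actor_vars,
                           target_critic_vars, critic_vars):
    # Hypothetical helper: returns a list of soft-update ops to run in a
    # tf session, pairing each target variable with its online counterpart.
    update_ops = []
    for target_var, var in zip(target_actor_vars + target_critic_vars,
                               actor_vars + critic_vars):
        update_ops.append(
            tf.assign(target_var, tau * var + (1.0 - tau) * target_var))
    return update_ops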
class DDPGAgent:
    def __init__(self, state_dim, action_dim, action_max):
        # load model if True
        self.load_model = False

        tf.reset_default_graph()
        self.sess = tf.Session(config=tf.ConfigProto(
            gpu_options=tf.GPUOptions(allow_growth=True)))

        # information of state and action
        self.state_dim = state_dim
        self.action_dim = action_dim
        self.action_max = float(action_max)
        self.action_min = -float(action_max)

        # hyper parameters
        self.h_critic = 16
        self.h_actor = 16
        self.lr_critic = 1e-3
        self.lr_actor = 1e-4
        self.discount_factor = 0.99
        self.tau = 0.01  # soft target update rate

        self.state_ph = tf.placeholder(dtype=tf.float32,
                                       shape=[None, self.state_dim])
        self.reward_ph = tf.placeholder(dtype=tf.float32, shape=[None])
        self.next_state_ph = tf.placeholder(dtype=tf.float32,
                                            shape=[None, self.state_dim])
        self.done_ph = tf.placeholder(dtype=tf.float32, shape=[None])

        with tf.variable_scope('actor'):
            self.action = self.generate_actor_network(self.state_ph, True)
        with tf.variable_scope('target_actor'):
            self.target_action = self.generate_actor_network(
                self.next_state_ph, False)
        with tf.variable_scope('critic'):
            self.qvalue = self.generate_critic_network(self.state_ph,
                                                       self.action, True)
        with tf.variable_scope('target_critic'):
            self.target_qvalue = self.generate_critic_network(
                self.next_state_ph, self.target_action, False)

        self.a_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                          scope='actor')
        self.ta_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                           scope='target_actor')
        self.c_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                          scope='critic')
        self.tc_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                           scope='target_critic')

        q_target = tf.expand_dims(
            self.reward_ph, 1) + self.discount_factor * self.target_qvalue * (
                1 - tf.expand_dims(self.done_ph, 1))
        td_errors = q_target - self.qvalue
        critic_loss = tf.reduce_mean(tf.square(td_errors))
        self.train_critic = tf.train.AdamOptimizer(self.lr_critic).minimize(
            critic_loss, var_list=self.c_params)

        actor_loss = -tf.reduce_mean(self.qvalue)
        self.train_actor = tf.train.AdamOptimizer(self.lr_actor).minimize(
            actor_loss, var_list=self.a_params)

        self.soft_target_update = [[
            tf.assign(ta, (1 - self.tau) * ta + self.tau * a),
            tf.assign(tc, (1 - self.tau) * tc + self.tau * c)
        ] for a, ta, c, tc in zip(self.a_params, self.ta_params,
                                  self.c_params, self.tc_params)]

        # exploration
        self.epsilon = 1.
        self.epsilon_start, self.epsilon_end = 1.0, 0
        self.exploration_steps = 100000.
        self.epsilon_decay_step = (self.epsilon_start -
                                   self.epsilon_end) / self.exploration_steps
        self.noise = np.zeros(action_dim)

        self.minibatch_size = 32
        self.pre_train_step = 3
        self.replay_buffer = ReplayBuffer(buffer_size=1000000,
                                          minibatch_size=self.minibatch_size)

        self.mu = 0
        self.theta = 0.15
        self.sigma = 0.2

        # tensorboard setting
        self.avg_q_max, self.loss_sum = 0, 0
        self.summary_placeholders, self.update_ops, self.summary_op = \
            self.setup_summary()
        self.summary_writer = tf.summary.FileWriter('summary/simple_ddpg',
                                                    self.sess.graph)

        self.sess.run(tf.global_variables_initializer())

        self.save_file = "./save_model/tensorflow_ddpg-1"
        self.load_file = "./save_model/tensorflow_ddpg-1"
        self.saver = tf.train.Saver()
        if self.load_model:
            self.saver.restore(self.sess, self.load_file)

    def choose_action(self, state):
        return self.sess.run(self.action,
                             feed_dict={self.state_ph: state[None]})[0]

    def train_network(self, state, action, reward, next_state, done, step):
        self.sess.run(self.train_critic,
                      feed_dict={
                          self.state_ph: state,
                          self.action: action,
                          self.reward_ph: reward,
                          self.next_state_ph: next_state,
                          self.done_ph: done
                      })
        self.sess.run(self.train_actor, feed_dict={self.state_ph: state})
        self.sess.run(self.soft_target_update)

    def generate_critic_network(self, state, action, trainable):
        hidden1 = tf.layers.dense(tf.concat([state, action], axis=1),
                                  self.h_critic,
                                  activation=tf.nn.relu,
                                  trainable=trainable)
        hidden2 = tf.layers.dense(hidden1,
                                  self.h_critic,
                                  activation=tf.nn.relu,
                                  trainable=trainable)
        hidden3 = tf.layers.dense(hidden2,
                                  self.h_critic,
                                  activation=tf.nn.relu,
                                  trainable=trainable)
        qvalue = tf.layers.dense(hidden3, 1, trainable=trainable)
        return qvalue

    def generate_actor_network(self, state, trainable):
        hidden1 = tf.layers.dense(state,
                                  self.h_actor,
                                  activation=tf.nn.relu,
                                  trainable=trainable)
        hidden2 = tf.layers.dense(hidden1,
                                  self.h_actor,
                                  activation=tf.nn.relu,
                                  trainable=trainable)
        hidden3 = tf.layers.dense(hidden2,
                                  self.h_actor,
                                  activation=tf.nn.relu,
                                  trainable=trainable)
        non_scaled_action = tf.layers.dense(hidden3,
                                            self.action_dim,
                                            activation=tf.nn.sigmoid,
                                            trainable=trainable)
        action = non_scaled_action * (self.action_max -
                                      self.action_min) + self.action_min
        return action

    def get_action(self, obs):
        # Select the best action, then add exploration noise scaled by epsilon
        action = self.choose_action(obs)
        self.printConsole("original action: " + str(action))

        if self.epsilon > self.epsilon_end:
            self.epsilon -= self.epsilon_decay_step
        self.printConsole("noise scale: " + str(self.epsilon))

        self.noise = self.ou_noise(self.noise)
        self.printConsole(" noise: " + str(
            self.noise * (self.action_max - self.action_min) / 2 *
            max(self.epsilon, 0)))

        action = action + self.noise * (
            self.action_max - self.action_min) / 2 * max(self.epsilon, 0)
        action = np.maximum(action, self.action_min)
        action = np.minimum(action, self.action_max)
        return action

    def train_agent(self, obs, action, reward, obs_next, done, step):
        self.replay_buffer.add_to_memory((obs, action, reward, obs_next, done))

        if len(self.replay_buffer.replay_memory) < \
                self.minibatch_size * self.pre_train_step:
            return None

        minibatch = self.replay_buffer.sample_from_memory()
        s, a, r, ns, d = map(np.array, zip(*minibatch))

        self.train_network(s, a, r, ns, d, step)
        return None

    # make summary operators for tensorboard
    def setup_summary(self):
        episode_total_reward = tf.Variable(0.)
        episode_avg_max_q = tf.Variable(0.)
        episode_avg_loss = tf.Variable(0.)
        episode_total_score = tf.Variable(0.)
        tf.summary.scalar('Total Reward/Episode', episode_total_reward)
        tf.summary.scalar('Average Max Q/Episode', episode_avg_max_q)
        tf.summary.scalar('Average Loss/Episode', episode_avg_loss)
        tf.summary.scalar('Total Score/Episode', episode_total_score)

        summary_vars = [
            episode_total_reward, episode_avg_max_q, episode_avg_loss,
            episode_total_score
        ]
        summary_placeholders = [
            tf.placeholder(tf.float32) for _ in range(len(summary_vars))
        ]
        update_ops = [
            summary_vars[i].assign(summary_placeholders[i])
            for i in range(len(summary_vars))
        ]
        summary_op = tf.summary.merge_all()
        return summary_placeholders, update_ops, summary_op

    def ou_noise(self, x):
        return x + self.theta * (self.mu - x) + self.sigma * np.random.randn(
            self.action_dim)

    def printConsole(self, message):
        print(message)
        sys.__stdout__.flush()
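# A minimal driver sketch for DDPGAgent, assuming the usual imports made
# earlier in the file (gym, numpy as np, tensorflow as tf, sys), a gym
# environment such as Pendulum-v0, and the ReplayBuffer class referenced above
# with the add_to_memory / sample_from_memory interface the agent uses. The
# environment name, episode count, and reward bookkeeping are illustrative
# assumptions, not part of the original code.

if __name__ == '__main__':
    env = gym.make('Pendulum-v0')
    agent = DDPGAgent(state_dim=env.observation_space.shape[0],
                      action_dim=env.action_space.shape[0],
                      action_max=env.action_space.high[0])

    for episode in range(200):
        obs = env.reset()
        done = False
        total_reward = 0.0
        step = 0
        while not done:
            # Deterministic action plus decaying exploration noise
            action = agent.get_action(obs)
            obs_next, reward, done, _ = env.step(action)

            # Store the transition and, once enough samples exist, train
            agent.train_agent(obs, action, reward, obs_next, float(done), step)

            obs = obs_next
            total_reward += reward
            step += 1
        agent.printConsole("episode %d, total reward %.2f" %
                           (episode, total_reward))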