def main(self):
    np.random.seed(0)

    replay_memory = deque(maxlen=REPLAY_MEM_CAPACITY)

    def add_to_memory(experience):
        replay_memory.append(experience)

    def sample_from_memory(minibatch_size):
        return random.sample(replay_memory, minibatch_size)

    tf.reset_default_graph()

    # placeholders
    state_placeholder = tf.placeholder(dtype=tf.float32,
                                       shape=[None, STATE_DIM])
    action_placeholder = tf.placeholder(dtype=tf.float32,
                                        shape=[None, ACTION_DIM])
    reward_placeholder = tf.placeholder(dtype=tf.float32, shape=[None])
    next_state_placeholder = tf.placeholder(dtype=tf.float32,
                                            shape=[None, STATE_DIM])
    # indicators (go into target computation)
    is_not_terminal_placeholder = tf.placeholder(dtype=tf.float32,
                                                 shape=[None])
    is_training_placeholder = tf.placeholder(dtype=tf.bool,
                                             shape=())  # for dropout

    # episode counter
    episodes = tf.Variable(0.0, trainable=False, name='episodes')
    episode_incr_op = episodes.assign_add(1)

    # actor network
    with tf.variable_scope('actor'):
        actor = Actor(STATE_DIM, ACTION_DIM, HIDDEN_1_ACTOR,
                      HIDDEN_2_ACTOR, HIDDEN_3_ACTOR, trainable=True)
        '''
        Policy's outputted action for each state_ph (for generating actions
        and training the critic)
        '''
        actions_unscaled = actor.call(state_placeholder)
        actions = MIN_BANDWIDTH + tf.nn.sigmoid(actions_unscaled) * \
            (MAX_BANDWIDTH - MIN_BANDWIDTH)

    # slow target actor network
    with tf.variable_scope('target_actor', reuse=False):
        target_actor = Actor(STATE_DIM, ACTION_DIM, HIDDEN_1_ACTOR,
                             HIDDEN_2_ACTOR, HIDDEN_3_ACTOR, trainable=True)
        '''
        Slow target policy's outputted action for each next_state_ph (for
        training the critic); use stop_gradient to treat the output values
        as constant targets when doing backprop
        '''
        target_next_actions_unscaled = target_actor.call(
            next_state_placeholder)
        target_next_actions_1 = MIN_BANDWIDTH + \
            tf.nn.sigmoid(target_next_actions_unscaled) * \
            (MAX_BANDWIDTH - MIN_BANDWIDTH)
        target_next_actions = tf.stop_gradient(target_next_actions_1)

    with tf.variable_scope('critic') as scope:
        critic = Critic(STATE_DIM, ACTION_DIM, HIDDEN_1_CRITIC,
                        HIDDEN_2_CRITIC, HIDDEN_3_CRITIC, trainable=True)
        # Critic applied to state_ph and a given action (for training the
        # critic)
        q_values_of_given_actions = critic.call(state_placeholder,
                                                action_placeholder)
        '''
        Critic applied to state_ph and the current policy's outputted
        actions for state_ph (for training the actor via the deterministic
        policy gradient)
        '''
        q_values_of_suggested_actions = critic.call(state_placeholder,
                                                    actions)

    # slow target critic network
    with tf.variable_scope('target_critic', reuse=False):
        target_critic = Critic(STATE_DIM, ACTION_DIM, HIDDEN_1_CRITIC,
                               HIDDEN_2_CRITIC, HIDDEN_3_CRITIC,
                               trainable=True)
        '''
        Slow target critic applied to the slow target actor's outputted
        actions for next_state_ph (for training the critic)
        '''
        q_values_next = tf.stop_gradient(
            target_critic.call(next_state_placeholder, target_next_actions))

    # isolate vars for each network
    actor_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                   scope='actor')
    target_actor_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                          scope='target_actor')
    critic_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                    scope='critic')
    target_critic_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                           scope='target_critic')

    # update slowly-changing targets towards current actor and critic
    update_target_ops = []
    for i, target_actor_var in enumerate(target_actor_vars):
        update_target_actor_op = target_actor_var.assign(
            TAU * actor_vars[i] + (1 - TAU) * target_actor_var)
        update_target_ops.append(update_target_actor_op)
    for i, target_var in enumerate(target_critic_vars):
        target_critic_op = target_var.assign(
            TAU * critic_vars[i] + (1 - TAU) * target_var)
        update_target_ops.append(target_critic_op)
    update_targets_op = tf.group(*update_target_ops,
                                 name='update_slow_targets')

    '''
    One-step TD targets y_i for (s, a) from experience replay:
      = r_i + gamma * Q_slow(s', mu_slow(s'))  if s' is not terminal
      = r_i                                    if s' is terminal
    '''
    targets = tf.expand_dims(reward_placeholder, 1) + \
        tf.expand_dims(is_not_terminal_placeholder, 1) * GAMMA * \
        q_values_next

    # 1-step temporal difference errors
    td_errors = targets - q_values_of_given_actions

    # critic loss function (mean-square value error with regularization)
    critic_loss = tf.reduce_mean(tf.square(td_errors))
    for var in critic_vars:
        if 'bias' not in var.name:
            critic_loss += L2_REG_CRITIC * 0.5 * tf.nn.l2_loss(var)

    # critic optimizer
    critic_train_op = tf.train.AdamOptimizer(
        LEARNING_RATE_CRITIC * LR_DECAY**episodes).minimize(critic_loss)

    # actor loss function (mean Q-values under current policy with
    # regularization)
    actor_loss = -1 * tf.reduce_mean(q_values_of_suggested_actions)
    for var in actor_vars:
        if 'bias' not in var.name:
            actor_loss += L2_REG_ACTOR * 0.5 * tf.nn.l2_loss(var)

    '''
    Actor optimizer: the gradient of the mean Q-values w.r.t. the actor
    params is the deterministic policy gradient (keeping critic params
    fixed)
    '''
    actor_train_op = tf.train.AdamOptimizer(
        LEARNING_RATE_ACTOR * LR_DECAY**episodes).minimize(
            actor_loss, var_list=actor_vars)

    # initialize session
    sess = tf.Session()
    sess.run(tf.global_variables_initializer())
    # print(sess.run(tf.report_uninitialized_variables()))

    # Training
    num_steps = 0
    for episode in range(NUM_EPISODES):
        total_reward = 0
        num_steps_in_episode = 0

        # Create noise
        noise = np.zeros(ACTION_DIM)
        noise_scale = (INITIAL_NOISE_SCALE * NOISE_DECAY**episode) * \
            (MAX_BANDWIDTH - MIN_BANDWIDTH)  # TODO: uses env

        # Initial state
        self.reset()  # TODO: uses env
        state = self.input_state

        for t in range(MAX_STEPS_PER_EPISODE):
            # choose action based on deterministic policy
            state = np.asarray(state).reshape(1, -1)
            action, = sess.run(actions,
                               feed_dict={state_placeholder: state,
                                          is_training_placeholder: False})

            # add temporally-correlated exploration noise to action
            # (using an Ornstein-Uhlenbeck process)
            noise = EXPLORATION_THETA * (EXPLORATION_MU - noise) + \
                EXPLORATION_SIGMA * np.random.randn(ACTION_DIM)
            action += noise_scale * noise

            # take step
            next_state, reward, done = self.step(action)
            total_reward += reward

            # store the state as a flat vector so that stacking a minibatch
            # later matches the [None, STATE_DIM] placeholder shape
            add_to_memory((state.flatten(), action, reward, next_state,
                           # is next_observation a terminal state?
                           # 0.0 if done and not env.env._past_limit()
                           # else 1.0))
                           0.0 if done else 1.0))

            # update network weights to fit a minibatch of experience
            if num_steps % TRAIN_EVERY == 0 and \
                    len(replay_memory) >= MINI_BATCH_SIZE:
                minibatch = sample_from_memory(MINI_BATCH_SIZE)
                '''
                update the critic and actor params using mean-square value
                error and the deterministic policy gradient, respectively
                '''
                _, _ = sess.run(
                    [critic_train_op, actor_train_op],
                    feed_dict={
                        state_placeholder:
                            np.asarray([elem[0] for elem in minibatch]),
                        action_placeholder:
                            np.asarray([elem[1] for elem in minibatch]),
                        reward_placeholder:
                            np.asarray([elem[2] for elem in minibatch]),
                        next_state_placeholder:
                            np.asarray([elem[3] for elem in minibatch]),
                        is_not_terminal_placeholder:
                            np.asarray([elem[4] for elem in minibatch]),
                        is_training_placeholder: True})

                '''
                update slow actor and critic targets towards current actor
                and critic
                '''
                _ = sess.run(update_targets_op)

            state = next_state
            num_steps += 1
            num_steps_in_episode += 1

            if done:
                # Increment episode counter
                _ = sess.run(episode_incr_op)
                break

        print('Episode %2i, Reward: %7.3f, Steps: %i, '
              'Final noise scale: %7.3f' %
              (episode, total_reward, num_steps_in_episode, noise_scale))
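# The Actor and Critic classes used above are not part of this listing; only
# their constructor signature (state/action dimensions, three hidden-layer
# sizes, trainable flag) and a call() method are visible. A minimal sketch of
# that assumed interface follows; the layer types, activations and variable
# names are illustrative assumptions, not the original implementation.
class Actor:
    def __init__(self, state_dim, action_dim, hidden_1, hidden_2, hidden_3,
                 trainable=True):
        self.action_dim = action_dim
        self.hidden_sizes = (hidden_1, hidden_2, hidden_3)
        self.trainable = trainable

    def call(self, states):
        # Feed-forward policy network; AUTO_REUSE lets call() be invoked
        # more than once while sharing the same weights.
        with tf.variable_scope('actor_net', reuse=tf.AUTO_REUSE):
            out = states
            for i, size in enumerate(self.hidden_sizes):
                out = tf.layers.dense(out, size, activation=tf.nn.relu,
                                      trainable=self.trainable,
                                      name='hidden_%d' % i)
            # unscaled actions; the caller applies the sigmoid scaling
            return tf.layers.dense(out, self.action_dim, activation=None,
                                   trainable=self.trainable, name='output')


class Critic:
    def __init__(self, state_dim, action_dim, hidden_1, hidden_2, hidden_3,
                 trainable=True):
        self.hidden_sizes = (hidden_1, hidden_2, hidden_3)
        self.trainable = trainable

    def call(self, states, actions):
        # Q(s, a): concatenate state and action and map them through
        # fully connected layers to a single Q-value per sample.
        with tf.variable_scope('critic_net', reuse=tf.AUTO_REUSE):
            out = tf.concat([states, actions], axis=1)
            for i, size in enumerate(self.hidden_sizes):
                out = tf.layers.dense(out, size, activation=tf.nn.relu,
                                      trainable=self.trainable,
                                      name='hidden_%d' % i)
            return tf.layers.dense(out, 1, activation=None,
                                   trainable=self.trainable, name='q_value')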
def main():
    '''
    Create the environment
    '''
    env = gym.make(ENV_NAME)

    # For tensorboard
    writer = tf.summary.FileWriter("./tensorboard")

    assert STATE_DIM == np.prod(np.array(env.observation_space.shape))
    assert ACTION_DIM == np.prod(np.array(env.action_space.shape))

    env.seed(0)
    np.random.seed(0)

    '''
    Create the replay memory
    '''
    replay_memory = Memory(REPLAY_MEM_CAPACITY)

    # Tensorflow part starts here!
    tf.reset_default_graph()

    '''
    Create placeholders
    '''
    state_placeholder = tf.placeholder(dtype=tf.float32,
                                       shape=[None, STATE_DIM],
                                       name='state_placeholder')
    action_placeholder = tf.placeholder(dtype=tf.float32,
                                        shape=[None, ACTION_DIM],
                                        name='action_placeholder')
    reward_placeholder = tf.placeholder(dtype=tf.float32, shape=[None],
                                        name='reward_placeholder')
    next_state_placeholder = tf.placeholder(dtype=tf.float32,
                                            shape=[None, STATE_DIM],
                                            name='next_state_placeholder')
    is_not_terminal_placeholder = tf.placeholder(
        dtype=tf.float32, shape=[None], name='is_not_terminal_placeholder')
    is_training_placeholder = tf.placeholder(dtype=tf.float32, shape=(),
                                             name='is_training_placeholder')

    '''
    A counter to count the number of episodes
    '''
    episodes = tf.Variable(0.0, trainable=False, name='episodes')
    episode_incr_op = episodes.assign_add(1)

    '''
    Create the actor network inside the actor scope and calculate actions
    '''
    with tf.variable_scope('actor'):
        actor = ActorNetwork(STATE_DIM, ACTION_DIM, HIDDEN_1_ACTOR,
                             HIDDEN_2_ACTOR, HIDDEN_3_ACTOR, trainable=True)
        unscaled_actions = actor.call(state_placeholder)
        '''
        Scale the actions to fit within the bounds provided by the
        environment
        '''
        actions = scale_actions(unscaled_actions, env.action_space.low,
                                env.action_space.high)

    '''
    Create the target actor network inside the target_actor scope and
    calculate the target actions. Apply stop_gradient to the target actions
    so that their gradient is not computed at any point of time.
    '''
    with tf.variable_scope('target_actor', reuse=False):
        target_actor = ActorNetwork(STATE_DIM, ACTION_DIM, HIDDEN_1_ACTOR,
                                    HIDDEN_2_ACTOR, HIDDEN_3_ACTOR,
                                    trainable=True)
        unscaled_target_actions = target_actor.call(next_state_placeholder)
        '''
        Scale the actions to fit within the bounds provided by the
        environment
        '''
        target_actions_temp = scale_actions(unscaled_target_actions,
                                            env.action_space.low,
                                            env.action_space.high)
        target_actions = tf.stop_gradient(target_actions_temp)

    '''
    Create the critic network inside the critic variable scope. Get the
    Q-values of given actions and Q-values of actions suggested by the
    actor network.
    '''
    with tf.variable_scope('critic'):
        critic = CriticNetwork(STATE_DIM, ACTION_DIM, HIDDEN_1_CRITIC,
                               HIDDEN_2_CRITIC, HIDDEN_3_CRITIC,
                               trainable=True)
        q_values_of_given_actions = critic.call(state_placeholder,
                                                action_placeholder)
        q_values_of_suggested_actions = critic.call(state_placeholder,
                                                    actions)

    '''
    Create the target critic network inside the target_critic variable
    scope. Calculate the target Q-values and apply stop_gradient to them.
    '''
    with tf.variable_scope('target_critic', reuse=False):
        target_critic = CriticNetwork(STATE_DIM, ACTION_DIM, HIDDEN_1_CRITIC,
                                      HIDDEN_2_CRITIC, HIDDEN_3_CRITIC,
                                      trainable=True)
        target_q_values_temp = target_critic.call(next_state_placeholder,
                                                  target_actions)
        target_q_values = tf.stop_gradient(target_q_values_temp)

    '''
    Collect
    - the trainable variables of the actor network (its weights),
    - the weights of the target actor network,
    - the trainable variables of the critic network (its weights),
    - the weights of the target critic network.
    '''
    actor_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                   scope='actor')
    target_actor_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                          scope='target_actor')
    critic_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                    scope='critic')
    target_critic_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                           scope='target_critic')

    '''
    Get the operators for updating the target networks. The
    update_target_networks function defined in utils returns a list of
    operators to be run from the tf session in order to update the target
    networks using a soft update.
    '''
    update_targets_op = update_target_networks(TAU, target_actor_vars,
                                               actor_vars,
                                               target_critic_vars,
                                               critic_vars)

    '''
    Create the tf operation to train the critic network:
    - calculate the TD-target
    - calculate the TD-error = TD-target - q_values_of_given_actions
    - calculate the critic network's loss (mean squared TD-error plus L2
      regularization of the critic weights)
    - create a tf operation to train the critic network
    '''
    targets = tf.expand_dims(reward_placeholder, 1) + \
        tf.expand_dims(is_not_terminal_placeholder, 1) * GAMMA * \
        target_q_values
    td_errors = targets - q_values_of_given_actions
    critic_loss = tf.reduce_mean(tf.square(td_errors))

    # L2-regularize the critic weights (biases excluded)
    for var in critic_vars:
        if 'bias' not in var.name:
            critic_loss += L2_REG_CRITIC * 0.5 * tf.nn.l2_loss(var)

    # Optimize critic
    critic_train_op = tf.train.AdamOptimizer(
        LEARNING_RATE_CRITIC * LR_DECAY**episodes).minimize(critic_loss)

    '''
    Create a tf operation to train the actor network:
    - calculate the actor network's loss
    - create the tf operation to train the actor network
    '''
    # Actor's loss
    actor_loss = -1 * tf.reduce_mean(q_values_of_suggested_actions)
    for var in actor_vars:
        if 'bias' not in var.name:
            actor_loss += L2_REG_ACTOR * 0.5 * tf.nn.l2_loss(var)

    # Optimize actor
    actor_train_op = tf.train.AdamOptimizer(
        LEARNING_RATE_ACTOR * LR_DECAY**episodes).minimize(
            actor_loss, var_list=actor_vars)

    # Init session
    sess = tf.Session()
    sess.run(tf.global_variables_initializer())
    writer.add_graph(sess.graph)

    # Training
    num_steps = 0
    for episode in range(NUM_EPISODES):
        total_reward = 0
        num_steps_in_episode = 0

        # Create noise
        noise = np.zeros(ACTION_DIM)
        noise_scale = (INITIAL_NOISE_SCALE * NOISE_DECAY**episode) * \
            (env.action_space.high - env.action_space.low)

        # Initial state
        state = env.reset()

        for _ in range(MAX_STEPS_PER_EPISODE):
            action = sess.run(actions, feed_dict={
                state_placeholder: state[None],
                is_training_placeholder: False})

            # Add noise to actions (Ornstein-Uhlenbeck exploration)
            noise = EXPLORATION_THETA * (EXPLORATION_MU - noise) + \
                EXPLORATION_SIGMA * np.random.randn(ACTION_DIM)
            action += noise_scale * noise

            # Take action on env
            next_state, reward, done, _info = env.step(action)
            next_state = np.squeeze(next_state)
            reward = np.squeeze(reward)
            action = action[0]
            total_reward += reward

            replay_memory.add_to_memory(
                (state, action, reward, next_state, 0.0 if done else 1.0))

            if num_steps % TRAIN_EVERY == 0 and \
                    replay_memory.size() >= MINI_BATCH_SIZE:
                batch = replay_memory.sample_from_memory(MINI_BATCH_SIZE)
                _, _ = sess.run(
                    [critic_train_op, actor_train_op],
                    feed_dict={
                        state_placeholder:
                            np.asarray([elem[0] for elem in batch]),
                        action_placeholder:
                            np.asarray([elem[1] for elem in batch]),
                        reward_placeholder:
                            np.asarray([elem[2] for elem in batch]),
                        next_state_placeholder:
                            np.asarray([elem[3] for elem in batch]),
                        is_not_terminal_placeholder:
                            np.asarray([elem[4] for elem in batch]),
                        is_training_placeholder: True})

                _ = sess.run(update_targets_op)

            state = next_state
            num_steps += 1
            num_steps_in_episode += 1

            if done:
                _ = sess.run(episode_incr_op)
                break

        print(str((episode, total_reward, num_steps_in_episode,
                   noise_scale)))

    env.close()
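# The Memory, scale_actions and update_target_networks helpers used by this
# gym version come from a utils module that is not shown here. A minimal
# sketch of what the code above assumes follows; it mirrors the deque replay
# buffer, the sigmoid action scaling and the TAU soft update written out
# explicitly in the first listing, so the real utils module may differ in
# detail.
from collections import deque
import random


class Memory:
    def __init__(self, capacity):
        self.buffer = deque(maxlen=capacity)

    def add_to_memory(self, experience):
        self.buffer.append(experience)

    def sample_from_memory(self, minibatch_size):
        return random.sample(self.buffer, minibatch_size)

    def size(self):
        return len(self.buffer)


def scale_actions(unscaled_actions, low, high):
    # Squash the network output into (0, 1) with a sigmoid and rescale it to
    # the environment's action bounds, as done with MIN_BANDWIDTH and
    # MAX_BANDWIDTH in the first listing.
    return low + tf.nn.sigmoid(unscaled_actions) * (high - low)


def update_target_networks(tau, target_actor_vars, actor_vars,
                           target_critic_vars, critic_vars):
    # Soft update: move each target variable a fraction tau towards the
    # corresponding online variable, returning the list of assign ops to be
    # run from the tf session.
    update_ops = []
    for i, target_var in enumerate(target_actor_vars):
        update_ops.append(target_var.assign(
            tau * actor_vars[i] + (1 - tau) * target_var))
    for i, target_var in enumerate(target_critic_vars):
        update_ops.append(target_var.assign(
            tau * critic_vars[i] + (1 - tau) * target_var))
    return update_ops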