# (states, actions and the reward array are assumed to be drawn from the same
#  sampled minibatch just above this fragment)
new_states = np.asarray([e[3] for e in batch])
dones = np.asarray([e[4] for e in batch])
y_t = np.asarray([e[1] for e in batch])          # placeholder with the right shape

# Target Q-value: Q'(s_{t+1}, mu'(s_{t+1})) from the target networks
target_q = critic.target_model.predict(
    [new_states, actor.target_model.predict(new_states)])

for k in range(len(batch)):
    if dones[k]:
        y_t[k] = reward[k]
    else:
        y_t[k] = reward[k] + gamma * target_q[k]

# Update the critic by minimizing the TD loss
loss += critic.model.train_on_batch([states, actions], y_t)

# Update the actor along the sampled policy gradient dQ/da
a_for_grad = actor.model.predict(states)
grads = critic.gradients(states, a_for_grad)
actor.update_network(states, grads)

# Soft-update both target networks
actor.update_target_network()
critic.update_target_network()

total_reward += r
s_t = s2_t

if done:
    print('episode: ' + str(i) + ' reward: ' + str(total_reward) + ' loss: ' + str(loss))
    if total_reward >= best_r:
        best_r = total_reward
        actor.model.save_weights('episode' + str(i) + '.hdf5', overwrite=True)
        #actor.model.save_weights('actor_weight.hdf5', overwrite=True)
        #critic.model.save_weights('critic_weight.hdf5', overwrite=True)
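The loop above calls actor.update_target_network() and critic.update_target_network() without showing them. The following is a minimal sketch of the usual DDPG soft update for Keras models, not taken from the source; TAU is a hypothetical mixing rate.

# Sketch of a soft target update: target <- tau * online + (1 - tau) * target.
# update_target_network() is assumed to do something equivalent for actor and critic.
TAU = 0.001

def soft_update(model, target_model, tau=TAU):
    weights = model.get_weights()
    target_weights = target_model.get_weights()
    mixed = [tau * w + (1.0 - tau) * tw for w, tw in zip(weights, target_weights)]
    target_model.set_weights(mixed)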
import numpy as np
import tensorflow as tf

# ActorNetwork, CriticNetwork, ReplayBuffer, OUNoise and the hyperparameters
# REPLAY_BUFFER_SIZE, REPLAY_START_SIZE, BATCH_SIZE and GAMMA are defined
# elsewhere in the project.


class DDPG:
    """docstring for DDPG"""

    def __init__(self, env):
        self.name = 'DDPG'  # name for uploading results
        self.environment = env
        # Randomly initialize actor and critic networks together with their target networks
        self.state_dim = env.observation_space.shape[0]
        self.action_dim = env.action_space.shape[0]

        self.sess = tf.Session()

        self.actor_network = ActorNetwork(self.sess, self.state_dim, self.action_dim)
        self.critic_network = CriticNetwork(self.sess, self.state_dim, self.action_dim)

        # Initialize the replay buffer
        self.replay_buffer = ReplayBuffer(REPLAY_BUFFER_SIZE)

        # Initialize an Ornstein-Uhlenbeck random process for action exploration
        self.exploration_noise = OUNoise(self.action_dim)

        self.sess.run(tf.global_variables_initializer())

        # target_param <- eval_param
        self.actor_network.update_target()
        self.critic_network.update_target()

    def train(self):
        # print("train step", self.time_step)
        # Sample a random minibatch of N transitions from the replay buffer
        minibatch = self.replay_buffer.sample(BATCH_SIZE)
        state_batch = np.asarray([data[0] for data in minibatch])
        action_batch = np.asarray([data[1] for data in minibatch])
        reward_batch = np.asarray([data[2] for data in minibatch])
        next_state_batch = np.asarray([data[3] for data in minibatch])
        done_batch = np.asarray([data[4] for data in minibatch])

        # for action_dim = 1
        action_batch = np.resize(action_batch, [BATCH_SIZE, self.action_dim])

        # Calculate y_batch from the target networks
        next_action_batch = self.actor_network.target_actions(next_state_batch)
        q_value_batch = self.critic_network.target_q(next_state_batch, next_action_batch)
        y_batch = []
        for i in range(len(minibatch)):
            if done_batch[i]:
                y_batch.append(reward_batch[i])
            else:
                y_batch.append(reward_batch[i] + GAMMA * q_value_batch[i])
        y_batch = np.resize(y_batch, [BATCH_SIZE, 1])

        # Update the critic by minimizing the loss L
        self.critic_network.train(y_batch, state_batch, action_batch)

        # Update the actor policy using the sampled gradient
        action_batch_for_gradients = self.actor_network.actions(state_batch)
        q_gradient_batch = self.critic_network.gradients(
            state_batch, action_batch_for_gradients)
        self.actor_network.train(q_gradient_batch, state_batch)

        # Update the target networks
        self.actor_network.update_target()
        self.critic_network.update_target()

    def noise_action(self, state):
        # Select action a_t according to the current policy plus exploration noise
        action = self.actor_network.action(state)
        return action + self.exploration_noise.noise()

    def action(self, state):
        action = self.actor_network.action(state)
        return action

    def perceive(self, state, action, reward, next_state, done):
        # Store transition (s_t, a_t, r_t, s_{t+1}) in the replay buffer
        self.replay_buffer.add(state, action, reward, next_state, done)

        # Start training once the replay buffer holds enough transitions
        if self.replay_buffer.size > REPLAY_START_SIZE:
            self.train()

        # if self.time_step % 10000 == 0:
        #     self.actor_network.save_network(self.time_step)
        #     self.critic_network.save_network(self.time_step)

        # Re-initialize the random process when an episode ends
        if done:
            self.exploration_noise.reset()
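For context, here is a minimal sketch of how this DDPG class could be driven from a Gym environment. The environment name, episode count and step limit are placeholders, and the older Gym API (state-only reset, 4-tuple step) is assumed to match the TF1-era code above.

import gym

env = gym.make('Pendulum-v0')   # placeholder continuous-control task
agent = DDPG(env)

for episode in range(1000):
    state = env.reset()
    for step in range(200):
        action = agent.noise_action(state)                   # exploratory action
        next_state, reward, done, _ = env.step(action)
        agent.perceive(state, action, reward, next_state, done)
        state = next_state
        if done:
            break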
# Estimate the TD error
td_error = v_target - critic.model.predict(state_action)[0]

# Make sure the target has the expected array type
if type(v_target) is not np.ndarray:
    v_target = np.asarray([v_target])

# Append the transition to the minibatch buffer
BATCH = np.vstack((BATCH, [st, act, state_action, v_target]))
if BATCH.shape[0] > 5:
    # The original assignment was left incomplete here; sampling a small random
    # minibatch from BATCH is one plausible intent.
    batch = BATCH[np.random.choice(BATCH.shape[0], 5, replace=False)]

# Train the critic model towards the TD target
critic.model.fit(state_action, v_target, epochs=1, verbose=0)

# Estimate the action gradients dQ/da
grads = critic.gradients(st, act)

# Train the actor network along those gradients
actor.train(st, grads)

# Move to the next transition
state = next_state

# Reset the episode when it ends
if done:
    # print(state, action, actor.model.predict(st)[0])
    # if reward == 1:
    #     print("SUCCESS " + str(critic.model.predict(state_action)[0]) + ' ' + str(v_target))
    # else:
    #     print("FAILURE " + str(critic.model.predict(state_action)[0]) + ' ' + str(v_target))
    break
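Both the Keras loop and this snippet call critic.gradients(states, actions) without showing it. One plausible TensorFlow 1.x-style implementation is sketched below; the attribute names (q_output, state_input, action_input, sess) are assumptions for illustration, not taken from the source.

import tensorflow as tf

class CriticGradients:
    """Sketch: exposes dQ/da for the actor update, assuming a TF1-style graph."""

    def build_action_gradients(self):
        # Gradient of the critic's Q-value output with respect to its action input
        self.action_grads = tf.gradients(self.q_output, self.action_input)

    def gradients(self, states, actions):
        return self.sess.run(self.action_grads, feed_dict={
            self.state_input: states,
            self.action_input: actions,
        })[0]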