Example #1
                states = np.asarray([e[0] for e in batch])
                actions = np.asarray([e[1] for e in batch])
                rewards = np.asarray([e[2] for e in batch])
                new_states = np.asarray([e[3] for e in batch])
                dones = np.asarray([e[4] for e in batch])
                y_t = np.asarray([e[1] for e in batch])  # placeholder; overwritten with the targets below

                # Q targets from the target critic, fed with the target actor's actions
                target_q = critic.target_model.predict([new_states, actor.target_model.predict(new_states)])

                for k in range(len(batch)):
                    if dones[k]:
                        y_t[k] = rewards[k]
                    else:
                        y_t[k] = rewards[k] + gamma * target_q[k]

                loss += critic.model.train_on_batch([states, actions], y_t)

                a_for_grad = actor.model.predict(states)
                grads = critic.gradients(states, a_for_grad)
                actor.update_network(states, grads)

                actor.update_target_network()
                critic.update_target_network()

            total_reward += r
            s_t = s2_t

            if done:
                print('episode: ' + str(i) + ' reward: ' + str(total_reward) + ' loss: ' + str(loss))
                if total_reward >= best_r:
                    best_r = total_reward
                    actor.model.save_weights('episode'+str(i)+'.hdf5', overwrite=True)
                    #actor.model.save_weights('actor_weight.hdf5', overwrite=True)
                    #critic.model.save_weights('critic_weight.hdf5', overwrite=True)
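The snippet above ends each gradient step with actor.update_target_network() and critic.update_target_network(), whose bodies are not shown. In DDPG these calls usually perform a soft (Polyak) update of the target weights, theta_target <- tau * theta + (1 - tau) * theta_target. Below is a minimal sketch of such an update, assuming Keras models as in the example; the helper name soft_update and the default tau=0.001 are illustrative, not taken from the example.

def soft_update(target_model, source_model, tau=0.001):
    """Polyak-average the online weights into the target network:
    theta_target <- tau * theta_online + (1 - tau) * theta_target."""
    new_weights = [tau * w + (1.0 - tau) * w_t
                   for w, w_t in zip(source_model.get_weights(),
                                     target_model.get_weights())]
    target_model.set_weights(new_weights)

# e.g. inside update_target_network():
#   soft_update(actor.target_model, actor.model)
#   soft_update(critic.target_model, critic.model)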
Example #2
import numpy as np
import tensorflow as tf

# ActorNetwork, CriticNetwork, ReplayBuffer, OUNoise and the hyperparameters
# (REPLAY_BUFFER_SIZE, BATCH_SIZE, GAMMA, REPLAY_START_SIZE) are expected to be
# provided by the project's other modules.


class DDPG:
    """DDPG agent: actor/critic networks with target copies, a replay buffer,
    and Ornstein-Uhlenbeck exploration noise."""
    def __init__(self, env):
        self.name = 'DDPG'  # name for uploading results
        self.environment = env
        # Randomly initialize actor network and critic network
        # with both their target networks
        self.state_dim = env.observation_space.shape[0]
        self.action_dim = env.action_space.shape[0]

        self.sess = tf.Session()

        self.actor_network = ActorNetwork(self.sess, self.state_dim,
                                          self.action_dim)
        self.critic_network = CriticNetwork(self.sess, self.state_dim,
                                            self.action_dim)

        # initialize replay buffer
        self.replay_buffer = ReplayBuffer(REPLAY_BUFFER_SIZE)

        # Initialize the Ornstein-Uhlenbeck random process used for action exploration
        self.exploration_noise = OUNoise(self.action_dim)
        self.sess.run(tf.global_variables_initializer())
        # Initialize the target networks from the online ones: target_param <- eval_param
        self.actor_network.update_target()
        self.critic_network.update_target()

    def train(self):
        #print "train step",self.time_step
        # Sample a random minibatch of N transitions from replay buffer
        minibatch = self.replay_buffer.sample(BATCH_SIZE)
        state_batch = np.asarray([data[0] for data in minibatch])
        action_batch = np.asarray([data[1] for data in minibatch])
        reward_batch = np.asarray([data[2] for data in minibatch])
        next_state_batch = np.asarray([data[3] for data in minibatch])
        done_batch = np.asarray([data[4] for data in minibatch])

        # make sure the action batch has shape (BATCH_SIZE, action_dim)
        action_batch = np.resize(action_batch, [BATCH_SIZE, self.action_dim])

        # Calculate y_batch

        next_action_batch = self.actor_network.target_actions(next_state_batch)
        q_value_batch = self.critic_network.target_q(next_state_batch,
                                                     next_action_batch)
        y_batch = []
        for i in range(len(minibatch)):
            if done_batch[i]:
                y_batch.append(reward_batch[i])
            else:
                y_batch.append(reward_batch[i] + GAMMA * q_value_batch[i])
        y_batch = np.resize(y_batch, [BATCH_SIZE, 1])
        # Update critic by minimizing the loss L
        self.critic_network.train(y_batch, state_batch, action_batch)

        # Update the actor policy using the sampled gradient:
        action_batch_for_gradients = self.actor_network.actions(state_batch)
        q_gradient_batch = self.critic_network.gradients(
            state_batch, action_batch_for_gradients)

        self.actor_network.train(q_gradient_batch, state_batch)

        # Update the target networks
        self.actor_network.update_target()
        self.critic_network.update_target()

    def noise_action(self, state):
        # Select action a_t according to the current policy and exploration noise
        action = self.actor_network.action(state)
        return action + self.exploration_noise.noise()

    def action(self, state):
        action = self.actor_network.action(state)
        return action

    def perceive(self, state, action, reward, next_state, done):
        # Store transition (s_t,a_t,r_t,s_{t+1}) in replay buffer
        self.replay_buffer.add(state, action, reward, next_state, done)

        # Start training once the replay buffer holds more than REPLAY_START_SIZE transitions
        if self.replay_buffer.size > REPLAY_START_SIZE:
            self.train()

        #if self.time_step % 10000 == 0:
        #self.actor_network.save_network(self.time_step)
        #self.critic_network.save_network(self.time_step)

        # Re-initialize the random process when an episode ends
        if done:
            self.exploration_noise.reset()
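For context, a DDPG agent like the class above is typically driven by a loop of the following shape. This is a minimal sketch, not part of the example, assuming the classic gym API (env.reset() returning a state and env.step() returning (next_state, reward, done, info)); the environment name, episode count and step limit are placeholders.

import gym

env = gym.make('Pendulum-v0')   # placeholder continuous-control task
agent = DDPG(env)

for episode in range(1000):
    state = env.reset()
    for step in range(200):
        action = agent.noise_action(state)                        # policy action + OU noise
        next_state, reward, done, _ = env.step(action)
        agent.perceive(state, action, reward, next_state, done)   # store and (maybe) train
        state = next_state
        if done:
            break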
Example #3
		# estimate TD-error for the current transition
		td_error = v_target - critic.model.predict(state_action)[0]

		# make sure the TD target is an array before fitting
		if not isinstance(v_target, np.ndarray):
			v_target = np.asarray([v_target])

		# minibatch buffer of stored transitions
		BATCH = np.vstack((BATCH, [st, act, state_action, v_target]))
		if BATCH.shape[0] > 5:
			# assumption: draw a small random minibatch of stored transitions
			batch = BATCH[np.random.randint(BATCH.shape[0], size=5)]

		# fit the critic toward the TD target for this transition
		critic.model.fit(state_action, v_target, epochs=1, verbose=0)

		# estimate action gradients
		grads = critic.gradients(st, act)		
		
		# train actor network based on gradients
		actor.train(st, grads)
	
		# next transition
		state = next_state

		# reset episode
		if done: 
			#print state, action, actor.model.predict(st)[0]
			#if reward == 1: 
			#	print "SUCCESS " + str(critic.model.predict(state_action)[0]) + ' ' + str(v_target)
			#else: 
			#	print "FAILURE " + str(critic.model.predict(state_action)[0]) + ' ' + str(v_target)
			break
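Each of the examples above relies on a critic gradients(states, actions) call to obtain dQ/da for the actor update, but none of them shows its implementation. The sketch below illustrates how such a method is commonly built in TF1-style code with tf.gradients; the class name and the placeholder/tensor names here are illustrative, not taken from the examples.

import tensorflow as tf

class CriticActionGradients(object):
    """Expose dQ/da, the action gradient DDPG's actor update needs (sketch)."""
    def __init__(self, sess, q_output, state_input, action_input):
        self.sess = sess
        self.state_input = state_input
        self.action_input = action_input
        # symbolic gradient of the critic's Q output w.r.t. its action input
        self.action_grads = tf.gradients(q_output, action_input)

    def gradients(self, states, actions):
        # evaluate dQ/da for a batch of states and actions
        return self.sess.run(self.action_grads,
                             feed_dict={self.state_input: states,
                                        self.action_input: actions})[0]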