Example #1
from collections import deque
import random

import numpy as np
import tensorflow as tf  # TF 1.x-style session API (GPUOptions, ConfigProto, Session)
from keras import backend as K

# Actor and Critic are the project's own network wrapper classes (not shown here).


class DDPGAgent:
    def __init__(self,
                 state_size=28,
                 action_size=2,
                 gamma=0.9,
                 learning_rate_actor=0.0001,
                 learning_rate_critic=0.01,
                 tau=0.001,
                 action_max=[1000, 2],
                 batch_size=32):
        self.state_size = state_size
        self.action_size = action_size
        self.action_max = action_max
        self.batch_size = batch_size
        self.memory = deque(maxlen=5000)
        self.gamma = gamma  # discount rate
        self.learning_rate_actor = learning_rate_actor  # actor learning rate
        self.learning_rate_critic = learning_rate_critic  # critic learning rate
        self.tau = tau  # target transfer factor
        self.gpu_options = tf.GPUOptions()
        self.config = tf.ConfigProto(gpu_options=self.gpu_options)
        self.config.gpu_options.allow_growth = True
        self.sess = tf.Session(config=self.config)
        K.set_session(self.sess)
        self.actor = Actor(state_size=self.state_size,
                           action_size=self.action_size,
                           learning_rate=self.learning_rate_actor,
                           tau=self.tau,
                           sess=self.sess,
                           batch_size=self.batch_size,
                           action_max=self.action_max)
        self.critic = Critic(state_size=self.state_size,
                             action_size=self.action_size,
                             learning_rate=self.learning_rate_critic,
                             gamma=self.gamma,
                             tau=self.tau,
                             sess=self.sess,
                             batch_size=self.batch_size)
        self.grad_avg = 0
        self.grad_a = []
        self.critic_loss_a = []
        #self.critic_2 = Critic_2(self.state_size, self.action_size, self.learning_rate_critic, self.gamma, self.tau, self.sess)

    def policy_action(self, state):
        '''
        Predict an action for the given state with the online actor network.
        :param state: current environment state
        :return: predicted action
        '''
        return self.actor.predict(state)[0]

    def replay(self, batch_size):
        minibatch = random.sample(self.memory, batch_size)
        states = np.asarray([e[0] for e in minibatch]).reshape(
            batch_size, self.state_size)
        actions = np.asarray([e[1] for e in minibatch]).reshape(
            batch_size, self.action_size)
        rewards = np.asarray([e[2] for e in minibatch]).reshape(batch_size, 1)
        next_states = np.asarray([e[3] for e in minibatch]).reshape(
            batch_size, self.state_size)

        # Bellman target from the target networks: r + gamma * Q'(s', mu'(s'))
        target_actions = self.actor.target_predict(next_states)
        Qvals = self.critic.target_predict(next_states, target_actions)
        Q_primes = rewards + self.gamma * Qvals
        self.update_models(states, actions, Q_primes)

    def update_models(self, states, actions, critic_target):
        '''
        Update actor and critic networks from sampled experience
        :param states:
        :param actions:
        :param critic_target:
        :return:
        '''
        # Train the critic on the Bellman targets
        loss = self.critic.train_on_batch(states, actions, critic_target)
        self.critic_loss_a.append(loss)
        # loss = np.sum(-np.log10(loss), axis=0)

        # Q-value gradient w.r.t. the actions of the current policy
        act = self.actor.predict(states)
        grads = self.critic.gradients(states, act)

        # Track gradient statistics for diagnostics
        self.grad_avg += np.sum(np.log10(np.absolute(grads)),
                                axis=0) / self.batch_size
        self.grad_a = np.append(self.grad_a,
                                np.sum(np.absolute(grads), axis=0) /
                                self.batch_size,
                                axis=0)
        # print('grad_a:', self.grad_a)

        # Train the actor along the critic's action gradients
        self.actor.train_2(states, grads.reshape((-1, self.action_size)))

        # Soft-update the target networks at rate tau
        self.actor.transfer_to_actor_model()
        self.critic.transfer_to_critic_model()

    def remember(self, state, action, reward, next_state):
        self.memory.append((state, action, reward, next_state))

    def save_weights(self, directory, params):
        path_actor = directory + 'Weights' + params + '_LR{}'.format(
            self.learning_rate_actor)
        path_critic = directory + 'Weights' + params + '_LR{}'.format(
            self.learning_rate_critic)
        self.actor.save(path_actor)
        self.critic.save(path_critic)

    def load_weights(self, path_actor, path_critic):
        self.actor.load_weights(path_actor)
        self.critic.load_weights(path_critic)

    def load_model(self, path_actor, path_critic):
        self.actor.model.load_model(path_actor)
        self.critic.model.load_model(path_critic)
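
The tau parameter above is the soft target-update rate: after every training
step the agent calls transfer_to_actor_model() and transfer_to_critic_model(),
which in standard DDPG blend the online weights into the target weights as
theta_target <- tau * theta + (1 - tau) * theta_target. Since the Actor and
Critic classes are not shown here, the snippet below is only a minimal,
self-contained NumPy sketch of that update rule, assuming the project follows
the usual DDPG formulation; the weight arrays are toy values for illustration.

import numpy as np

def soft_update(online_weights, target_weights, tau=0.001):
    """Polyak-average each layer: tau * online + (1 - tau) * target."""
    return [tau * w + (1.0 - tau) * t
            for w, t in zip(online_weights, target_weights)]

# Toy weights standing in for the real networks' layer arrays.
online = [np.ones((2, 2)), np.zeros(2)]
target = [np.zeros((2, 2)), np.ones(2)]
target = soft_update(online, target, tau=0.001)
print(target[0][0, 0])  # 0.001: the target drifts slowly toward the online net
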
Example #2
File: PPO.py Project: war3gu/gykRL
import gym
import numpy as np
import tensorflow as tf

# Actor, Critic, ReplayMemory, and build_summaries are assumed to be the
# project's own helpers (defined elsewhere in PPO.py or its modules).


def train():

    env = gym.make('LunarLander-v2')

    state = env.reset()

    actor = Actor(env.action_space, env.observation_space)

    critic = Critic(env.action_space, env.observation_space)

    actor.load()
    critic.load()

    replayMemory = ReplayMemory()

    summary_ops, summary_vars = build_summaries()

    sess = tf.Session()  # single session reused for summary evaluation
    writer = tf.summary.FileWriter("./log", sess.graph)

    episode_reward = 0

    step = 1

    while True:

        #env.render()

        state1 = state[np.newaxis, :]

        action, action_matrix, prob = actor.predict(state1)

        next_state, reward, done, info = env.step(action)

        replayMemory.add(state, action_matrix, reward, done, next_state, prob)

        state = next_state

        episode_reward += reward

        #train
        if replayMemory.size() % 128 == 0 or done:

            state_b, action_matrix_b, reward_b, done_b, next_state_b, prob_b = \
                replayMemory.miniAll()

            reward_b = reward_b[:, np.newaxis]

            c_pre = critic.predict(next_state_b)  # V(s') from the critic

            # One-step target with a discount factor of 0.7
            state_pre_value = reward_b + c_pre * 0.7

            state_value = critic.predict(state_b)  # V(s), passed to the actor update

            count = 5000 // step

            if count > 500:
                count = 500

            if count < 1:
                count = 1

            count = 10  # fixed value; overrides the step-based schedule above

            for _ in range(count):
                critic.train(state_b, state_pre_value)

            for _ in range(count):
                actor.train(state_b, state_value, state_pre_value,
                            action_matrix_b, prob_b)

            replayMemory.clear()
        ########################

        if done:

            summary_str = sess.run(
                summary_ops, feed_dict={summary_vars[0]: episode_reward})
            writer.add_summary(summary_str, step)
            writer.flush()

            ##print("step = ", step, "episode_reward = ", episode_reward)

            state = env.reset()

            episode_reward = 0

            step += 1

            if step % 25 == 0:
                actor.save()
                critic.save()
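
In the update block above the critic is regressed toward a one-step target,
reward_b + 0.7 * c_pre, and the actor receives both state_value and that
target, presumably to form an advantage estimate (target minus V(s)); note
that done_b is returned by miniAll() but not used to mask terminal next-state
values. The snippet below is a minimal NumPy sketch of that target and
advantage computation, assuming the standard advantage definition
A = target - V(s); the reward and value numbers are toy values for
illustration only.

import numpy as np

gamma = 0.7  # matches the constant used in the training loop above

# Toy batch: rewards, critic estimates V(s') and V(s) (illustrative values).
reward_b = np.array([[1.0], [0.0], [-1.0]])
v_next = np.array([[0.5], [0.2], [0.0]])
v_curr = np.array([[0.4], [0.3], [-0.2]])

td_target = reward_b + gamma * v_next  # what critic.train regresses toward
advantage = td_target - v_curr         # signal an actor update would use

print(td_target.ravel())  # -> 1.35, 0.14, -1.0
print(advantage.ravel())  # -> 0.95, -0.16, -0.8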