Example no. 1
class DDPG(object):
    def __init__(self, nb_states, nb_actions, args):
        self.nb_states = nb_states
        self.nb_actions = nb_actions
        self.discrete = args.discrete

        net_config = {
            'hidden1' : args.hidden1,
            'hidden2' : args.hidden2
        }

        # Actor and Critic initialization
        self.actor = Actor(self.nb_states, self.nb_actions, **net_config)
        self.actor_target = Actor(self.nb_states, self.nb_actions, **net_config)
        self.actor_optim = Adam(self.actor.parameters(), lr=args.actor_lr)

        self.critic = Critic(self.nb_states, self.nb_actions, **net_config)
        self.critic_target = Critic(self.nb_states, self.nb_actions, **net_config)
        self.critic_optim = Adam(self.critic.parameters(), lr=args.critic_lr)

        hard_update(self.critic_target, self.critic)
        hard_update(self.actor_target, self.actor)

        # Replay Buffer and noise
        self.memory = ReplayBuffer(args.memory_size)
        self.noise = OrnsteinUhlenbeckProcess(mu=np.zeros(nb_actions), sigma=0.2 * np.ones(nb_actions))

        self.last_state = None
        self.last_action = None

        # Hyperparameters
        self.batch_size = args.batch_size
        self.tau = args.tau
        self.discount = args.discount

        # CUDA
        self.use_cuda = args.cuda
        if self.use_cuda:
            self.cuda()

    def cuda(self):
        self.actor.to(device)
        self.actor_target.to(device)
        self.critic.to(device)
        self.critic_target.to(device)

    def eval(self):
        self.actor.eval()
        self.actor_target.eval()
        self.critic.eval()
        self.critic_target.eval()

    def train(self):
        self.actor.train()
        self.actor_target.train()
        self.critic.train()
        self.critic_target.train()

    def reset(self, obs):
        self.last_state = obs
        self.noise.reset()

    def observe(self, reward, state, done):
        self.memory.append([self.last_state, self.last_action, reward, state, done])
        self.last_state = state

    def random_action(self):
        action = np.random.uniform(-1., 1., self.nb_actions)
        self.last_action = action
        return action.argmax() if self.discrete else action

    def select_action(self, state, apply_noise=False):
        self.eval()
        action = to_numpy(self.actor(to_tensor(np.array([state]), device=device))).squeeze(0)
        self.train()
        if apply_noise:
            action = action + self.noise.sample()
        action = np.clip(action, -1., 1.)
        self.last_action = action
        #print('action:', action, 'output:', action.argmax())
        return action.argmax() if self.discrete else action

    def update_policy(self):
        state_batch, action_batch, reward_batch, next_state_batch, terminal_batch = self.memory.sample_batch(self.batch_size)
        state = to_tensor(np.array(state_batch), device=device)
        action = to_tensor(np.array(action_batch), device=device)
        next_state = to_tensor(np.array(next_state_batch), device=device)

        # Compute the Bellman target: r + discount * (1 - done) * Q_target(next_state, actor_target(next_state))
        next_q_value = self.critic_target([next_state, self.actor_target(next_state)])
        target_q_value = to_tensor(reward_batch, device=device) \
                         + self.discount * to_tensor((1 - terminal_batch.astype(np.float32)), device=device) * next_q_value

        # Critic and Actor update
        self.critic.zero_grad()
        with torch.set_grad_enabled(True):
            q_values = self.critic([state, action])
            critic_loss = criterion(q_values, target_q_value.detach())
            critic_loss.backward()
            self.critic_optim.step()

        self.actor.zero_grad()
        with torch.set_grad_enabled(True):
            policy_loss = -self.critic([state.detach(), self.actor(state)]).mean()
            policy_loss.backward()
            self.actor_optim.step()

        # Target update
        soft_update(self.actor_target, self.actor, self.tau)
        soft_update(self.critic_target, self.critic, self.tau)

        return to_numpy(-policy_loss), to_numpy(critic_loss), to_numpy(q_values.mean())

    def save_model(self, output, num=1):
        if self.use_cuda:
            self.actor.to(torch.device("cpu"))
            self.critic.to(torch.device("cpu"))
        torch.save(self.actor.state_dict(), '{}/actor{}.pkl'.format(output, num))
        torch.save(self.critic.state_dict(), '{}/critic{}.pkl'.format(output, num))
        if self.use_cuda:
            self.actor.to(device)
            self.critic.to(device)

    def load_model(self, output, num=1):
        self.actor.load_state_dict(torch.load('{}/actor{}.pkl'.format(output, num)))
        self.actor_target.load_state_dict(torch.load('{}/actor{}.pkl'.format(output, num)))
        self.critic.load_state_dict(torch.load('{}/critic{}.pkl'.format(output, num)))
        self.critic_target.load_state_dict(torch.load('{}/critic{}.pkl'.format(output, num)))
        if self.use_cuda:
            self.cuda()
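
The helpers hard_update and soft_update called in __init__ and update_policy are imported from elsewhere in the repository, as are Actor, Critic, ReplayBuffer, OrnsteinUhlenbeckProcess, to_tensor/to_numpy, device and criterion (presumably an MSE loss such as nn.MSELoss()). A minimal sketch of what the two target-network helpers typically do in DDPG implementations, assuming the (target, source[, tau]) argument order used above:

def hard_update(target, source):
    # Copy the source parameters into the target network verbatim.
    for t, s in zip(target.parameters(), source.parameters()):
        t.data.copy_(s.data)

def soft_update(target, source, tau):
    # Polyak averaging: target <- (1 - tau) * target + tau * source.
    for t, s in zip(target.parameters(), source.parameters()):
        t.data.copy_(t.data * (1.0 - tau) + s.data * tau)

A hypothetical interaction loop with a gym-style environment (env, max_steps and warmup are placeholders, not names from the example) would drive the agent like this:

agent = DDPG(nb_states, nb_actions, args)
obs = env.reset()
agent.reset(obs)
for step in range(max_steps):
    action = agent.select_action(obs, apply_noise=True)
    obs, reward, done, _ = env.step(action)
    agent.observe(reward, obs, done)
    if step > warmup:
        agent.update_policy()
    if done:
        obs = env.reset()
        agent.reset(obs)

Example no. 2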
def train(sess, env, actor, critic, RESTORE):

    sess.run(tf.global_variables_initializer())

    # Initialize random noise generator
    exploration_noise = OUNoise(env.action_space.n)

    # Initialize target network weights
    actor.update_target_network()
    critic.update_target_network()

    # Initialize replay buffer
    replay_buffer = ReplayBuffer(BUFFER_SIZE, RANDOM_SEED)

    # Store q values for illustration purposes
    q_max_array = []
    reward_array = []

    for i in range(MAX_EPISODES):

        s = env.reset()

        ep_reward = 0
        ep_ave_max_q = 0

        for j in range(MAX_EP_STEPS):

            # if i % 40 == 0 and i > 1:
            #     env.render()

            # Begin "Experimentation and Evaluation Phase"

            # Select next experimental action by adding noise to action prescribed by policy
            a = actor.predict(np.reshape(s, (1, actor.s_dim)))

            # If in a testing episode, do not add noise
            # if i%100 is not 49 and i%100 is not 99:
            noise = exploration_noise.noise()
            a = a + noise

            # Take step with experimental action
            action = np.argmax(a)
            s2, r, terminal, info = env.step(action)
            # s2, r, terminal, info = env.step(np.reshape(a.T,newshape=(env.action_space.n,)))

            # Add transition to replay buffer if not testing episode
            # if i%100 is not 49 and i%100 is not 99:
            replay_buffer.add(np.reshape(s, (actor.s_dim, )),
                              np.reshape(a, (actor.a_dim, )), r, terminal,
                              np.reshape(s2, (actor.s_dim, )))

            # Keep adding experience to the memory until
            # there are at least minibatch size samples
            if replay_buffer.size() > MINIBATCH_SIZE:
                s_batch, a_batch, r_batch, t_batch, s2_batch = replay_buffer.sample_batch(
                    MINIBATCH_SIZE)

                # Find target estimate to use for updating the Q-function

                # predict_target determines the Q-value of the next state
                target_q = critic.predict_target(
                    s2_batch, actor.predict_target(s2_batch))

                # Complete target estimate: r(t) + GAMMA * Q'(s(t+1), a'(t+1))
                y_i = []
                for k in range(MINIBATCH_SIZE):
                    if t_batch[k]:
                        y_i.append(r_batch[k])
                    else:
                        y_i.append(r_batch[k] + GAMMA * target_q[k])

                # Perform gradient descent to update critic
                predicted_q_value, _ = critic.train(
                    s_batch, a_batch, np.reshape(y_i, (MINIBATCH_SIZE, 1)))

                ep_ave_max_q += np.amax(predicted_q_value, axis=0)

                # Perform "Learning" phase by moving policy parameters in direction of deterministic policy gradient
                a_outs = actor.predict(s_batch)
                grads = critic.action_gradients(s_batch, a_outs)
                actor.train(s_batch, grads[0])

                # Update target networks
                actor.update_target_network()
                critic.update_target_network()

            s = s2
            ep_reward += r

            # If episode is finished, print results
            if terminal:
                print('| Reward: %.2i' % int(ep_reward), " | Episode", i,
                      '| Qmax: %.4f' % (ep_ave_max_q / float(j)))
                q_max_array.append(ep_ave_max_q / float(j))
                #reward_array.append(ep_reward)
                break

        ep_reward = 0
        s = env.reset()

        for j in range(MAX_EP_STEPS):
            a = actor.predict(np.reshape(s, (1, actor.s_dim)))
            # Take step with experimental action
            action = np.argmax(a)
            s2, r, terminal, info = env.step(action)

            ep_reward += r
            s = s2

            if terminal:
                print('Normal | Reward: %.2i' % int(ep_reward), " | Episode",
                      i)
                reward_array.append(ep_reward)
                break

    # Max Q plot
    plt.plot(range(1, MAX_EPISODES + 1), q_max_array, 'b-')
    plt.xlabel('Episode Number')
    plt.ylabel('Max Q-Value')
    plt.savefig('Q.png')
    plt.show()

    # Reward plot
    plt.plot(range(1, MAX_EPISODES + 1), reward_array, 'g-')
    plt.xlabel('Episode Number')
    plt.ylabel('Reward')
    plt.savefig('Reward.png')
    plt.show()
    save_result([[str(i[0]) for i in q_max_array],
                 [str(i) for i in reward_array]])
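
The OUNoise class used for exploration above is not included in the listing. Below is a minimal sketch of a typical Ornstein-Uhlenbeck process exposing the noise()/reset() interface used here; the mu, theta and sigma defaults are common choices, not values taken from this example:

import numpy as np

class OUNoise(object):
    # Temporally correlated exploration noise (Ornstein-Uhlenbeck process).
    def __init__(self, action_dim, mu=0.0, theta=0.15, sigma=0.2):
        self.mu = mu * np.ones(action_dim)
        self.theta = theta
        self.sigma = sigma
        self.reset()

    def reset(self):
        # Restart the process at its mean.
        self.state = np.copy(self.mu)

    def noise(self):
        # One Euler step of dx = theta * (mu - x) dt + sigma dW, with dt = 1.
        dx = self.theta * (self.mu - self.state) + self.sigma * np.random.randn(len(self.state))
        self.state = self.state + dx
        return self.state
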
Example no. 3
def train(sess, env, actor, critic, RESTORE):

    sess.run(tf.global_variables_initializer())

    # Initialize random noise generator
    exploration_noise = OUNoise(env.action_space.shape[0])

    # Initialize target network weights
    actor.update_target_network()
    critic.update_target_network()

    # Initialize replay buffer
    replay_buffer = ReplayBuffer(BUFFER_SIZE, RANDOM_SEED)

    totSteps = 0

    # Store q values for illustration purposes
    q_max_array = []

    actor.learning_rate = MAX_ACTOR_LEARNING_RATE
    critic.learning_rate = MAX_CRITIC_LEARNING_RATE

    for i in range(MAX_EPISODES):

        s = env.reset()
        s = normalize(s)

        ep_reward = 0
        ep_ave_max_q = 0

        # update learning rates using cosine annealing
        T_cur = i % LR_CYCLE
        actor.learning_rate = MIN_ACTOR_LEARNING_RATE +\
                              0.5 * (MAX_ACTOR_LEARNING_RATE - MIN_ACTOR_LEARNING_RATE) * \
                              (1 + np.cos(np.pi * T_cur / LR_CYCLE))

        critic.learning_rate = MIN_CRITIC_LEARNING_RATE +\
                              0.5 * (MAX_CRITIC_LEARNING_RATE - MIN_CRITIC_LEARNING_RATE) * \
                              (1 + np.cos(np.pi * T_cur / LR_CYCLE))

        for j in range(MAX_EP_STEPS):

            totSteps += 1

            # Begin "Experimentation and Evaluation Phase"

            # Select next experimental action by adding noise to action prescribed by policy
            a = actor.predict(np.reshape(s, (1, actor.s_dim, 1)))

            # If in a testing episode, do not add noise
            if i < EXPLORATION_SIZE and not (i % 100 == 49 or i % 100 == 99):
                noise = exploration_noise.noise()
                a = a + noise

            # Constrain action
            a = np.clip(a, -15, 15)

            # Take step with experimental action
            s2, r, terminal, info = env.step(
                np.reshape(a.T, newshape=(env.action_space.shape[0], )),
                CONST_THROTTLE)

            #print("car pos: " + str(env.car_dist_s))
            #print("action: " + str(a))
            #print("reward: " + str(r))

            s2 = normalize(s2)

            # Add transition to replay buffer if not testing episode
            if i % 100 != 49 and i % 100 != 99:
                replay_buffer.add(np.reshape(s, (actor.s_dim, 1)),
                                  np.reshape(a, (actor.a_dim, )), r, terminal,
                                  np.reshape(s2, (actor.s_dim, 1)))

                # Keep adding experience to the memory until
                # there are at least minibatch size samples
                if replay_buffer.size() > MEMORY_WARMUP:
                    s_batch, a_batch, r_batch, t_batch, s2_batch = replay_buffer.sample_batch(
                        MINIBATCH_SIZE)

                    # Find target estimate to use for updating the Q-function

                    # predict_target determines the Q-value of the next state
                    target_q = critic.predict_target(
                        s2_batch, actor.predict_target(s2_batch))

                    # Complete target estimate: r(t) + GAMMA * Q'(s(t+1), a'(t+1))
                    y_i = []
                    for k in range(MINIBATCH_SIZE):
                        if t_batch[k]:
                            y_i.append(r_batch[k])
                        else:
                            y_i.append(r_batch[k] + GAMMA * target_q[k])

                    # Perform gradient descent to update critic
                    predicted_q_value, _ = critic.train(
                        s_batch, a_batch, np.reshape(y_i, (MINIBATCH_SIZE, 1)))

                    ep_ave_max_q += np.amax(predicted_q_value, axis=0)

                    # Perform "Learning" phase by moving policy parameters in direction of deterministic policy gradient
                    a_outs = actor.predict(s_batch)
                    grads = critic.action_gradients(s_batch, a_outs)
                    actor.train(s_batch, grads[0])

                    # Update target networks
                    actor.update_target_network()
                    critic.update_target_network()

            s = s2
            ep_reward += r

            # If episode is finished, print results
            if terminal:

                if i % 100 == 49 or i % 100 == 99:
                    print("Testing")

                    kmodel = Sequential()
                    actVars = []
                    for var in tf.trainable_variables():
                        if 'non-target' in str(var):
                            actVars.append(var)

                    kmodel.add(
                        Dense(units=l1size,
                              activation='tanh',
                              weights=[
                                  sess.run(actVars[0]),
                                  sess.run(actVars[1])
                              ],
                              input_dim=actor.s_dim))
                    kmodel.add(
                        Dense(units=l2size,
                              activation='tanh',
                              weights=[
                                  sess.run(actVars[2]),
                                  sess.run(actVars[3])
                              ]))
                    kmodel.add(
                        Dense(units=1,
                              activation='tanh',
                              weights=[
                                  sess.run(actVars[4]),
                                  sess.run(actVars[5])
                              ]))
                    optimizer = optimizers.RMSprop(lr=0.00025,
                                                   rho=0.9,
                                                   epsilon=1e-06)
                    kmodel.compile(loss="mse", optimizer=optimizer)
                    kmodel.save(modelfile)

                else:
                    print("Training")

                print('| Reward: %.2i' % int(ep_reward), " | Episode", i,
                      '| Qmax: %.4f' % (ep_ave_max_q / float(j)))
                q_max_array.append(ep_ave_max_q / float(j))

                print('Finished in ' + str(j) + ' steps')

                break

    plt.plot(q_max_array)
    plt.xlabel('Episode Number')
    plt.ylabel('Max Q-Value')
    plt.show()

    kmodel = Sequential()
    actVars = []
    for var in tf.trainable_variables():
        if 'non-target' in str(var):
            actVars.append(var)

    kmodel.add(
        Dense(units=l1size,
              activation='tanh',
              weights=[sess.run(actVars[0]),
                       sess.run(actVars[1])],
              input_dim=actor.s_dim))
    kmodel.add(
        Dense(units=l2size,
              activation='tanh',
              weights=[sess.run(actVars[2]),
                       sess.run(actVars[3])]))
    kmodel.add(
        Dense(units=1,
              activation='tanh',
              weights=[sess.run(actVars[4]),
                       sess.run(actVars[5])]))
    optimizer = optimizers.RMSprop(lr=0.00025, rho=0.9, epsilon=1e-06)
    kmodel.compile(loss="mse", optimizer=optimizer)
    kmodel.summary()
    kmodel.save(modelfile)
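
The learning-rate schedule computed at the top of each episode in Example no. 3 is SGDR-style cosine annealing with warm restarts, lr = lr_min + 0.5 * (lr_max - lr_min) * (1 + cos(pi * T_cur / LR_CYCLE)) with T_cur = i % LR_CYCLE. A sketch of the same schedule factored into a standalone helper (the constant names mirror the ones used above):

import numpy as np

def cosine_annealed_lr(episode, lr_min, lr_max, cycle):
    # Decay from lr_max to lr_min over `cycle` episodes, then restart at lr_max.
    t_cur = episode % cycle
    return lr_min + 0.5 * (lr_max - lr_min) * (1 + np.cos(np.pi * t_cur / cycle))

# Usage inside the episode loop, e.g.:
# actor.learning_rate = cosine_annealed_lr(i, MIN_ACTOR_LEARNING_RATE, MAX_ACTOR_LEARNING_RATE, LR_CYCLE)
# critic.learning_rate = cosine_annealed_lr(i, MIN_CRITIC_LEARNING_RATE, MAX_CRITIC_LEARNING_RATE, LR_CYCLE)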