Example #1
# Assumed imports for this snippet (TensorFlow 1.x API). Project-specific helpers
# (build_summaries, get_linear_dynamics, BasePrior, ReplayBuffer) come from the
# surrounding repository.
import numpy as np
import tensorflow as tf

def train(sess, env, args, actor, critic, actor_noise, reward_result,
          lambda_mix):

    # Set up summary Ops
    summary_ops, summary_vars = build_summaries()

    sess.run(tf.global_variables_initializer())

    # Get dynamics and initialize prior controller
    [A, B] = get_linear_dynamics()
    prior = BasePrior(A, B)
    # Initialize target network weights
    actor.update_target_network()
    critic.update_target_network()

    # Initialize replay memory
    replay_buffer = ReplayBuffer(int(args['buffer_size']),
                                 int(args['random_seed']))

    paths = list()

    for i in range(int(args['max_episodes'])):

        s = env.reset()

        ep_reward = 0.
        ep_ave_max_q = 0

        obs, action, rewards = [], [], []

        # Get the reference ("optimal") episode reward by rolling out the control prior (getControl_h)
        s0 = np.copy(s)
        ep_reward_opt = 0.
        for kk in range(int(args['max_episode_len'])):
            a_prior = prior.getControl_h(s0)
            a_prior = np.squeeze(np.asarray(a_prior))
            a = a_prior
            s0, r, stop_c, _ = env.step(a)
            ep_reward_opt += r
            if stop_c:
                break

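        # Baseline: roll out the LQR prior (getControl) from the same initial state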
        env.reset()
        sp = env.unwrapped.reset(s)

        reward_lqr = 0.
        while True:
            a_lqr = prior.getControl(sp)
            a_lqr = np.squeeze(np.asarray(a_lqr))
            sp, reward_p, done_p, _ = env.step(a_lqr)
            reward_lqr += reward_p
            if done_p:
                break

        # Get reward using regRL algorithm
        env.reset()
        s = env.unwrapped.reset(s)

        for j in range(int(args['max_episode_len'])):

            # Set control prior regularization weight
            # lambda_mix = 5.

            # Prior control
            a_prior = prior.getControl_h(s)
            a_prior = np.squeeze(np.asarray(a_prior))

            # RL control with exploration noise
            a = actor.predict(np.reshape(s, (1, actor.s_dim))) + actor_noise()
            #a = actor.predict(np.reshape(s, (1, actor.s_dim))) + (1. / (1. + i))

            # Mix the actions (RL controller + control prior)
            act = a[0] / (1 + lambda_mix) + (lambda_mix /
                                             (1 + lambda_mix)) * a_prior

            # Take action and observe next state/reward
            s2, r, terminal, info = env.step(act)

            # Add info from time step to the replay buffer
            replay_buffer.add(
                np.reshape(s, (actor.s_dim, )), np.reshape(a, (actor.a_dim, )),
                r, terminal, np.reshape(s2, (actor.s_dim, )),
                np.reshape((lambda_mix / (1 + lambda_mix)) * a_prior,
                           (actor.a_dim, )))

            # Keep adding experience to the memory until
            # there are at least minibatch size samples
            if replay_buffer.size() > int(args['minibatch_size']):

                # Sample a batch from the replay buffer
                s_batch, a_batch_0, r_batch, t_batch, s2_batch, a_prior_batch = \
                    replay_buffer.sample_batch(int(args['minibatch_size']))

                # Sampled RL actions; the weighted prior action is returned
                # separately as a_prior_batch
                a_batch = a_batch_0

                # Calculate targets
                target_q = critic.predict_target(
                    s2_batch, actor.predict_target(s2_batch))
                y_i = []
                for k in range(int(args['minibatch_size'])):
                    if t_batch[k]:
                        y_i.append(r_batch[k])
                    else:
                        y_i.append(r_batch[k] + critic.gamma * target_q[k])

                # Update the critic given the targets
                predicted_q_value, _ = critic.train(
                    s_batch, a_batch,
                    np.reshape(y_i, (int(args['minibatch_size']), 1)))
                ep_ave_max_q += np.amax(predicted_q_value)

                # Update the actor policy using the sampled gradient
                a_outs = actor.predict(s_batch)
                grads = critic.action_gradients(s_batch, a_outs)
                actor.train(s_batch, grads[0])

                # Update target networks
                actor.update_target_network()
                critic.update_target_network()

                # Q-values for a per-state TD-error estimate (base_q and the
                # recomputed target_q are not used further in this snippet)
                base_q = critic.predict_target(s_batch,
                                               actor.predict_target(s_batch))
                target_q = critic.predict_target(
                    s2_batch, actor.predict_target(s2_batch))

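            # Advance to the next state and log the transition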
            s = s2
            ep_reward += r

            obs.append(s)
            rewards.append(r)
            action.append(a[0])

            # Collect results at end of episode
            if terminal:
                for ii in range(len(obs)):
                    obs[ii] = obs[ii].reshape((4, 1))
                print('| Reward: {:d} | Episode: {:d} | Qmax: {:.4f}'.format(
                    int(ep_reward - ep_reward_opt), i,
                    ep_ave_max_q / float(max(j, 1))))
                reward_result[0, i] = ep_reward
                reward_result[1, i] = ep_reward_opt
                reward_result[2, i] = reward_lqr
                path = {
                    "Observation": np.concatenate(obs).reshape((-1, 4)),
                    "Action": np.concatenate(action),
                    "Reward": np.asarray(rewards)
                }
                paths.append(path)
                break

    return [summary_ops, summary_vars, paths]
Example #2
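        # Baseline: roll out the prior controller (getControl_h) from a copy of s0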
        sp = np.copy(s0)
        reward_prior = 0.
        while True:
            a_prior = prior.getControl_h(sp)
            a_prior = np.squeeze(np.asarray(a_prior))
            sp, reward_p, done_p, _ = env.step(a_prior)
            reward_prior += reward_p
            if done_p:
                break

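        # Baseline: roll out the LQR prior (getControl) from the same initial state s0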
        env.reset()
        sp = env.unwrapped.reset(s0)
            
        reward_lqr = 0.
        while True:
            a_lqr = prior.getControl(sp)
            a_lqr = np.squeeze(np.asarray(a_lqr))
            sp, reward_p, done_p, _ = env.step(a_lqr)
            reward_lqr += reward_p
            if done_p:
                break
            
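        # RL rollout: reset to the same initial state s0 and run the learned PPO policy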
        env.reset()
        s = env.unwrapped.reset(s0)
        #s = env.reset()
        ep_r, ep_t, ep_a = 0, 0, []

        while True:
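            # Query the PPO agent for an action and a value estimate at the current state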
            a, v = ppo.evaluate_state(s)
            a = np.squeeze(a)
            s = np.squeeze(s)[np.newaxis, :]  # reshape the state to shape (1, state_dim)