Exemple #1
0
    def __init__(self):
        """Load the recorded car data and initialise the environment.

        Reads the ``car_dat`` entry of the ``car_data_formatted_arc``
        MAT-file, applies one random permutation to it (stored as both
        ``data`` and ``data_orig``), resets the bookkeeping attributes,
        creates the control prior and defines the action/observation spaces.
        """
        mat = loadmat('car_data_formatted_arc')

        # A single shared permutation keeps ``data`` and ``data_orig`` aligned.
        recordings = np.copy(np.squeeze(mat['car_dat']))
        order = np.arange(len(recordings))
        random.shuffle(order)
        self.data = recordings[order]
        # NOTE(review): if ``car_dat`` is an object array, np.copy is shallow
        # and ``data_orig`` shares its nested arrays with ``data`` -- confirm.
        self.data_orig = np.copy(np.squeeze(mat['car_dat']))[order]

        # Bookkeeping counters and constants.
        self.count, self.episode = 0, -1
        self.L, self.numCars, self.dt = 100, 5, 0.1
        self.collision_flag = 0

        # The initial state is the first entry of the first shuffled record;
        # elements 9..11 of that entry are kept separately as the bot state.
        first_entry = self.data[0][0]
        self.state = np.copy(np.squeeze(first_entry))
        self.bot_state = np.copy(np.squeeze(first_entry[9:12]))

        self.prior = BasePrior()

        # One-dimensional action bounded to [-7, 3].
        self.action_space = spaces.Box(low=-7.0, high=3.0, shape=(1,))

        # Six-dimensional observation bounded only by the float32 range.
        bound = np.array([np.finfo(np.float32).max] * 6)
        self.observation_space = spaces.Box(-bound, bound)
Exemple #2
0
    def __init__(self, args, sess):
        """Wire together the control prior, the gym environment and the agent.

        Args:
            args: settings object; ``env_name`` is read from it and
                ``max_path_length`` is written onto it.
            sess: session object handed on to the TRPO agent.
        """
        self.args = args
        self.sess = sess

        # Control prior built from the linear dynamics matrices.
        dyn_a, dyn_b = get_linear_dynamics()
        self.prior = BasePrior(dyn_a, dyn_b)

        # The path-length limit is taken from the environment's spec.
        environment = gym.make(args.env_name)
        self.env = environment
        args.max_path_length = environment.spec.timestep_limit

        self.agent = TRPO(args, environment, sess, self.prior)
Exemple #3
0
    def __init__(self):
        """Load the recorded car data and reset the environment bookkeeping.

        Reads the ``car_dat`` entry of the ``car_data_formatted_arc``
        MAT-file, shuffles it with one random permutation (kept as both
        ``data`` and ``data_orig``), initialises the counters and the start
        state, and creates the control prior.
        """
        contents = loadmat('car_data_formatted_arc')

        # Draw the permutation once so both arrays use the same ordering.
        shuffled_source = np.copy(np.squeeze(contents['car_dat']))
        permutation = np.arange(len(shuffled_source))
        random.shuffle(permutation)
        self.data = shuffled_source[permutation]
        reference_source = np.copy(np.squeeze(contents['car_dat']))
        self.data_orig = reference_source[permutation]

        # Counters and constants.
        self.count, self.episode = 0, -1
        self.L, self.numCars, self.dt = 100, 5, 0.1
        self.collision_flag = 0

        # Start from the first entry of the first shuffled record; elements
        # 9..11 of that entry are stored separately as the bot state.
        start = self.data[0][0]
        self.state = np.copy(np.squeeze(start))
        self.bot_state = np.copy(np.squeeze(start[9:12]))

        self.prior = BasePrior()
Exemple #4
0
    def train(self, replay_buffer, minibatch_size):
        """Run one actor/critic update from a sampled minibatch.

        Args:
            replay_buffer: object whose ``sample_batch(n)`` returns the tuple
                ``(s_batch, a_batch, r_batch, t_batch, s2_batch)``.
            minibatch_size: number of transitions to sample.

        Returns:
            The largest Q-value predicted by the critic for the sampled
            batch, so the caller can keep its own running statistic. (The
            previous code tried to accumulate into an undefined local and
            raised ``UnboundLocalError`` on every call.)
        """
        # Sample a batch from the replay buffer
        s_batch, a_batch, r_batch, t_batch, s2_batch = \
            replay_buffer.sample_batch(minibatch_size)

        # Bootstrapped value of the next state from the target networks
        target_q = self.critic.predict_target(
            s2_batch, self.actor.predict_target(s2_batch))

        # Targets: plain reward for terminal transitions, reward plus the
        # discounted target Q otherwise. Everything is flattened first so the
        # result is a regular array; mixing scalars with shape-(1,) arrays in
        # a list is rejected by recent NumPy versions.
        rewards = np.asarray(r_batch, dtype=float).reshape(-1)
        done = np.asarray(t_batch, dtype=bool).reshape(-1)
        next_q = np.asarray(target_q, dtype=float).reshape(-1)
        y_i = np.where(done, rewards, rewards + self.critic.gamma * next_q)

        # Update the critic given the targets
        predicted_q_value, _ = self.critic.train(
            s_batch, a_batch, np.reshape(y_i, (minibatch_size, 1)))
        max_q = np.amax(predicted_q_value)

        # Update the actor policy using the sampled gradient
        a_outs = self.actor.predict(s_batch)
        grads = self.critic.action_gradients(s_batch, a_outs)
        self.actor.train(s_batch, grads[0])

        # Update target networks
        self.actor.update_target_network()
        self.critic.update_target_network()

        return max_q
Exemple #5
0
def train(sess, env, args, actor, critic, actor_noise, reward_result):
    """Train an actor/critic pair whose actions are blended with a control prior.

    Each episode is run twice: once driven only by ``env.getPrior()`` to get
    a baseline return, and once with the blended action
    ``actor_output / (1 + lambda) + lambda / (1 + lambda) * prior`` while the
    networks are updated from replayed transitions.

    Args:
        sess: session used to run the global variable initializer.
        env: environment providing ``reset_inc``, ``reset``, ``getPrior``,
            ``step`` (returning ``(state, reward, done, applied_action)``)
            and a ``collision_flag`` attribute.
        args: mapping with the keys ``'buffer_size'``, ``'random_seed'``,
            ``'max_episodes'``, ``'max_episode_len'`` and
            ``'minibatch_size'``.
        actor: policy network wrapper (``predict``, ``predict_target``,
            ``train``, ``update_target_network``, ``s_dim``, ``a_dim``).
        critic: value network wrapper (``predict_target``, ``train``,
            ``action_gradients``, ``update_target_network``, ``gamma``).
        actor_noise: zero-argument callable returning exploration noise.
        reward_result: 2-D array written in place; for episode ``i`` rows
            0..3 receive the learned return, the baseline return, the mean
            regularization weight and the maximum collision flag.

    Returns:
        ``[summary_ops, summary_vars, paths]`` where ``paths`` holds one dict
        of stacked observations/actions/rewards per episode that terminated.
    """
    # Set up summary Ops
    summary_ops, summary_vars = build_summaries()

    sess.run(tf.global_variables_initializer())

    # Get dynamics and initialize prior controller
    # NOTE(review): this object is not used below (prior actions come from
    # env.getPrior()); kept in case its construction matters -- confirm.
    prior = BasePrior()

    # Initialize target network weights
    actor.update_target_network()
    critic.update_target_network()

    # Initialize replay memory
    replay_buffer = ReplayBuffer(int(args['buffer_size']),
                                 int(args['random_seed']))

    # Needed to enable BatchNorm.
    tflearn.is_training(True)

    paths = list()

    max_episode_len = int(args['max_episode_len'])
    max_episodes = int(args['max_episodes'])
    minibatch_size = int(args['minibatch_size'])

    lambda_store = np.zeros((max_episode_len, 1))

    for i in range(max_episodes):

        s = env.reset_inc()

        ep_reward = 0.
        ep_ave_max_q = 0

        obs, action, act_prior, rewards = [], [], [], []
        obs_ref, prior_ref, collisions = [], [], []

        # Get reward using baseline controller
        ep_reward_opt = 0.
        for kk in range(max_episode_len):
            a = env.getPrior()
            prior_ref.append(np.array([a]))
            s0, r, stop_c, act = env.step(a)
            ep_reward_opt += r
            obs_ref.append(s0)
            if stop_c:
                break

        # Get reward using regRL algorithm
        s = env.reset()

        for j in range(max_episode_len):

            # Set control prior regularization weight
            lambda_mix = 15.
            lambda_store[j] = lambda_mix

            # Get control prior
            a_prior = env.getPrior()

            # Rl control with exploration noise
            ab = actor.predict(np.reshape(s, (1, actor.s_dim))) + actor_noise()

            # Mix the actions (RL controller + control prior)
            act = ab[0] / (1 + lambda_mix) + (lambda_mix /
                                              (1 + lambda_mix)) * a_prior

            # Take action and observe next state/reward; the environment
            # hands back the action it used, which is what gets logged.
            s2, r, terminal, act = env.step(act)
            collisions.append(env.collision_flag)
            act = np.array(act, ndmin=1)

            # Add info from time step to the replay buffer (the raw actor
            # output ``ab`` is stored, not the mixed action)
            replay_buffer.add(np.reshape(s, (actor.s_dim, )),
                              np.reshape(ab, (actor.a_dim, )), r, terminal,
                              np.reshape(s2, (actor.s_dim, )),
                              np.reshape(a_prior, (actor.a_dim, )))

            # Keep adding experience to the memory until
            # there are at least minibatch size samples
            if replay_buffer.size() > minibatch_size:

                # Sample a batch from the replay buffer
                s_batch, a_batch_0, r_batch, t_batch, s2_batch, a_prior_batch = \
                    replay_buffer.sample_batch(minibatch_size)

                a_batch = a_batch_0

                # Calculate targets: reward for terminal transitions, reward
                # plus discounted target Q otherwise. Inputs are flattened so
                # the result is a regular array (a list mixing scalars and
                # shape-(1,) arrays is rejected by recent NumPy versions).
                target_q = critic.predict_target(
                    s2_batch, actor.predict_target(s2_batch))
                batch_rewards = np.asarray(r_batch, dtype=float).reshape(-1)
                batch_done = np.asarray(t_batch, dtype=bool).reshape(-1)
                next_q = np.asarray(target_q, dtype=float).reshape(-1)
                y_i = np.where(batch_done, batch_rewards,
                               batch_rewards + critic.gamma * next_q)

                # Update the critic given the targets
                predicted_q_value, _ = critic.train(
                    s_batch, a_batch,
                    np.reshape(y_i, (minibatch_size, 1)))
                ep_ave_max_q += np.amax(predicted_q_value)

                # Update the actor policy using the sampled gradient
                a_outs = actor.predict(s_batch)
                grads = critic.action_gradients(s_batch, a_outs)
                actor.train(s_batch, grads[0])

                # Update target networks
                actor.update_target_network()
                critic.update_target_network()

            s = s2
            ep_reward += r

            obs.append(s)
            rewards.append(r)
            action.append(act)
            act_prior.append(np.array([a_prior]))

            # Collect results at end of episode
            if terminal:
                # j is zero-based, so j + 1 steps were taken; dividing by j
                # failed when the episode ended on its very first step.
                steps_taken = j + 1
                print('| Reward: {:d} | Episode: {:d} | Qmax: {:.4f}'.format(int(ep_reward-ep_reward_opt), \
                        i, (ep_ave_max_q / float(steps_taken))))
                reward_result[0, i] = ep_reward
                reward_result[1, i] = ep_reward_opt
                # Only the first ``steps_taken`` slots belong to this
                # episode; the rest are zeros or left over from earlier ones.
                reward_result[2, i] = np.mean(lambda_store[:steps_taken])
                reward_result[3, i] = max(collisions)
                path = {
                    "Observation": np.concatenate(obs).reshape((-1, 6)),
                    "Observation_ref": np.concatenate(obs_ref).reshape(
                        (-1, 6)),
                    "Action": np.concatenate(action),
                    "Action_Prior": np.concatenate(act_prior),
                    "Action_Prior_Ref": np.concatenate(prior_ref),
                    "Reward": np.asarray(rewards)
                }
                paths.append(path)

                break

    return [summary_ops, summary_vars, paths]
Exemple #6
0
def train(sess, env, args, actor, critic, actor_noise, reward_result):
    """Train an actor/critic pair whose actions are blended with a control prior.

    Each episode first rolls the environment out with the prior controller
    alone (``prior.getControl_h``) to obtain a baseline return, then restarts
    from the same initial state and rolls out the blended action
    ``actor_output / (1 + lambda) + lambda / (1 + lambda) * prior`` while the
    networks are updated from replayed transitions.

    Args:
        sess: session used to run the global variable initializer.
        env: gym-style environment; ``env.unwrapped.reset(state)`` is used to
            restart from a given state.
        args: mapping with the keys ``'buffer_size'``, ``'random_seed'``,
            ``'max_episodes'``, ``'max_episode_len'`` and
            ``'minibatch_size'``.
        actor: policy network wrapper (``predict``, ``predict_target``,
            ``train``, ``update_target_network``, ``s_dim``, ``a_dim``).
        critic: value network wrapper (``predict_target``, ``train``,
            ``action_gradients``, ``update_target_network``, ``gamma``).
        actor_noise: zero-argument callable returning exploration noise.
        reward_result: 2-D array written in place; for episode ``i`` row 0
            receives the learned return and row 1 the baseline return.

    Returns:
        ``[summary_ops, summary_vars, paths]`` where ``paths`` holds one dict
        of stacked observations/actions/rewards per episode that terminated.
    """
    # Set up summary Ops
    summary_ops, summary_vars = build_summaries()

    sess.run(tf.global_variables_initializer())

    # Get dynamics and initialize prior controller
    [A, B] = get_linear_dynamics()
    prior = BasePrior(A, B)

    # Initialize target network weights
    actor.update_target_network()
    critic.update_target_network()

    # Initialize replay memory
    replay_buffer = ReplayBuffer(int(args['buffer_size']),
                                 int(args['random_seed']))

    paths = list()

    max_episodes = int(args['max_episodes'])
    max_episode_len = int(args['max_episode_len'])
    minibatch_size = int(args['minibatch_size'])

    # Control prior regularization weight (fixed for the whole run)
    lambda_mix = 5.
    prior_weight = lambda_mix / (1 + lambda_mix)

    for i in range(max_episodes):

        s = env.reset()

        ep_reward = 0.
        ep_ave_max_q = 0

        obs, action, rewards = [], [], []

        # Get optimal reward using optimal control
        s0 = np.copy(s)
        ep_reward_opt = 0.
        for kk in range(max_episode_len):
            a_prior = prior.getControl_h(s0)
            s0, r, stop_c, _ = env.step(a_prior)
            ep_reward_opt += r
            if stop_c:
                break

        # Get reward using regRL algorithm, restarting from the same
        # initial state the baseline rollout started from
        env.reset()
        s = env.unwrapped.reset(s)

        for j in range(max_episode_len):

            # Prior control
            a_prior = prior.getControl_h(s)

            # Rl control with exploration noise
            a = actor.predict(np.reshape(s, (1, actor.s_dim))) + actor_noise()

            # Mix the actions (RL controller + control prior)
            act = a[0] / (1 + lambda_mix) + prior_weight * a_prior

            # Take action and observe next state/reward
            s2, r, terminal, _ = env.step(act)

            # Add info from time step to the replay buffer (the raw actor
            # output and the weighted prior action are stored separately)
            replay_buffer.add(
                np.reshape(s, (actor.s_dim, )), np.reshape(a, (actor.a_dim, )),
                r, terminal, np.reshape(s2, (actor.s_dim, )),
                np.reshape(prior_weight * a_prior, (actor.a_dim, )))

            # Keep adding experience to the memory until
            # there are at least minibatch size samples
            if replay_buffer.size() > minibatch_size:

                # Sample a batch from the replay buffer
                s_batch, a_batch_0, r_batch, t_batch, s2_batch, a_prior_batch = \
                    replay_buffer.sample_batch(minibatch_size)

                a_batch = a_batch_0

                # Calculate targets: reward for terminal transitions, reward
                # plus discounted target Q otherwise. Inputs are flattened so
                # the result is a regular array (a list mixing scalars and
                # shape-(1,) arrays is rejected by recent NumPy versions).
                target_q = critic.predict_target(
                    s2_batch, actor.predict_target(s2_batch))
                batch_rewards = np.asarray(r_batch, dtype=float).reshape(-1)
                batch_done = np.asarray(t_batch, dtype=bool).reshape(-1)
                next_q = np.asarray(target_q, dtype=float).reshape(-1)
                y_i = np.where(batch_done, batch_rewards,
                               batch_rewards + critic.gamma * next_q)

                # Update the critic given the targets
                predicted_q_value, _ = critic.train(
                    s_batch, a_batch,
                    np.reshape(y_i, (minibatch_size, 1)))
                ep_ave_max_q += np.amax(predicted_q_value)

                # Update the actor policy using the sampled gradient
                a_outs = actor.predict(s_batch)
                grads = critic.action_gradients(s_batch, a_outs)
                actor.train(s_batch, grads[0])

                # Update target networks
                actor.update_target_network()
                critic.update_target_network()

            s = s2
            ep_reward += r

            obs.append(s)
            rewards.append(r)
            action.append(a[0])

            # Collect results at end of episode
            if terminal:
                obs = [o.reshape((4, 1)) for o in obs]
                # j is zero-based, so j + 1 steps were taken; dividing by j
                # failed when the episode ended on its very first step.
                print('| Reward: {:d} | Episode: {:d} | Qmax: {:.4f}'.format(int(ep_reward - ep_reward_opt), \
                        i, (ep_ave_max_q / float(j + 1))))
                reward_result[0, i] = ep_reward
                reward_result[1, i] = ep_reward_opt
                path = {
                    "Observation": np.concatenate(obs).reshape((-1, 4)),
                    "Action": np.concatenate(action),
                    "Reward": np.asarray(rewards)
                }
                paths.append(path)
                print(ep_reward)
                break

    return [summary_ops, summary_vars, paths]
Exemple #7
0
                               TIMESTAMP)

    #env = gym.make(ENVIRONMENT)
    env = allCars()
    #env = wrappers.Monitor(env, os.path.join(SUMMARY_DIR, ENVIRONMENT), video_callable=None)
    ppo = PPO(env, SUMMARY_DIR, gpu=True)

    if MODEL_RESTORE_PATH is not None:
        ppo.restore_model(MODEL_RESTORE_PATH)

    t, terminal = 0, False
    buffer_s, buffer_a, buffer_r, buffer_v, buffer_terminal = [], [], [], [], []
    rolling_r = RunningStats()

    # Get prior and set tuning parameters for adaptive regularization weight
    prior = BasePrior()
    lambda_store = np.zeros(BATCH + 1)
    lambda_all = np.zeros(EP_MAX + 1)
    lambda_max = 8
    factor = 0.2

    reward_total, reward_diff = [], []

    for episode in range(EP_MAX + 1):

        # Baseline reward using only control prior
        sp = env.reset_inc()
        reward_prior = 0.
        while True:
            a_prior = env.getPrior()
            sp, reward_p, done_p, _ = env.step(a_prior)
Exemple #8
0
    TIMESTAMP = datetime.now().strftime("%Y%m%d-%H%M%S")
    SUMMARY_DIR = os.path.join(OUTPUT_RESULTS_DIR, "PPO", ENVIRONMENT, TIMESTAMP)

    env = gym.make(ENVIRONMENT)
    ppo = PPO(env, SUMMARY_DIR, gpu=True)

    if MODEL_RESTORE_PATH is not None:
        ppo.restore_model(MODEL_RESTORE_PATH)

    t, terminal = 0, False
    buffer_s, buffer_a, buffer_r, buffer_v, buffer_terminal = [], [], [], [], []
    rolling_r = RunningStats()

    # Initialize control prior
    [A,B] = get_linear_dynamics()
    prior = BasePrior(A,B)
    # Set fixed regularization weight
    # lambda_mix = 4.

    reward_total, reward_diff, reward_lqr_prior, reward_h_prior = [], [], [], []

    for episode in range(EP_MAX + 1):

        # Baseline reward using only control prior
        s0 = env.reset()
        sp = np.copy(s0)
        reward_prior = 0.
        while True:
            a_prior = prior.getControl_h(sp)
            a_prior = np.squeeze(np.asarray(a_prior))
            sp, reward_p, done_p, _ = env.step(a_prior)