from time import sleep

import torch

# get_state, ACTIONS and the CNN_DQN class are defined elsewhere in the project.
def render_episode(env: TimeLimit, estimator: CNN_DQN):
    """Run one episode with the greedy policy implied by the Q-network and render it."""
    obs = env.reset()
    state = get_state(obs)
    is_done = False
    while not is_done:
        sleep(0.0415)  # throttle rendering to roughly 24 fps

        # Greedy action: index of the largest predicted Q-value.
        action_index = torch.argmax(estimator.predict(state)).item()
        action = ACTIONS[action_index]
        obs, reward, is_done, info = env.step(action)
        state = get_state(obs)
        env.render()
    env.close()
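The snippet above leaves its setup implicit. A minimal sketch of how it might be driven, assuming an Atari-style environment; the environment id and the CNN_DQN constructor arguments are illustrative assumptions, not taken from the snippet:

# Hypothetical driver for render_episode; env id and constructor args are assumed.
import gym
from gym.wrappers import TimeLimit

env = TimeLimit(gym.make('PongDeterministic-v4'), max_episode_steps=10000)
estimator = CNN_DQN(n_actions=len(ACTIONS))  # assumed signature
render_episode(env, estimator)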
Example #2
from time import sleep

import torch

# get_state, ACTIONS, repeat_upsample and the image viewer are defined elsewhere.
def render_episode(env: TimeLimit, estimator: CNN_DQN):
    obs = env.reset()
    state = get_state(obs)
    is_done = False
    while not is_done:
        sleep(0.0415)  # throttle rendering to roughly 24 fps

        # Grab the raw frame, upscale it, and display it in the external viewer.
        rgb = env.render('rgb_array')
        upscaled = repeat_upsample(rgb, 3, 4)
        viewer.imshow(upscaled)

        # Greedy action: index of the largest predicted Q-value.
        action_index = torch.argmax(estimator.predict(state)).item()
        action = ACTIONS[action_index]
        obs, reward, is_done, _ = env.step(action)
        if reward != 0:
            print(reward)  # log non-zero rewards as they arrive
        state = get_state(obs)
    env.close()
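The `viewer` and `repeat_upsample` used above are not shown in the snippet. A plausible sketch of such helpers, assuming the SimpleImageViewer that ships with older gym versions (pyglet-based); both definitions are assumptions, not the original project's code:

# Assumed helpers for the snippet above.
import numpy as np
from gym.envs.classic_control.rendering import SimpleImageViewer

viewer = SimpleImageViewer()

def repeat_upsample(rgb_array, k=1, l=1):
    # Upscale an H x W x 3 frame by repeating each pixel k times vertically
    # and l times horizontally.
    return np.repeat(np.repeat(rgb_array, k, axis=0), l, axis=1)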
Example #3
# keep the right singular vectors of proj (its row space) as the projection matrix
proj = la.svd(proj, full_matrices=False)[2]
enc_dim = proj.shape[0]

# load the saved encoder parameters from the .npz archives
weights = np.load(p_dir + "weights.npz")
biases = np.load(p_dir + "biases.npz")
weights = [v for k, v in weights.items()]
biases = [v for k, v in biases.items()]

saveload_path = "./experiments/learned_controllers/pendulum/{}".format(i)
model = DDPG.load(saveload_path + "model")

# now let's test the model
# specify the test task
n_test_steps = 100

# restart the env
env = TimeLimit(RestartablePendulumEnv(), max_episode_steps=200)
env = EncoderWrapper(env, mlp_encoder, [weights, biases, proj])

# for each of a few random start states, reset the env to that state, then roll the policy forward and render
for k in range(3):
    high = np.array([np.pi, 1])
    start_state = np.random.uniform(low=-high, high=high)
    obs = env.reset(state=start_state)
    for j in range(n_test_steps):
        action, _states = model.predict(obs)
        obs, reward, dones, info = env.step(action)
        env.render()

# clean up and save results
env.close()
del model
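The `RestartablePendulumEnv`, `EncoderWrapper` and `mlp_encoder` above are project-specific. As a generic illustration of the same composition (TimeLimit plus an observation-transforming wrapper) on the stock Pendulum environment, here is a hedged sketch; the wrapper class and the identity projection are purely illustrative:

# Illustrative only: TimeLimit composed with a simple observation wrapper.
import gym
import numpy as np
from gym.wrappers import TimeLimit

class LinearEncoderWrapper(gym.ObservationWrapper):
    def __init__(self, env, proj):
        super().__init__(env)
        self.proj = proj

    def observation(self, obs):
        # Project the raw observation into the encoder's feature space.
        return self.proj @ obs

env = TimeLimit(gym.make('Pendulum-v0').unwrapped, max_episode_steps=200)
env = LinearEncoderWrapper(env, np.eye(3))  # identity projection as a placeholder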
Example #4
def main(k):
    path = './direction_BS_woNorm/150/{}'.format(k)
    if not os.path.exists(path):
        os.makedirs(path)
    ############## Hyperparameters ##############
    env_name = "fishEvasion-v0" # used when creating the environment with gym.make
    render = False              # render the environment in training if true
    # solved_reward = 100         # stop training if avg_reward > solved_reward
    log_interval = 27           # print avg reward in the interval
    max_episodes = 10000        # max training episodes
    max_timesteps = 150         # max timesteps in one episode
    
    update_timestep = 4050      # update policy every n timesteps
    action_std = 0.5            # constant std for action distribution (Multivariate Normal)
    K_epochs = 80               # update policy for K epochs
    eps_clip = 0.2              # clip parameter for PPO
    gamma = 0.99                # discount factor
    
    lr = 0.0003                 # parameters for Adam optimizer
    betas = (0.9, 0.999)
    
    random_seed = None
    #############################################
    
    # creating environment
    env = fish.FishEvasionEnv(dt = 0.1)

    # set the length of an episode
    from gym.wrappers.time_limit import TimeLimit
    env = TimeLimit(env, max_episode_steps=max_timesteps)

    # get observation and action dimensions from the environment
    state_dim = env.observation_space.shape[0]
    action_dim = env.action_space.shape[0]

    if random_seed:
        print("Random Seed: {}".format(random_seed))
        torch.manual_seed(random_seed)
        env.seed(random_seed)
        np.random.seed(random_seed)
    
    memory = Memory()
    ppo = PPO(state_dim, action_dim, action_std, lr, betas, gamma, K_epochs, eps_clip)
    # ------------------------------------------------------------------
    # start training from an existing policy    
    # ppo.policy_old.load_state_dict(torch.load('./direction_policy/PPO_{}_{:06d}.pth'.format(env_name,4380),map_location=device))
    # ppo.policy.load_state_dict(torch.load('./direction_policy/PPO_{}_{:06d}.pth'.format(env_name,4380),map_location=device))
    # ------------------------------------------------------------------
    
    # logging variables
    running_reward = 0
    avg_length = 0
    time_step = 0

    # training loop
    for i_episode in range(1, max_episodes+1):
        # ------------------------------------------------------------------
        # set a specific distribution for beta 
        # beta0 = angle_normalize(i_episode*3,center = 0)
        # print(beta0)
        # ------------------------------------------------------------------
        state = env.reset()
        for t in range(max_timesteps):
            time_step += 1
            # Running policy_old:
            action = ppo.select_action(state, memory)
            state, reward, done, _ = env.step(action)

            # Storing reward and is_terminals:
            memory.rewards.append(reward)
            memory.is_terminals.append(done)
            
            # update if it is time
            # ------------------------------------------------------------------
            if time_step % update_timestep == 0:
                ppo.update(memory)
                memory.clear_memory()
                time_step = 0
            # ------------------------------------------------------------------
            running_reward += reward
            if render:
                env.render()
            # break if episode ends
            if done:
                break
        avg_length += t

        # ------------------------------------------------------------------
        # stop training if avg_reward > solved_reward
        # if running_reward > (log_interval*solved_reward):
        #     print("########## Solved! ##########")
        #     torch.save(ppo.policy.state_dict(), './PPO_continuous_forwardWoPos_solved_{}.pth'.format(env_name))
        #     break
        # ------------------------------------------------------------------
    
        # save every 50 episodes
        if i_episode % 50 == 0:
            torch.save(ppo.policy.state_dict(), path+'/PPO_{}_direction{:06d}.pth'.format(env_name,i_episode)) 

        # ------------------------------------------------------------------
        # logging
        if i_episode % log_interval == 0:
            avg_length = int(avg_length / log_interval)
            running_reward = running_reward / log_interval
            print('Episode {} \t Avg length: {} \t Avg reward: {}'.format(i_episode, avg_length, running_reward))
            running_reward = 0
            avg_length = 0
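The function above defines main(k) but does not show how it is invoked; a minimal, assumed entry point (the range of k is arbitrary and only selects the output directory) might look like:

# Assumed entry point for the training function above.
if __name__ == '__main__':
    for k in range(5):  # k only names the output directory ./direction_BS_woNorm/150/{k}
        main(k)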
class Worker(object):
    def __init__(self,
                 name,
                 globalAC,
                 hard_share=None,
                 soft_sharing_coeff_actor=0.0,
                 soft_sharing_coeff_critic=0.0,
                 gradient_clip_actor=0.0,
                 gradient_clip_critic=0.0,
                 debug=False,
                 max_ep_steps=200,
                 image_shape=None,
                 stack=1):
        self.env = gym.make(GAME).unwrapped
        self.env = TimeLimit(self.env, max_episode_steps=max_ep_steps)
        self.name = name
        self.AC = ACNet(name,
                        globalAC,
                        hard_share=hard_share,
                        soft_sharing_coeff_actor=soft_sharing_coeff_actor,
                        soft_sharing_coeff_critic=soft_sharing_coeff_critic,
                        gradient_clip_actor=gradient_clip_actor,
                        gradient_clip_critic=gradient_clip_critic,
                        image_shape=image_shape,
                        stack=stack)
        self.debug = debug
        self.image_shape = image_shape
        self.stack = stack

    def work(self):
        def get_img(fn, *args):
            # Call fn while holding the render lock, grab an RGB frame from the
            # env, and return the preprocessed (greyscale, resized) image
            # together with fn's return value.
            img_lock.acquire()
            results = fn(*args)
            img = self.env.render(mode='rgb_array')
            img_lock.release()
            img = rgb2grey(img)
            img = resize(img, self.image_shape)
            return img, results

        def env_reset_obs():
            return self.env.reset()

        def env_reset_img():
            img, _ = get_img(env_reset_obs)
            return img

        def env_step_obs(a):
            return self.env.step(a)

        def env_step_img(a):
            img, results = get_img(env_step_obs, a)
            return img, results[1], results[2], results[3]

        if self.image_shape is not None:
            env_reset_fn = env_reset_img
            env_step_fn = env_step_img
        else:
            env_reset_fn = env_reset_obs
            env_step_fn = env_step_obs

        global GLOBAL_RUNNING_R, GLOBAL_R, GLOBAL_EP, MAX_GLOBAL_EP
        total_step = 1
        buffer_s, buffer_a, buffer_r = [], [], []
        while not COORD.should_stop() and GLOBAL_EP < MAX_GLOBAL_EP:
            s = env_reset_fn()
            buffer_s = [s] * self.stack
            ep_r = 0
            while True:
                a = self.AC.choose_action(buffer_s[-self.stack:])
                s_, r, done, info = env_step_fn(a)
                if done:
                    r = -5  # terminal penalty
                ep_r += r
                buffer_s.append(s)
                buffer_a.append(a)
                buffer_r.append(r)

                if total_step % UPDATE_GLOBAL_ITER == 0 or done:  # update global and assign to local net

                    if done:
                        v_s_ = 0  # terminal
                    else:
                        obs_hist = buffer_s[-(self.stack - 1):] + [
                            s_,
                        ]
                        feed_dict = {
                            var: obs[np.newaxis, :]
                            for var, obs in zip(self.AC.s, obs_hist)
                        }
                        v_s_ = SESS.run(self.AC.v, feed_dict=feed_dict)[0, 0]

                    buffer_v_target = []
                    for r in buffer_r[::-1]:  # reverse buffer r
                        v_s_ = r + GAMMA * v_s_
                        buffer_v_target.append(v_s_)
                    buffer_v_target.reverse()

                    if self.image_shape is not None:
                        buffer_s_ = [
                            buffer_s_[np.newaxis, :] for buffer_s_ in buffer_s
                        ]
                    else:
                        buffer_s_ = copy.deepcopy(buffer_s)
                    obs_columns = [
                        np.vstack(buffer_s_[idx:-(self.stack - idx)])
                        for idx in range(self.stack)
                    ]
                    buffer_a, buffer_v_target = np.array(buffer_a), np.vstack(
                        buffer_v_target)
                    feed_dict = {
                        var: obs
                        for var, obs in zip(self.AC.s, obs_columns)
                    }
                    feed_dict[self.AC.a_his] = buffer_a
                    feed_dict[self.AC.v_target] = buffer_v_target
                    if self.debug and self.name == 'W_0':
                        a_loss, c_loss, t_td, c_loss, t_log_prob, t_exp_v, t_entropy, t_exp_v2, a_loss, a_grads, c_grads = self.AC.get_stats(
                            feed_dict)
                        #print("a_loss: ", a_loss.shape, " ", a_loss, "\tc_loss: ", c_loss.shape, " ", c_loss, "\ttd: ", t_td.shape, " ", t_td, "\tlog_prob: ", t_log_prob.shape, " ", t_log_prob, "\texp_v: ", t_exp_v.shape, " ", t_exp_v, "\tentropy: ", t_entropy.shape, " ", t_entropy, "\texp_v2: ", t_exp_v2.shape, " ", t_exp_v2, "\ta_grads: ", [np.sum(weights) for weights in a_grads], "\tc_grads: ", [np.sum(weights) for weights in c_grads])
                        print("a_loss: ", a_loss.shape, " ", a_loss,
                              "\tc_loss: ", c_loss)
                    c_loss, a_loss, entropy = self.AC.update_global(feed_dict)

                    #import ipdb; ipdb.set_trace()
                    buffer_s, buffer_a, buffer_r = buffer_s[-self.stack:], [], []
                    self.AC.pull_global()

                s = s_
                total_step += 1
                if done:
                    GLOBAL_R.append(ep_r)
                    if len(GLOBAL_RUNNING_R) == 0:  # record running episode reward
                        GLOBAL_RUNNING_R.append(ep_r)
                    else:
                        GLOBAL_RUNNING_R.append(0.99 * GLOBAL_RUNNING_R[-1] + 0.01 * ep_r)

                    log_lock.acquire()
                    logger.record_tabular("global_ep", GLOBAL_EP)
                    logger.record_tabular("name", self.name)
                    logger.record_tabular("ep_r", ep_r)
                    logger.record_tabular("ep_r_weighted",
                                          GLOBAL_RUNNING_R[-1])
                    logger.record_tabular("c_loss", c_loss)
                    logger.record_tabular("a_loss", a_loss)
                    logger.record_tabular("entropy", entropy)
                    logger.dump_tabular()
                    log_lock.release()

                    GLOBAL_EP += 1
                    break
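Worker.work() relies on module-level globals (SESS, COORD, GAME, the locks, the logger, and the GLOBAL_* counters) that the snippet does not define. A hedged sketch of the usual launch pattern such A3C code assumes, using TF1-style sessions and threads; GLOBAL_AC and N_WORKERS are assumptions, not part of the snippet:

# Assumed launch code in the style of common A3C examples.
import threading
import tensorflow as tf

SESS = tf.Session()
COORD = tf.train.Coordinator()

with tf.device("/cpu:0"):
    GLOBAL_AC = ACNet('Global_Net', None)  # assumed global network
    workers = [Worker('W_%i' % i, GLOBAL_AC) for i in range(N_WORKERS)]

SESS.run(tf.global_variables_initializer())
worker_threads = []
for worker in workers:
    t = threading.Thread(target=worker.work)
    t.start()
    worker_threads.append(t)
COORD.join(worker_threads)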