def render_episode(env: TimeLimit, estimator: CNN_DQN):
    obs = env.reset()
    state = get_state(obs)
    is_done = False
    while not is_done:
        sleep(0.0415)  # ~24 fps
        action_index = torch.argmax(estimator.predict(state)).item()
        action = ACTIONS[action_index]
        obs, reward, is_done, info = env.step(action)
        state = get_state(obs)
        env.render()
    env.close()
def render_episode(env: TimeLimit, estimator: CNN_DQN):
    obs = env.reset()
    state = get_state(obs)
    is_done = False
    while not is_done:
        sleep(0.0415)  # ~24 fps
        rgb = env.render('rgb_array')
        upscaled = repeat_upsample(rgb, 3, 4)
        viewer.imshow(upscaled)
        action_index = torch.argmax(estimator.predict(state)).item()
        action = ACTIONS[action_index]
        obs, reward, is_done, _ = env.step(action)
        if reward != 0:
            print(reward)
        state = get_state(obs)
    env.close()
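# A minimal sketch of the helpers both render_episode variants assume (these are
# assumptions about the original project, not its actual code): get_state turns a
# raw RGB observation into a batched, channels-first float tensor for the CNN
# estimator, and repeat_upsample enlarges a rendered frame for easier viewing.
import numpy as np
import torch

def get_state(obs):
    # (H, W, C) uint8 frame -> (1, C, H, W) float tensor in [0, 1]
    state = np.transpose(obs, (2, 0, 1)).astype(np.float32) / 255.0
    return torch.from_numpy(state).unsqueeze(0)

def repeat_upsample(rgb_array, k, l):
    # nearest-neighbour upscaling: repeat pixels k times vertically, l times horizontally
    return np.repeat(np.repeat(rgb_array, k, axis=0), l, axis=1)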
proj = la.svd(proj, full_matrices=False)[2]
enc_dim = proj.shape[0]

weights = np.load(p_dir + "weights.npz")
biases = np.load(p_dir + "biases.npz")
weights = [v for k, v in weights.items()]
biases = [v for k, v in biases.items()]

saveload_path = "./experiments/learned_controllers/pendulum/{}".format(i)
model = DDPG.load(saveload_path + "model")

# now let's test the model
# specify the test task
n_test_steps = 100

# restart the env
env = TimeLimit(RestartablePendulumEnv(), max_episode_steps=200)
env = EncoderWrapper(env, mlp_encoder, [weights, biases, proj])

# for each test state, start the env in the state, then run forward and collect rewards
for k in range(3):
    high = np.array([np.pi, 1])
    start_state = np.random.uniform(low=-high, high=high)
    obs = env.reset(state=start_state)
    for j in range(n_test_steps):
        action, _states = model.predict(obs)
        obs, reward, dones, info = env.step(action)
        env.render()

# clean up and save results
env.close()
del model
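# A hypothetical sketch of the mlp_encoder passed to EncoderWrapper above (the
# real encoder lives elsewhere in this codebase): the loaded weight/bias lists
# define a small MLP, and proj (the right singular vectors from the SVD) projects
# the features onto an enc_dim-dimensional subspace. The activation and the
# dimension conventions here are assumptions.
def mlp_encoder(obs, params):
    weights, biases, proj = params
    h = obs
    for W, b in zip(weights, biases):
        h = np.tanh(h @ W + b)  # assumed tanh hidden activations
    return proj @ h             # project features down to enc_dim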
def main(k):
    path = './direction_BS_woNorm/150/{}'.format(k)
    if not os.path.exists(path):
        os.makedirs(path)

    ############## Hyperparameters ##############
    env_name = "fishEvasion-v0"  # used when creating the environment with gym.make
    render = False               # render the environment in training if true
    # solved_reward = 100        # stop training if avg_reward > solved_reward
    log_interval = 27            # print avg reward in the interval
    max_episodes = 10000         # max training episodes
    max_timesteps = 150          # max timesteps in one episode

    update_timestep = 4050       # update policy every n timesteps
    action_std = 0.5             # constant std for action distribution (Multivariate Normal)
    K_epochs = 80                # update policy for K epochs
    eps_clip = 0.2               # clip parameter for PPO
    gamma = 0.99                 # discount factor

    lr = 0.0003                  # parameters for Adam optimizer
    betas = (0.9, 0.999)

    random_seed = None
    #############################################

    # creating environment
    env = fish.FishEvasionEnv(dt=0.1)
    # set the length of an episode
    from gym.wrappers.time_limit import TimeLimit
    env = TimeLimit(env, max_episode_steps=max_timesteps)

    # get observation and action dimensions from the environment
    state_dim = env.observation_space.shape[0]
    action_dim = env.action_space.shape[0]

    if random_seed:
        print("Random Seed: {}".format(random_seed))
        torch.manual_seed(random_seed)
        env.seed(random_seed)
        np.random.seed(random_seed)

    memory = Memory()
    ppo = PPO(state_dim, action_dim, action_std, lr, betas, gamma, K_epochs, eps_clip)

    # ------------------------------------------------------------------
    # start training from an existing policy
    # ppo.policy_old.load_state_dict(torch.load('./direction_policy/PPO_{}_{:06d}.pth'.format(env_name, 4380), map_location=device))
    # ppo.policy.load_state_dict(torch.load('./direction_policy/PPO_{}_{:06d}.pth'.format(env_name, 4380), map_location=device))
    # ------------------------------------------------------------------

    # logging variables
    running_reward = 0
    avg_length = 0
    time_step = 0

    # training loop
    for i_episode in range(1, max_episodes + 1):
        # ------------------------------------------------------------------
        # set a specific distribution for beta
        # beta0 = angle_normalize(i_episode*3, center=0)
        # print(beta0)
        # ------------------------------------------------------------------
        state = env.reset()
        for t in range(max_timesteps):
            time_step += 1
            # Running policy_old:
            action = ppo.select_action(state, memory)
            state, reward, done, _ = env.step(action)

            # Storing reward and is_terminals:
            memory.rewards.append(reward)
            memory.is_terminals.append(done)

            # update if it is time
            # ------------------------------------------------------------------
            if time_step % update_timestep == 0:
                ppo.update(memory)
                memory.clear_memory()
                time_step = 0
            # ------------------------------------------------------------------
            running_reward += reward
            if render:
                env.render()
            # break if episode ends
            if done:
                break
        avg_length += t

        # ------------------------------------------------------------------
        # stop training if avg_reward > solved_reward
        # if running_reward > (log_interval * solved_reward):
        #     print("########## Solved! ##########")
        #     torch.save(ppo.policy.state_dict(), './PPO_continuous_forwardWoPos_solved_{}.pth'.format(env_name))
        #     break
        # ------------------------------------------------------------------

        # save every 50 episodes
        if i_episode % 50 == 0:
            torch.save(ppo.policy.state_dict(), path + '/PPO_{}_direction{:06d}.pth'.format(env_name, i_episode))
        # ------------------------------------------------------------------

        # logging
        if i_episode % log_interval == 0:
            avg_length = int(avg_length / log_interval)
            running_reward = running_reward / log_interval

            print('Episode {} \t Avg length: {} \t Avg reward: {}'.format(i_episode, avg_length, running_reward))
            running_reward = 0
            avg_length = 0
class Worker(object):
    def __init__(self, name, globalAC, hard_share=None,
                 soft_sharing_coeff_actor=0.0, soft_sharing_coeff_critic=0.0,
                 gradient_clip_actor=0.0, gradient_clip_critic=0.0,
                 debug=False, max_ep_steps=200, image_shape=None, stack=1):
        self.env = gym.make(GAME).unwrapped
        self.env = TimeLimit(self.env, max_episode_steps=max_ep_steps)
        self.name = name
        self.AC = ACNet(name, globalAC, hard_share=hard_share,
                        soft_sharing_coeff_actor=soft_sharing_coeff_actor,
                        soft_sharing_coeff_critic=soft_sharing_coeff_critic,
                        gradient_clip_actor=gradient_clip_actor,
                        gradient_clip_critic=gradient_clip_critic,
                        image_shape=image_shape, stack=stack)
        self.debug = debug
        self.image_shape = image_shape
        self.stack = stack

    def work(self):
        def get_img(fn, *args):
            img_lock.acquire()
            results = fn(*args)
            img = self.env.render(mode='rgb_array')
            img_lock.release()
            img = rgb2grey(img)
            img = resize(img, self.image_shape)
            return img, results

        def env_reset_obs():
            return self.env.reset()

        def env_reset_img():
            img, _ = get_img(env_reset_obs)
            return img

        def env_step_obs(a):
            return self.env.step(a)

        def env_step_img(a):
            img, results = get_img(env_step_obs, a)
            return img, results[1], results[2], results[3]

        if self.image_shape is not None:
            env_reset_fn = env_reset_img
            env_step_fn = env_step_img
        else:
            env_reset_fn = env_reset_obs
            env_step_fn = env_step_obs

        global GLOBAL_RUNNING_R, GLOBAL_R, GLOBAL_EP, MAX_GLOBAL_EP
        total_step = 1
        buffer_s, buffer_a, buffer_r = [], [], []
        while not COORD.should_stop() and GLOBAL_EP < MAX_GLOBAL_EP:
            s = env_reset_fn()
            buffer_s = [s] * self.stack
            ep_r = 0
            while True:
                a = self.AC.choose_action(buffer_s[-self.stack:])
                s_, r, done, info = env_step_fn(a)
                if done:
                    r = -5
                ep_r += r
                buffer_s.append(s)
                buffer_a.append(a)
                buffer_r.append(r)

                if total_step % UPDATE_GLOBAL_ITER == 0 or done:
                    # update global and assign to local net
                    if done:
                        v_s_ = 0  # terminal
                    else:
                        obs_hist = buffer_s[-(self.stack - 1):] + [s_, ]
                        feed_dict = {
                            var: obs[np.newaxis, :]
                            for var, obs in zip(self.AC.s, obs_hist)
                        }
                        v_s_ = SESS.run(self.AC.v, feed_dict=feed_dict)[0, 0]
                    buffer_v_target = []
                    for r in buffer_r[::-1]:  # reverse buffer r
                        v_s_ = r + GAMMA * v_s_
                        buffer_v_target.append(v_s_)
                    buffer_v_target.reverse()

                    if self.image_shape is not None:
                        buffer_s_ = [
                            buffer_s_[np.newaxis, :] for buffer_s_ in buffer_s
                        ]
                    else:
                        buffer_s_ = copy.deepcopy(buffer_s)
                    obs_columns = [
                        np.vstack(buffer_s_[idx:-(self.stack - idx)])
                        for idx in range(self.stack)
                    ]

                    buffer_a, buffer_v_target = np.array(buffer_a), np.vstack(buffer_v_target)
                    feed_dict = {
                        var: obs
                        for var, obs in zip(self.AC.s, obs_columns)
                    }
                    feed_dict[self.AC.a_his] = buffer_a
                    feed_dict[self.AC.v_target] = buffer_v_target
                    if self.debug and self.name == 'W_0':
                        a_loss, c_loss, t_td, c_loss, t_log_prob, t_exp_v, t_entropy, t_exp_v2, a_loss, a_grads, c_grads = self.AC.get_stats(feed_dict)
                        # print("a_loss: ", a_loss.shape, " ", a_loss, "\tc_loss: ", c_loss.shape, " ", c_loss, "\ttd: ", t_td.shape, " ", t_td, "\tlog_prob: ", t_log_prob.shape, " ", t_log_prob, "\texp_v: ", t_exp_v.shape, " ", t_exp_v, "\tentropy: ", t_entropy.shape, " ", t_entropy, "\texp_v2: ", t_exp_v2.shape, " ", t_exp_v2, "\ta_grads: ", [np.sum(weights) for weights in a_grads], "\tc_grads: ", [np.sum(weights) for weights in c_grads])
                        print("a_loss: ", a_loss.shape, " ", a_loss, "\tc_loss: ", c_loss)
                    c_loss, a_loss, entropy = self.AC.update_global(feed_dict)
                    # import ipdb; ipdb.set_trace()
                    buffer_s, buffer_a, buffer_r = buffer_s[-(self.stack):], [], []
                    self.AC.pull_global()

                s = s_
                total_step += 1
                if done:
                    GLOBAL_R.append(ep_r)
                    if len(GLOBAL_RUNNING_R) == 0:  # record running episode reward
                        GLOBAL_RUNNING_R.append(ep_r)
                    else:
                        GLOBAL_RUNNING_R.append(0.99 * GLOBAL_RUNNING_R[-1] + 0.01 * ep_r)
                    log_lock.acquire()
                    logger.record_tabular("global_ep", GLOBAL_EP)
                    logger.record_tabular("name", self.name)
                    logger.record_tabular("ep_r", ep_r)
                    logger.record_tabular("ep_r_weighted", GLOBAL_RUNNING_R[-1])
                    logger.record_tabular("c_loss", c_loss)
                    logger.record_tabular("a_loss", a_loss)
                    logger.record_tabular("entropy", entropy)
                    logger.dump_tabular()
                    log_lock.release()
                    GLOBAL_EP += 1
                    break
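# A rough sketch of a driver that launches workers like the one above (an
# assumption about the surrounding script, not code taken from it): one global
# ACNet, one Worker per CPU core, and a tf.train.Coordinator tying the threads
# together. The ACNet constructor call for the global network, and globals such
# as GAME, UPDATE_GLOBAL_ITER, GAMMA, and MAX_GLOBAL_EP, are assumed to be
# defined elsewhere in the module.
import multiprocessing
import threading
import tensorflow as tf

if __name__ == '__main__':
    SESS = tf.Session()
    img_lock, log_lock = threading.Lock(), threading.Lock()
    GLOBAL_R, GLOBAL_RUNNING_R, GLOBAL_EP = [], [], 0
    with tf.device('/cpu:0'):
        GLOBAL_AC = ACNet('Global_Net')  # the shared network the workers push to / pull from
        workers = [Worker('W_{}'.format(i), GLOBAL_AC)
                   for i in range(multiprocessing.cpu_count())]
    COORD = tf.train.Coordinator()
    SESS.run(tf.global_variables_initializer())
    threads = []
    for worker in workers:
        t = threading.Thread(target=worker.work)
        t.start()
        threads.append(t)
    COORD.join(threads)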