def initialise_policy(self):

    # initialise policy network
    policy_net = Policy(
        args=self.args,
        # what to pass to the policy (note: this is after the encoder)
        pass_state_to_policy=self.args.pass_state_to_policy,
        pass_latent_to_policy=self.args.pass_latent_to_policy,
        pass_belief_to_policy=self.args.pass_belief_to_policy,
        pass_task_to_policy=self.args.pass_task_to_policy,
        dim_state=self.args.state_dim,
        dim_latent=self.args.latent_dim * 2,
        dim_belief=self.args.belief_dim,
        dim_task=self.args.task_dim,
        # network architecture
        hidden_layers=self.args.policy_layers,
        activation_function=self.args.policy_activation_function,
        policy_initialisation=self.args.policy_initialisation,
        # action distribution
        action_space=self.envs.action_space,
        init_std=self.args.policy_init_std,
    ).to(device)

    # initialise policy trainer
    if self.args.policy == 'a2c':
        policy = A2C(
            self.args,
            policy_net,
            self.args.policy_value_loss_coef,
            self.args.policy_entropy_coef,
            policy_optimiser=self.args.policy_optimiser,
            policy_anneal_lr=self.args.policy_anneal_lr,
            train_steps=self.num_updates,
            optimiser_vae=self.vae.optimiser_vae,
            lr=self.args.lr_policy,
            eps=self.args.policy_eps,
        )
    elif self.args.policy == 'ppo':
        policy = PPO(
            self.args,
            policy_net,
            self.args.policy_value_loss_coef,
            self.args.policy_entropy_coef,
            policy_optimiser=self.args.policy_optimiser,
            policy_anneal_lr=self.args.policy_anneal_lr,
            train_steps=self.num_updates,
            lr=self.args.lr_policy,
            eps=self.args.policy_eps,
            ppo_epoch=self.args.ppo_num_epochs,
            num_mini_batch=self.args.ppo_num_minibatch,
            use_huber_loss=self.args.ppo_use_huberloss,
            use_clipped_value_loss=self.args.ppo_use_clipped_value_loss,
            clip_param=self.args.ppo_clip_param,
            optimiser_vae=self.vae.optimiser_vae,
        )
    else:
        raise NotImplementedError

    return policy
def initialise_policy(self):

    if hasattr(self.envs.action_space, 'low'):
        action_low = self.envs.action_space.low
        action_high = self.envs.action_space.high
    else:
        action_low = action_high = None

    # initialise policy network
    policy_net = Policy(
        args=self.args,
        # what to pass to the policy
        pass_state_to_policy=self.args.pass_state_to_policy,
        pass_latent_to_policy=False,  # use metalearner.py if you want to use the VAE
        pass_belief_to_policy=self.args.pass_belief_to_policy,
        pass_task_to_policy=self.args.pass_task_to_policy,
        dim_state=self.args.state_dim,
        dim_latent=0,
        dim_belief=self.args.belief_dim,
        dim_task=self.args.task_dim,
        # network architecture
        hidden_layers=self.args.policy_layers,
        activation_function=self.args.policy_activation_function,
        policy_initialisation=self.args.policy_initialisation,
        # action distribution
        action_space=self.envs.action_space,
        init_std=self.args.policy_init_std,
        norm_actions_of_policy=self.args.norm_actions_of_policy,
        action_low=action_low,
        action_high=action_high,
    ).to(device)

    # initialise policy trainer
    if self.args.policy == 'a2c':
        policy = A2C(
            self.args,
            policy_net,
            self.args.policy_value_loss_coef,
            self.args.policy_entropy_coef,
            policy_optimiser=self.args.policy_optimiser,
            policy_anneal_lr=self.args.policy_anneal_lr,
            train_steps=self.num_updates,
            lr=self.args.lr_policy,
            eps=self.args.policy_eps,
        )
    elif self.args.policy == 'ppo':
        policy = PPO(
            self.args,
            policy_net,
            self.args.policy_value_loss_coef,
            self.args.policy_entropy_coef,
            policy_optimiser=self.args.policy_optimiser,
            policy_anneal_lr=self.args.policy_anneal_lr,
            train_steps=self.num_updates,
            lr=self.args.lr_policy,
            eps=self.args.policy_eps,
            ppo_epoch=self.args.ppo_num_epochs,
            num_mini_batch=self.args.ppo_num_minibatch,
            use_huber_loss=self.args.ppo_use_huberloss,
            use_clipped_value_loss=self.args.ppo_use_clipped_value_loss,
            clip_param=self.args.ppo_clip_param,
        )
    else:
        raise NotImplementedError

    return policy
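# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the original file): the hasattr check above
# distinguishes continuous (Box) from discrete action spaces, since only Box
# spaces expose per-dimension `low`/`high` bounds. The helper name
# `get_action_bounds` below is made up for illustration.
import numpy as np
from gym import spaces


def get_action_bounds(action_space):
    """Return (low, high) for Box spaces, (None, None) for e.g. Discrete."""
    if hasattr(action_space, 'low'):
        return action_space.low, action_space.high
    return None, None


# Discrete spaces have no bounds; Box spaces do.
assert get_action_bounds(spaces.Discrete(4)) == (None, None)
box_low, box_high = get_action_bounds(spaces.Box(low=-1.0, high=1.0,
                                                 shape=(2,), dtype=np.float32))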
env = gym.make(env_name)
seed = 123
env.seed(seed)
torch.manual_seed(seed)

log_interval = 10
lr = 1e-3

state_dim = env.observation_space.shape[0]
action_dim = env.action_space.n

print('Env Name: %s | Seed: %d | State_dim: %d | Action_dim: %d | Algo: %s' %
      (env_name, seed, state_dim, action_dim, algorithm_name))

if algorithm_name == 'a2c':
    model = A2C(state_dim, action_dim, lr=lr)
elif algorithm_name == 'trajcv':
    model = TrajCVPolicy(state_dim, action_dim, lr=lr)
else:
    raise NotImplementedError('No such algorithm.')

plotter = Plotter("%s_%s_plot" % (algorithm_name, env_name), log_interval)


def main():
    running_reward = 0
    for i_episode in range(501):
        state = env.reset()
        ep_reward = 0
def main():
    # make the environments
    if args.num_envs == 1:
        env = [gym.make(args.env_name)]
    else:
        env = [gym.make(args.env_name) for i in range(args.num_envs)]
    env = MultiGym(env, render=args.render)

    n_states = env.observation_space.shape
    n_actions = env.action_space.n
    print('state shape:', n_states, 'actions:', n_actions)

    policy = ConvPolicy(n_actions).to(device)
    optimizer = optim.RMSprop(policy.parameters(), lr=args.lr)

    if args.algo == 'ppo':
        sys.path.append('../')
        from algorithms.ppo import PPO
        update_algo = PPO(policy=policy,
                          optimizer=optimizer,
                          num_steps=args.num_steps,
                          num_envs=args.num_envs,
                          state_size=(4, 105, 80),
                          entropy_coef=args.entropy,
                          gamma=args.gamma,
                          device=device,
                          epochs=args.ppo_epochs)
    else:
        sys.path.append('../')
        from algorithms.a2c import A2C
        update_algo = A2C(policy=policy,
                          optimizer=optimizer,
                          num_steps=args.num_steps,
                          num_envs=args.num_envs,
                          state_size=(4, 105, 80),
                          entropy_coef=args.entropy,
                          gamma=args.gamma,
                          device=device)

    end_rewards = []
    try:
        print('starting episodes')
        idx = 0
        d = False
        reward_sum = np.zeros((args.num_envs))
        restart = True

        frame = env.reset()
        mask = torch.ones(args.num_envs)

        all_start = time.time()
        for update_idx in range(args.num_updates):
            update_algo.policy.train()

            # stack the frames
            s = train_state_proc.proc_state(frame, mask=mask)

            # insert state before getting actions
            update_algo.states[0].copy_(s)

            start = time.time()
            for step in range(args.num_steps):
                with torch.no_grad():
                    # get probability dist and values
                    p, v = update_algo.policy(update_algo.states[step])
                    a = Categorical(p).sample()

                # take the action and get the environment response
                frame, r, d = env.step(a.cpu().numpy() if args.num_envs > 1 else [a.item()])

                s = train_state_proc.proc_state(frame, mask)
                update_algo.insert_experience(step=step, s=s, a=a, v=v, r=r, d=d)

                mask = torch.tensor(1. - d).float()
                reward_sum = (reward_sum + r)

                # if any episode finished, append its return to the list
                if d.any():
                    end_rewards.extend(reward_sum[d])

                # reset the running returns of finished episodes
                reward_sum = reward_sum * mask.numpy()

            idx += 1

            with torch.no_grad():
                _, next_val = update_algo.policy(update_algo.states[-1])

            update_algo.update(next_val.view(1, args.num_envs).to(device),
                               next_mask=mask.to(device))

            if args.lr_decay:
                # cosine learning-rate annealing from args.lr down to lr_min
                # (lr_min is defined elsewhere in the original script)
                for params in update_algo.optimizer.param_groups:
                    params['lr'] = (lr_min + 0.5 * (args.lr - lr_min) *
                                    (1 + np.cos(np.pi * idx / args.num_updates)))

            # periodically log training progress to the terminal
            if (update_idx % args.log_interval == 0) and (len(end_rewards) > 0):
                total_steps = (idx + 1) * args.num_envs * args.num_steps
                end = time.time()
                print(end_rewards[-10:])
                print('Updates {}\t Time: {:.4f} \t FPS: {}'.format(
                    update_idx, end - start, int(total_steps / (end - all_start))))
                print('Mean Episode Rewards: {:.2f} \t Min/Max Current Rewards: {}/{}'.format(
                    np.mean(end_rewards[-10:]), reward_sum.min(), reward_sum.max()))
    except KeyboardInterrupt:
        pass

    torch.save(update_algo.policy.state_dict(),
               '../model_weights/{}_{}_conv.pth'.format(args.env_name, args.algo))

    import pandas as pd
    out_dict = {'avg_end_rewards': end_rewards}
    out_log = pd.DataFrame(out_dict)
    out_log.to_csv('../logs/{}_{}_rewards.csv'.format(args.env_name, args.algo),
                   index=False)

    out_dict = {
        'actor losses': update_algo.actor_losses,
        'critic losses': update_algo.critic_losses,
        'entropy': update_algo.entropy_logs
    }
    out_log = pd.DataFrame(out_dict)
    out_log.to_csv('../logs/{}_{}_training_behavior.csv'.format(args.env_name, args.algo),
                   index=False)

    plt.plot(end_rewards)
    plt.show()
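# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the original file): main() above reads its
# configuration from a global `args` object. A possible argparse setup
# covering the flags main() actually uses is sketched below; all default
# values are assumptions, not taken from the original script.
import argparse


def build_arg_parser():
    parser = argparse.ArgumentParser(description='A2C/PPO Atari training (sketch)')
    parser.add_argument('--env_name', type=str, default='PongNoFrameskip-v4')
    parser.add_argument('--algo', type=str, default='a2c', choices=['a2c', 'ppo'])
    parser.add_argument('--num_envs', type=int, default=8)
    parser.add_argument('--num_steps', type=int, default=5)
    parser.add_argument('--num_updates', type=int, default=100000)
    parser.add_argument('--lr', type=float, default=7e-4)
    parser.add_argument('--lr_decay', action='store_true')
    parser.add_argument('--entropy', type=float, default=0.01)
    parser.add_argument('--gamma', type=float, default=0.99)
    parser.add_argument('--ppo_epochs', type=int, default=4)
    parser.add_argument('--log_interval', type=int, default=10)
    parser.add_argument('--render', action='store_true')
    return parser


# args = build_arg_parser().parse_args()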
def worker(worker_id, algorithm_name, seed, return_dict):
    print('Worker %d (pid: %d) has started: algorithm_name <%s> seed <%d>.' %
          (worker_id, os.getpid(), algorithm_name, seed))

    env = gym.make(env_name)
    env.seed(seed)
    torch.manual_seed(seed)

    state_dim = env.observation_space.shape[0]
    action_dim = env.action_space.n

    if algorithm_name == 'a2c':
        model = A2C(state_dim, action_dim, lr=lr, gamma=gamma,
                    v_update_epochs=v_update_epochs,
                    v_update_anneal=v_update_anneal,
                    epsilon_greedy_threshold=epsilon_greedy_threshold,
                    epsilon_anneal=epsilon_anneal,
                    re_sample_batch_size=re_sample_batch_size)
    elif algorithm_name == 'trajcv':
        model = TrajCVPolicy(state_dim, action_dim, lr=lr, gamma=gamma,
                             v_update_epochs=v_update_epochs,
                             v_update_anneal=v_update_anneal,
                             epsilon_greedy_threshold=epsilon_greedy_threshold,
                             epsilon_anneal=epsilon_anneal,
                             re_sample_batch_size=re_sample_batch_size)
    else:
        raise NotImplementedError('No such algorithm.')

    reward_records = []
    running_reward = 0
    for i_episode in range(400):
        state = env.reset()
        ep_reward = 0
        for t in range(1, 10000):
            action = model.select_action(state)
            state, reward, done, _ = env.step(action)
            model.save_reward(reward)
            model.save_state_action(state, action, done)
            ep_reward += reward
            if done:
                break

        # exponential moving average of the episode return
        running_reward = 0.05 * ep_reward + (1 - 0.05) * running_reward
        reward_records.append(running_reward)
        model.finish_episode()

        if i_episode % log_interval == 0:
            print('{:>10}(seed:{:>4}-worker_id:{:>2})|Episode {}\tLast reward: {:.2f}\tAverage reward: {:.2f}'
                  .format(algorithm_name, seed, worker_id, i_episode, ep_reward, running_reward))

    env.close()
    return_dict[worker_id] = reward_records
    print('Worker %d has ended.' % worker_id)
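# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the original file): worker() writes its
# reward curve into `return_dict`, which suggests it is meant to be launched
# via multiprocessing with a managed dict. The launcher below is an assumed
# usage pattern; the seed list and worker count are arbitrary choices.
import multiprocessing as mp


def run_workers(algorithm_name, seeds=(123, 231, 312)):
    manager = mp.Manager()
    return_dict = manager.dict()  # shared dict the workers write into
    jobs = []
    for worker_id, seed in enumerate(seeds):
        p = mp.Process(target=worker,
                       args=(worker_id, algorithm_name, seed, return_dict))
        p.start()
        jobs.append(p)
    for p in jobs:
        p.join()
    # worker_id -> list of running rewards
    return dict(return_dict)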
def initialise_policy(self):

    # variables for task encoder (used for oracle)
    state_dim = self.envs.observation_space.shape[0]

    # TODO: this isn't ideal, find a nicer way to get the task dimension!
    if 'BeliefOracle' in self.args.env_name:
        task_dim = gym.make(self.args.env_name).observation_space.shape[0] - \
                   gym.make(self.args.env_name.replace('BeliefOracle', '')).observation_space.shape[0]
        latent_dim = self.args.latent_dim
        state_embedding_size = self.args.state_embedding_size
        use_task_encoder = True
    elif 'Oracle' in self.args.env_name:
        task_dim = gym.make(self.args.env_name).observation_space.shape[0] - \
                   gym.make(self.args.env_name.replace('Oracle', '')).observation_space.shape[0]
        latent_dim = self.args.latent_dim
        state_embedding_size = self.args.state_embedding_size
        use_task_encoder = True
    else:
        task_dim = latent_dim = state_embedding_size = 0
        use_task_encoder = False

    # initialise rollout storage for the policy
    self.policy_storage = OnlineStorage(
        self.args,
        self.args.policy_num_steps,
        self.args.num_processes,
        self.args.obs_dim,
        self.args.act_space,
        hidden_size=0,
        latent_dim=self.args.latent_dim,
        normalise_observations=self.args.norm_obs_for_policy,
        normalise_rewards=self.args.norm_rew_for_policy,
    )

    if hasattr(self.envs.action_space, 'low'):
        action_low = self.envs.action_space.low
        action_high = self.envs.action_space.high
    else:
        action_low = action_high = None

    # initialise policy network
    policy_net = Policy(
        # general
        state_dim=int(self.args.condition_policy_on_state) * state_dim,
        action_space=self.envs.action_space,
        init_std=self.args.policy_init_std,
        hidden_layers=self.args.policy_layers,
        activation_function=self.args.policy_activation_function,
        use_task_encoder=use_task_encoder,
        # task encoding things (for oracle)
        task_dim=task_dim,
        latent_dim=latent_dim,
        state_embed_dim=state_embedding_size,
        # action normalisation
        normalise_actions=self.args.normalise_actions,
        action_low=action_low,
        action_high=action_high,
    ).to(device)

    # initialise policy
    if self.args.policy == 'a2c':
        # initialise policy trainer (A2C)
        self.policy = A2C(
            policy_net,
            self.args.policy_value_loss_coef,
            self.args.policy_entropy_coef,
            lr=self.args.lr_policy,
            eps=self.args.policy_eps,
            alpha=self.args.a2c_alpha,
        )
    elif self.args.policy == 'ppo':
        # initialise policy trainer (PPO)
        self.policy = PPO(
            policy_net,
            self.args.policy_value_loss_coef,
            self.args.policy_entropy_coef,
            lr=self.args.lr_policy,
            eps=self.args.policy_eps,
            ppo_epoch=self.args.ppo_num_epochs,
            num_mini_batch=self.args.ppo_num_minibatch,
            use_huber_loss=self.args.ppo_use_huberloss,
            use_clipped_value_loss=self.args.ppo_use_clipped_value_loss,
            clip_param=self.args.ppo_clip_param,
        )
    else:
        raise NotImplementedError
                      state_size=n_states,
                      entropy_coef=args.entropy,
                      gamma=args.gamma,
                      device=device,
                      recurrent=True,
                      rnn_size=args.hid_size,
                      epochs=args.ppo_epochs,
                      batch_size=args.batch_size)
else:
    sys.path.append('../')
    from algorithms.a2c import A2C
    update_algo = A2C(policy=policy,
                      optimizer=optimizer,
                      num_steps=args.num_steps,
                      num_envs=args.num_envs,
                      state_size=n_states,
                      entropy_coef=args.entropy,
                      gamma=args.gamma,
                      device=device,
                      recurrent=True,
                      rnn_size=args.hid_size)

end_rewards = []
gt = 0


def main():
    try:
        print('starting episodes')
        d = False
        idx = 0
        episodes = 0