def rollout(self, net):
    """
    rollout handles the actual rollout of the environment for n steps in
    time.

    net - torch Module object. This is the model to interact with the
        environment.
    """
    net.eval()
    state = next_state(self.env,
                       self.obs_deque,
                       obs=None,
                       reset=True,
                       preprocess=self.hyps['preprocess'])
    ep_rew = 0
    hyps = self.hyps
    is_recurrent = hasattr(net, "fresh_h")
    if not is_recurrent:
        h = None
    else:
        h = net.fresh_h()
    t = 0
    episode_count = 1
    while t <= 400:
        t += 1
        state = cuda_if(torch.FloatTensor(state))
        if is_recurrent:
            val, logits, h = net(state[None], h=cuda_if(h.detach().data))
        else:
            val, logits = net(state[None])
        if self.hyps['discrete_env']:
            probs = F.softmax(logits, dim=-1)
            action = sample_action(probs.data)
            action = int(action.item())
        else:
            mu, sig = logits
            action = mu + torch.randn_like(sig) * sig
            action = action.cpu().detach().numpy().squeeze()
            if len(action.shape) == 0:
                action = np.asarray([float(action)])
        obs, rew, done, info = self.env.step(action + hyps['action_shift'])
        if hyps['render']:
            self.env.render()
        ep_rew += rew
        reset = done
        if "Pong" in hyps['env_type'] and rew != 0:
            done = True
        if done:
            episode_count += 1
        state = next_state(self.env,
                           self.obs_deque,
                           obs=obs,
                           reset=reset,
                           preprocess=hyps['preprocess'])
    return ep_rew / episode_count, ep_rew / t
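# NOTE: `cuda_if`, `sample_action`, and `next_state` are project helpers that are not
# shown in these snippets. A minimal sketch of the first two, under the assumption that
# `cuda_if` simply moves a tensor/module onto the GPU when one is available and
# `sample_action` draws a single index from a categorical distribution:
import torch

def cuda_if(tensor_or_module, use_cuda=None):
    """Move a tensor or module to the GPU when CUDA is available (sketch)."""
    if use_cuda is None:
        use_cuda = torch.cuda.is_available()
    return tensor_or_module.cuda() if use_cuda else tensor_or_module

def sample_action(probs):
    """Sample one action index from a 1 x n_actions probability tensor (sketch)."""
    return torch.multinomial(probs.view(-1), 1)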
def run(self, net):
    """
    run is the entry function to begin collecting rollouts from the
    environment using the specified net. gate_q indicates when to begin
    collecting a rollout and is controlled from the main process. The
    stop_q is used to indicate to the main process that a new rollout has
    been collected.

    net - torch Module object. This is the model to interact with the
        environment.
    """
    self.net = net
    self.env = gym.make(self.hyps['env_type'])
    state = next_state(self.env,
                       self.obs_deque,
                       obs=None,
                       reset=True,
                       preprocess=self.hyps['preprocess'])
    self.state_bookmark = state
    self.h_bookmark = None
    if self.net.is_recurrent:
        self.h_bookmark = Variable(cuda_if(torch.zeros(1, self.net.h_size)))
    self.ep_rew = 0
    #self.net.train(mode=False) # fixes potential batchnorm and dropout issues
    for p in self.net.parameters():  # Turn off gradient collection
        p.requires_grad = False
    while True:
        idx = self.gate_q.get()  # Opened from main process
        self.rollout(self.net, idx, self.hyps)
        self.stop_q.put(idx)  # Signals to main process that data has been collected
def test(self):
    ob = self.test_env.reset()
    done = False
    ep_reward = 0
    last_action = np.array([-1])
    action_repeat = 0
    while not done:
        ob = np.array(ob)
        ob = torch.from_numpy(ob.transpose((2, 0, 1))).float().unsqueeze(0)
        ob = Variable(ob / 255., volatile=True)
        ob = cuda_if(ob, self.cuda)
        pi, v = self.policy(ob)
        _, action = torch.max(pi, dim=1)

        # abort after {self.test_repeat_max} discrete action repeats
        if action.data[0] == last_action.data[0]:
            action_repeat += 1
            if action_repeat == self.test_repeat_max:
                return ep_reward
        else:
            action_repeat = 0
        last_action = action

        ob, reward, done, _ = self.test_env.step(action.data.cpu().numpy())
        ep_reward += reward
    return ep_reward
def interact(self):
    """ Interacts with the environment

    Returns:
        obs (FloatTensor): observations shaped [T + 1 x N x ...]
        rewards (FloatTensor): rewards shaped [T x N x 1]
        masks (FloatTensor): continuation masks shaped [T x N x 1]
            zero at done timesteps, one otherwise
        actions (LongTensor): discrete actions shaped [T x N x 1]
        steps (int): total number of steps taken
    """
    N = self.num_workers
    T = self.worker_steps

    # TEMP needs to be generalized, does conv-specific transpose for PyTorch
    obs = torch.zeros(T + 1, N, 4, 84, 84)
    obs = cuda_if(obs, self.cuda)
    rewards = torch.zeros(T, N, 1)
    rewards = cuda_if(rewards, self.cuda)
    masks = torch.zeros(T, N, 1)
    masks = cuda_if(masks, self.cuda)
    actions = torch.zeros(T, N, 1).long()
    actions = cuda_if(actions, self.cuda)

    for t in range(T):
        # interaction logic
        ob = torch.from_numpy(self.last_ob.transpose((0, 3, 1, 2))).float()
        ob = Variable(ob / 255.)
        ob = cuda_if(ob, self.cuda)
        obs[t] = ob.data

        pi, v = self.policy(ob)
        u = cuda_if(torch.rand(pi.size()), self.cuda)
        _, action = torch.max(pi.data - (-u.log()).log(), 1)
        action = action.unsqueeze(1)
        actions[t] = action

        self.last_ob, reward, done, _ = self.venv.step(action.cpu().numpy())
        reward = torch.from_numpy(reward).unsqueeze(1)
        rewards[t] = torch.clamp(reward, min=-1., max=1.)
        masks[t] = mask = torch.from_numpy((1. - done)).unsqueeze(1)

    ob = torch.from_numpy(self.last_ob.transpose((0, 3, 1, 2))).float()
    ob = Variable(ob / 255.)
    ob = cuda_if(ob, self.cuda)
    obs[T] = ob.data

    steps = N * T
    return obs, rewards, masks, actions, steps
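# The sampling line above, `torch.max(pi.data - (-u.log()).log(), 1)`, appears to be
# the Gumbel-max trick: subtracting log(-log u) for u ~ Uniform(0, 1) adds Gumbel(0, 1)
# noise to the scores, and taking the argmax then samples from the categorical
# distribution defined by `pi` (assuming `pi` holds unnormalized log-probabilities).
# A small self-contained check of that equivalence:
import torch
import torch.nn.functional as F

logits = torch.tensor([1.0, 0.0, -1.0])
n = 100_000
u = torch.rand(n, logits.numel())
samples = torch.argmax(logits - (-u.log()).log(), dim=1)
empirical = torch.bincount(samples, minlength=logits.numel()).float() / n
print(empirical)                 # roughly matches ...
print(F.softmax(logits, dim=0))  # ... the softmax probabilities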
def gae(self, rewards, values, next_vals, dones, gamma, lambda_):
    """
    Performs Generalized Advantage Estimation

    rewards - torch FloatTensor of actual rewards collected. Size = L
    values - torch FloatTensor of value predictions. Size = L
    next_vals - torch FloatTensor of value predictions. Size = L
    dones - torch FloatTensor of done signals. Size = L
    gamma - float discount factor
    lambda_ - float gae moving average factor

    Returns
        advantages - torch FloatTensor of generalized advantage
            estimations. Size = L
    """
    deltas = rewards + gamma * next_vals * (1 - dones) - values
    return cuda_if(discount(deltas, dones, gamma * lambda_))
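# `discount` is assumed to be a project helper that computes a backward discounted
# cumulative sum which resets at episode boundaries; a minimal sketch consistent with
# how it is called here (and with the returns/advantage computation in update_model
# below):
import torch

def discount(vals, dones, discount_factor):
    """Backward discounted cumulative sum over a 1-D tensor, cut at done flags (sketch)."""
    discounted = torch.zeros_like(vals)
    running = 0.
    for i in reversed(range(len(vals))):
        running = vals[i] + discount_factor * running * (1. - dones[i])
        discounted[i] = running
    return discounted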
def rollout(self, net, idx, hyps):
    """
    rollout handles the actual rollout of the environment for n steps in
    time. It is called from run and performs a single rollout, placing the
    collected data into the shared lists found in the datas dict.

    net - torch Module object. This is the model to interact with the
        environment.
    idx - int identification number distinguishing the portion of the
        shared array designated for this runner
    hyps - dict object with all necessary hyperparameters
        Keys (Assume string type keys):
            "gamma" - reward decay coefficient
            "n_tsteps" - number of steps to be taken in the environment
            "n_frame_stack" - number of frames to stack for creation of
                the mdp state
            "preprocess" - function to preprocess raw observations
    """
    state = self.state_bookmark
    h = self.h_bookmark
    n_tsteps = hyps['n_tsteps']
    startx = idx * n_tsteps
    prev_val = None
    for i in range(n_tsteps):
        self.datas['states'][startx + i] = cuda_if(torch.FloatTensor(state))
        state_in = Variable(self.datas['states'][startx + i]).unsqueeze(0)
        if 'h_states' in self.datas:
            self.datas['h_states'][startx + i] = h.data[0]
            h_in = Variable(h.data)
            val, logits, h = net(state_in, h_in)
        else:
            val, logits = net(state_in)
        probs = F.softmax(logits, dim=-1)
        action = sample_action(probs.data)
        action = int(action.item())
        obs, rew, done, info = self.env.step(action + hyps['action_shift'])
        if hyps['render']:
            self.env.render()
        self.ep_rew += rew
        reset = done
        if "Pong" in hyps['env_type'] and rew != 0:
            done = True
        if done:
            self.rew_q.put(.99 * self.rew_q.get() + .01 * self.ep_rew)
            self.ep_rew = 0
            # Reset Recurrence
            if h is not None:
                h = Variable(cuda_if(torch.zeros(1, self.net.h_size)))

        self.datas['rewards'][startx + i] = rew
        self.datas['dones'][startx + i] = float(done)
        self.datas['actions'][startx + i] = action
        state = next_state(self.env,
                           self.obs_deque,
                           obs=obs,
                           reset=reset,
                           preprocess=hyps['preprocess'])
        if i > 0:
            prev_rew = self.datas['rewards'][startx + i - 1]
            prev_done = self.datas['dones'][startx + i - 1]
            delta = prev_rew + hyps['gamma'] * val.data * (1 - prev_done) - prev_val
            self.datas['deltas'][startx + i - 1] = delta
        prev_val = val.data.squeeze()

    # Funky bootstrapping
    endx = startx + n_tsteps - 1
    if not done:
        state_in = Variable(cuda_if(torch.FloatTensor(state))).unsqueeze(0)
        if 'h_states' in self.datas:
            val, logits, _ = net(state_in, Variable(h.data))
        else:
            val, logits = net(state_in)
        self.datas['rewards'][endx] += hyps['gamma'] * val.squeeze()  # Bootstrap
        self.datas['dones'][endx] = 1.
    self.datas['deltas'][endx] = self.datas['rewards'][endx] - prev_val

    self.state_bookmark = state
    if h is not None:
        self.h_bookmark = h.data
cuda = torch.cuda.is_available() and not args.no_cuda

env_fns = []
for rank in range(args.num_workers):
    # bind rank as a default argument so each worker keeps its own rank/seed
    env_fns.append(lambda rank=rank: make_env(args.env_id, rank, args.seed + rank))
if args.render:
    venv = RenderSubprocVecEnv(env_fns, args.render_interval)
else:
    venv = SubprocVecEnv(env_fns)
venv = VecFrameStack(venv, 4)

test_env = make_env(args.env_id, 0, args.seed)
test_env = FrameStack(test_env, 4)

policy = {'cnn': AtariCNN}[args.arch](venv.action_space.n)
policy = cuda_if(policy, cuda)

optimizer = optim.Adam(policy.parameters())

if args.lr_func == 'linear':
    lr_func = lambda a: args.lr * (1. - a)
elif args.lr_func == 'constant':
    lr_func = lambda a: args.lr

if args.clip_func == 'linear':
    clip_func = lambda a: args.clip * (1. - a)
elif args.clip_func == 'constant':
    clip_func = lambda a: args.clip

algorithm = PPO(policy, venv,
def run(self, total_steps):
    """ Runs PPO

    Args:
        total_steps (int): total number of environment steps to run for
    """
    N = self.num_workers
    T = self.worker_steps
    E = self.opt_epochs
    A = self.venv.action_space.n

    while self.taken_steps < total_steps:
        progress = self.taken_steps / total_steps

        obs, rewards, masks, actions, steps = self.interact()
        ob_shape = obs.size()[2:]

        ep_reward = self.test()
        self.reward_histr.append(ep_reward)
        self.steps_histr.append(self.taken_steps)

        # statistic logic
        group_size = len(self.steps_histr) // self.plot_points
        if self.plot_reward and len(self.steps_histr) % (self.plot_points * 10) == 0 \
                and group_size >= 10:
            x_means, _, y_means, y_stds = \
                mean_std_groups(np.array(self.steps_histr),
                                np.array(self.reward_histr), group_size)
            fig = plt.figure()
            fig.set_size_inches(8, 6)
            plt.ticklabel_format(axis='x', style='sci', scilimits=(-2, 6))
            plt.errorbar(x_means, y_means, yerr=y_stds,
                         ecolor='xkcd:blue', fmt='xkcd:black',
                         capsize=5, elinewidth=1.5, mew=1.5, linewidth=1.5)
            plt.title('Training progress')
            plt.xlabel('Total steps')
            plt.ylabel('Episode reward')
            plt.savefig(self.plot_path, dpi=200)
            plt.clf()
            plt.close()
            plot_timer = 0

        # TEMP upgrade to support recurrence
        # compute advantages, returns with GAE
        obs_ = obs.view(((T + 1) * N, ) + ob_shape)
        obs_ = Variable(obs_)
        _, values = self.policy(obs_)
        values = values.view(T + 1, N, 1)
        advantages, returns = gae(rewards, masks, values, self.gamma, self.lambd)

        self.policy_old.load_state_dict(self.policy.state_dict())
        for e in range(E):
            self.policy.zero_grad()

            MB = steps // self.minibatch_steps
            b_obs = Variable(obs[:T].view((steps, ) + ob_shape))
            b_rewards = Variable(rewards.view(steps, 1))
            b_masks = Variable(masks.view(steps, 1))
            b_actions = Variable(actions.view(steps, 1))
            b_advantages = Variable(advantages.view(steps, 1))
            b_returns = Variable(returns.view(steps, 1))

            b_inds = np.arange(steps)
            np.random.shuffle(b_inds)

            for start in range(0, steps, self.minibatch_steps):
                mb_inds = b_inds[start:start + self.minibatch_steps]
                mb_inds = cuda_if(torch.from_numpy(mb_inds).long(), self.cuda)
                mb_obs, mb_rewards, mb_masks, mb_actions, mb_advantages, mb_returns = \
                    [arr[mb_inds] for arr in [b_obs, b_rewards, b_masks,
                                              b_actions, b_advantages, b_returns]]

                mb_pis, mb_vs = self.policy(mb_obs)
                mb_pi_olds, mb_v_olds = self.policy_old(mb_obs)
                mb_pi_olds, mb_v_olds = mb_pi_olds.detach(), mb_v_olds.detach()

                losses = self.objective(self.clip_func(progress),
                                        mb_pis, mb_vs,
                                        mb_pi_olds, mb_v_olds,
                                        mb_actions, mb_advantages, mb_returns)
                policy_loss, value_loss, entropy_loss = losses
                loss = policy_loss + value_loss * self.value_coef \
                    + entropy_loss * self.entropy_coef

                set_lr(self.optimizer, self.lr_func(progress))
                self.optimizer.zero_grad()
                loss.backward()
                torch.nn.utils.clip_grad_norm(self.policy.parameters(),
                                              self.max_grad_norm)
                self.optimizer.step()

        self.taken_steps += steps
        print(self.taken_steps)
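# `set_lr` is assumed to be a small helper that pushes the scheduled learning rate from
# `lr_func(progress)` into the optimizer before each minibatch step; a sketch under
# that assumption:
def set_lr(optimizer, lr):
    """Set the learning rate on every parameter group of a torch optimizer (sketch)."""
    for param_group in optimizer.param_groups:
        param_group['lr'] = lr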
def rollout(self, net, idx, hyps):
    """
    rollout handles the actual rollout of the environment for n steps in
    time. It is called from run and performs a single rollout, placing the
    collected data into the shared lists found in the datas dict.

    net - torch Module object. This is the model to interact with the
        environment.
    idx - int identification number distinguishing the portion of the
        shared array designated for this runner
    hyps - dict object with all necessary hyperparameters
        Keys (Assume string type keys):
            "gamma" - reward decay coefficient
            "n_tsteps" - number of steps to be taken in the environment
            "n_frame_stack" - number of frames to stack for creation of
                the mdp state
            "preprocess" - function to preprocess raw observations
    """
    net.eval()
    hyps = self.hyps
    state = self.state_bookmark
    n_tsteps = hyps['n_tsteps']
    is_recurrent = hasattr(net, "fresh_h")
    if not is_recurrent:
        h = None
    else:
        h = self.prev_h if self.prev_h is not None else net.fresh_h()
    startx = idx * n_tsteps
    for i in range(n_tsteps):
        self.datas['states'][startx + i] = cuda_if(torch.FloatTensor(state))
        if is_recurrent:
            self.datas["hs"][startx + i] = cuda_if(h.detach().data)
            val, logits, h = net(self.datas['states'][startx + i][None],
                                 h=self.datas['hs'][startx + i][None])
            self.datas["next_hs"][startx + i] = cuda_if(h.detach().data)
        else:
            val, logits = net(self.datas['states'][startx + i][None])
        if self.hyps['discrete_env']:
            probs = F.softmax(logits, dim=-1)
            action = sample_action(probs.data)
            action = int(action.item())
        else:
            mu, sig = logits
            action = mu + torch.randn_like(sig) * sig
            action = action.cpu().detach().numpy().squeeze()
            if len(action.shape) == 0:
                action = np.asarray([float(action)])
        obs, rew, done, info = self.env.step(action + hyps['action_shift'])
        if hyps['render']:
            self.env.render()
        self.ep_rew += rew
        self.datas['rews'][startx + i] = float(rew)
        reset = done
        if "Pong" in hyps['env_type'] and rew != 0:
            done = True
        if done:
            self.rew_q.put(.99 * self.rew_q.get() + .01 * self.ep_rew)
            self.ep_rew = 0
        self.datas['dones'][startx + i] = 0
        if isinstance(action, np.ndarray):
            action = cuda_if(torch.from_numpy(action))
        self.datas['actions'][startx + i] = action
        state = next_state(self.env,
                           self.obs_deque,
                           obs=obs,
                           reset=reset,
                           preprocess=hyps['preprocess'])
        if i > 0:
            self.datas['next_states'][startx + i - 1] = \
                self.datas['states'][startx + i]

    endx = startx + n_tsteps - 1
    self.datas['next_states'][endx] = cuda_if(torch.FloatTensor(state))
    self.datas['dones'][endx] = 1.
    self.state_bookmark = state
    if h is not None:
        self.prev_h = h.data
def update_model(self, shared_data):
    """
    This function accepts the data collected from a rollout and performs
    A2C update iterations on the neural net.

    shared_data - dict of torch tensors with shared memory to collect data.
        Each tensor contains indices from idx*n_tsteps to (idx+1)*n_tsteps
        Keys (assume string keys):
            "states" - MDP states at each timestep t
                type: FloatTensor
                shape: (n_states, *state_shape)
            "deltas" - gae deltas collected at timestep t+1
                type: FloatTensor
                shape: (n_states,)
            "h_states" - Recurrent states at timestep t+1
                type: FloatTensor
                shape: (n_states, h_size)
            "rewards" - Collects float rewards collected at each timestep t
                type: FloatTensor
                shape: (n_states,)
            "dones" - Collects the dones collected at each timestep t
                type: FloatTensor
                shape: (n_states,)
            "actions" - Collects actions performed at each timestep t
                type: LongTensor
                shape: (n_states,)
    """
    hyps = self.hyps
    net = self.net
    net.req_grads(True)

    states = shared_data['states']
    rewards = shared_data['rewards']
    dones = shared_data['dones']
    actions = shared_data['actions']
    deltas = shared_data['deltas']
    advs = cuda_if(discount(deltas.squeeze(), dones.squeeze(),
                            hyps['gamma'] * hyps['lambda_']))

    # Forward Pass
    if 'h_states' in shared_data:
        h_states = Variable(cuda_if(shared_data['h_states']))
        if hyps['use_bptt']:
            vals, logits = self.bptt(states, h_states, dones)
        else:
            vals, logits, _ = net(Variable(cuda_if(states)), h_states)
    else:
        vals, logits = net(Variable(cuda_if(states)))

    # Log Probabilities
    log_softs = F.log_softmax(logits, dim=-1)
    logprobs = log_softs[torch.arange(len(actions)).long(), actions]

    # Returns
    if hyps['use_nstep_rets']:
        returns = advs + vals.data.squeeze()
    else:
        returns = cuda_if(discount(rewards.squeeze(), dones.squeeze(),
                                   hyps['gamma']))

    # Advantages
    if hyps['norm_advs']:
        advs = (advs - advs.mean()) / (advs.std() + 1e-6)

    # A2C Losses
    pi_loss = -(logprobs.squeeze() * Variable(advs.squeeze())).mean()
    val_loss = hyps['val_coef'] * F.mse_loss(vals.squeeze(), returns)
    entr_loss = -hyps['entr_coef'] * ((log_softs * F.softmax(logits, dim=-1)).sum(-1)).mean()
    loss = pi_loss + val_loss - entr_loss

    loss.backward()
    self.norm = nn.utils.clip_grad_norm_(net.parameters(), hyps['max_norm'])
    self.optim.step()
    self.optim.zero_grad()

    self.info = {
        "Loss": loss.item(),
        "Pi_Loss": pi_loss.item(),
        "ValLoss": val_loss.item(),
        "Entropy": entr_loss.item(),
        "GradNorm": self.norm.item()
    }
    return self.info
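# `req_grads` is assumed to be a convenience method that toggles `requires_grad` on all
# of the network's parameters (mirroring the manual loop in `run` above); a sketch
# under that assumption:
def req_grads(self, requires_grad):
    """Enable or disable gradient collection for every parameter (sketch)."""
    for p in self.parameters():
        p.requires_grad = requires_grad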
def train(self, hyps):
    """
    hyps - dictionary of required hyperparameters
        type: dict
    """
    # Initial settings
    if "randomizeObjs" in hyps:
        assert False, "you mean randomizeObs, not randomizeObjs"
    if "audibleTargs" in hyps and hyps['audibleTargs'] > 0:
        hyps['aud_targs'] = True
        if verbose:
            print("Using audible targs!")
    countOut = try_key(hyps, 'countOut', 0)
    if countOut and not hyps['endAtOrigin']:
        assert False, "endAtOrigin must be true for countOut setting"

    # Print Hyperparameters To Screen
    items = list(hyps.items())
    for k, v in sorted(items):
        print(k+":", v)

    # Make Save Files
    if "save_folder" in hyps:
        save_folder = hyps['save_folder']
    else:
        save_folder = "./saved_data/"
    if not os.path.exists(save_folder):
        os.mkdir(save_folder)
    base_name = save_folder + hyps['exp_name']
    net_save_file = base_name+"_net.p"
    fwd_save_file = base_name+"_fwd.p"
    best_net_file = base_name+"_best.p"
    optim_save_file = base_name+"_optim.p"
    fwd_optim_file = base_name+"_fwdoptim.p"
    hyps['fwd_emb_file'] = base_name+"_fwdemb.p"
    if hyps['inv_model'] is not None:
        inv_save_file = base_name+"_invnet.p"
        reconinv_optim_file = base_name+"_reconinvoptim.p"
    else:
        inv_save_file = None
        reconinv_optim_file = None
    if hyps['recon_model'] is not None:
        recon_save_file = base_name+"_reconnet.p"
        reconinv_optim_file = base_name+"_reconinvoptim.p"
    else:
        recon_save_file = None
    log_file = base_name+"_log.txt"
    if hyps['resume']:
        log = open(log_file, 'a')
    else:
        log = open(log_file, 'w')
    for k, v in sorted(items):
        log.write(k+":"+str(v)+"\n")

    # Miscellaneous Variable Prep
    logger = Logger()
    shared_len = hyps['n_tsteps']*hyps['n_rollouts']
    float_params = dict()
    if "float_params" not in hyps:
        try:
            keys = hyps['game_keys']
            hyps['float_params'] = {k: try_key(hyps, k, 0) for k in keys}
            if "minObjLoc" not in hyps:
                hyps['float_params']["minObjLoc"] = 0.27
                hyps['float_params']["maxObjLoc"] = 0.73
            float_params = hyps['float_params']
        except:
            pass
    env = SeqEnv(hyps['env_type'], hyps['seed'],
                 worker_id=None,
                 float_params=float_params)
    hyps['discrete_env'] = hasattr(env.action_space, "n")
    obs = env.reset()
    prepped = hyps['preprocess'](obs)
    hyps['state_shape'] = [hyps['n_frame_stack']*prepped.shape[0],
                           *prepped.shape[1:]]
    if not hyps['discrete_env']:
        action_size = int(np.prod(env.action_space.shape))
    elif hyps['env_type'] == "Pong-v0":
        action_size = 3
    else:
        action_size = env.action_space.n
    hyps['action_shift'] = (4-action_size)*(hyps['env_type']=="Pong-v0")
    print("Obs Shape:", obs.shape)
    print("Prep Shape:", prepped.shape)
    print("State Shape:", hyps['state_shape'])
    print("Num Samples Per Update:", shared_len)
    if not (hyps['n_cache_refresh'] <= shared_len or hyps['cache_size'] == 0):
        hyps['n_cache_refresh'] = shared_len
    print("Samples Wasted in Update:", shared_len % hyps['batch_size'])
    try:
        env.close()
    except:
        pass
    del env

    # Prepare Shared Variables
    shared_data = {
        'states': torch.zeros(shared_len, *hyps['state_shape']).share_memory_(),
        'next_states': torch.zeros(shared_len, *hyps['state_shape']).share_memory_(),
        'dones': torch.zeros(shared_len).share_memory_(),
        'rews': torch.zeros(shared_len).share_memory_(),
        'hs': torch.zeros(shared_len, hyps['h_size']).share_memory_(),
        'next_hs': torch.zeros(shared_len, hyps['h_size']).share_memory_()}
    if hyps['discrete_env']:
        shared_data['actions'] = torch.zeros(shared_len).long().share_memory_()
    else:
        shape = (shared_len, action_size)
        shared_data['actions'] = torch.zeros(shape).float().share_memory_()
    shared_data = {k: cuda_if(v) for k, v in shared_data.items()}
    n_rollouts = hyps['n_rollouts']
    gate_q = mp.Queue(n_rollouts)
    stop_q = mp.Queue(n_rollouts)
    end_q = mp.Queue(1)
    reward_q = mp.Queue(1)
    reward_q.put(-1)

    # Make Runners
    runners = []
    for i in range(hyps['n_envs']):
        runner = Runner(shared_data, hyps, gate_q, stop_q, end_q, reward_q)
        runners.append(runner)

    # Make the Networks
    h_size = hyps['h_size']
    net = hyps['model'](hyps['state_shape'], action_size, h_size,
                        bnorm=hyps['use_bnorm'],
                        lnorm=hyps['use_lnorm'],
                        discrete_env=hyps['discrete_env'])
    # Fwd Dynamics
    hyps['is_recurrent'] = hasattr(net, "fresh_h")
    intl_size = h_size + action_size + hyps['is_recurrent']*h_size
    # optionally prepend a LayerNorm to the forward-dynamics MLP
    block = []
    if hyps['fwd_lnorm']:
        block = [nn.LayerNorm(intl_size)]
    block += [nn.Linear(intl_size, h_size), nn.ReLU(),
              nn.Linear(h_size, h_size), nn.ReLU(),
              nn.Linear(h_size, h_size)]
    fwd_net = nn.Sequential(*block)
    # Allows us to argue an h vector along with embedding to forward func
    if hyps['is_recurrent']:
        fwd_net = CatModule(fwd_net)
    if hyps['ensemble']:
        fwd_net = Ensemble(fwd_net)
    fwd_net = cuda_if(fwd_net)

    if hyps['inv_model'] is not None:
        inv_net = hyps['inv_model'](h_size, action_size)
        inv_net = cuda_if(inv_net)
    else:
        inv_net = None
    if hyps['recon_model'] is not None:
        recon_net = hyps['recon_model'](emb_size=h_size,
                                        img_shape=hyps['state_shape'],
                                        fwd_bnorm=hyps['fwd_bnorm'],
                                        deconv_ksizes=hyps['recon_ksizes'])
        recon_net = cuda_if(recon_net)
    else:
        recon_net = None
    if hyps['resume']:
        net.load_state_dict(torch.load(net_save_file))
        fwd_net.load_state_dict(torch.load(fwd_save_file))
        if inv_net is not None:
            inv_net.load_state_dict(torch.load(inv_save_file))
        if recon_net is not None:
            recon_net.load_state_dict(torch.load(recon_save_file))
    base_net = copy.deepcopy(net)
    net = cuda_if(net)
    net.share_memory()
    base_net = cuda_if(base_net)
    hyps['is_recurrent'] = hasattr(net, "fresh_h")

    # Start Data Collection
    print("Making New Processes")
    procs = []
    for i in range(len(runners)):
        proc = mp.Process(target=runners[i].run, args=(net,))
        procs.append(proc)
        proc.start()
        print(i, "/", len(runners), end='\r')
    for i in range(n_rollouts):
        gate_q.put(i)

    # Make Updater
    updater = Updater(base_net, fwd_net, hyps, inv_net, recon_net)
    if hyps['resume']:
        updater.optim.load_state_dict(torch.load(optim_save_file))
        updater.fwd_optim.load_state_dict(torch.load(fwd_optim_file))
        if inv_net is not None:
            updater.reconinv_optim.load_state_dict(torch.load(reconinv_optim_file))
    updater.optim.zero_grad()
    updater.net.train(mode=True)
    updater.net.req_grads(True)

    # Prepare Decay Precursors
    entr_coef_diff = hyps['entr_coef'] - hyps['entr_coef_low']
    epsilon_diff = hyps['epsilon'] - hyps['epsilon_low']
    lr_diff = hyps['lr'] - hyps['lr_low']
    gamma_diff = hyps['gamma_high'] - hyps['gamma']

    # Training Loop
    past_rews = deque([0]*hyps['n_past_rews'])
    last_avg_rew = 0
    best_rew_diff = 0
    best_avg_rew = -10000
    best_eval_rew = -10000
    ep_eval_rew = 0
    eval_rew = 0
    epoch = 0
    done_count = 0
    T = 0
    try:
        while T < hyps['max_tsteps']:
            basetime = time.time()
            epoch += 1

            # Collect data
            for i in range(n_rollouts):
                stop_q.get()
            T += shared_len

            # Reward Stats
            avg_reward = reward_q.get()
            reward_q.put(avg_reward)
            last_avg_rew = avg_reward
            done_count += shared_data['dones'].sum().item()
            new_best = False
            if avg_reward > best_avg_rew and done_count > n_rollouts:
                new_best = True
                best_avg_rew = avg_reward
                updater.save_model(best_net_file, fwd_save_file, None, None)
            eval_rew = shared_data['rews'].mean()
            if eval_rew > best_eval_rew:
                best_eval_rew = eval_rew
                save_names = [net_save_file, fwd_save_file, optim_save_file,
                              fwd_optim_file, inv_save_file,
                              recon_save_file, reconinv_optim_file]
                for i in range(len(save_names)):
                    if save_names[i] is not None:
                        splt = save_names[i].split(".")
                        splt[0] = splt[0]+"_best"
                        save_names[i] = ".".join(splt)
                updater.save_model(*save_names)
            s = "EvalRew: {:.5f} | BestEvalRew: {:.5f}"
            print(s.format(eval_rew, best_eval_rew))

            # Calculate the Loss and Update nets
            updater.update_model(shared_data)
            net.load_state_dict(updater.net.state_dict())  # update all collector nets

            # Resume Data Collection
            for i in range(n_rollouts):
                gate_q.put(i)

            # Decay HyperParameters
            if hyps['decay_eps']:
                updater.epsilon = (1-T/(hyps['max_tsteps']))*epsilon_diff + hyps['epsilon_low']
                print("New Eps:", updater.epsilon)
            if hyps['decay_lr']:
                new_lr = (1-T/(hyps['max_tsteps']))*lr_diff + hyps['lr_low']
                updater.new_lr(new_lr)
                print("New lr:", new_lr)
            if hyps['decay_entr']:
                updater.entr_coef = entr_coef_diff*(1-T/(hyps['max_tsteps'])) + hyps['entr_coef_low']
                print("New Entr:", updater.entr_coef)
            if hyps['incr_gamma']:
                updater.gamma = gamma_diff*(T/(hyps['max_tsteps'])) + hyps['gamma']
                print("New Gamma:", updater.gamma)

            # Periodically save model
            if epoch % 10 == 0 or epoch == 1:
                updater.save_model(net_save_file, fwd_save_file,
                                   optim_save_file, fwd_optim_file,
                                   inv_save_file, recon_save_file,
                                   reconinv_optim_file)

            # Print Epoch Data
            past_rews.popleft()
            past_rews.append(avg_reward)
            max_rew, min_rew = deque_maxmin(past_rews)
            print("Epoch", epoch, "– T =", T, "-- Folder:", base_name)
            if not hyps['discrete_env']:
                s = ("{:.5f} | "*net.logsigs.shape[1])
                s = s.format(*[x.item() for x in torch.exp(net.logsigs[0])])
                print("Sigmas:", s)
            updater.print_statistics()
            avg_action = shared_data['actions'].float().mean().item()
            print("Grad Norm:", float(updater.norm),
                  "– Avg Action:", avg_action,
                  "– Best AvgRew:", best_avg_rew)
            print("Avg Rew:", avg_reward,
                  "– High:", max_rew,
                  "– Low:", min_rew, end='\n')
            updater.log_statistics(log, T, avg_reward, avg_action, best_avg_rew)
            updater.info['AvgRew'] = avg_reward
            updater.info['EvalRew'] = eval_rew
            logger.append(updater.info, x_val=T)

            # Check for memory leaks
            gc.collect()
            max_mem_used = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
            print("Time:", time.time()-basetime)
            if 'hyp_search_count' in hyps and hyps['hyp_search_count'] > 0 \
                    and hyps['search_id'] != None:
                print("Search:", hyps['search_id'], "/", hyps['hyp_search_count'])
            print("Memory Used: {:.2f} memory\n".format(max_mem_used / 1024))
            if updater.info["VLoss"] == float('inf') or updater.norm == float('inf'):
                break
    except KeyboardInterrupt:
        pass

    end_q.put(1)
    time.sleep(1)
    logger.make_plots(base_name)
    log.write("\nBestRew:"+str(best_avg_rew))
    log.close()
    # Close processes
    for p in procs:
        p.terminate()
    return best_avg_rew
if hyps['env_type'] == "Pong-v0":
    action_size = 3
else:
    action_size = env.action_space.n
hyps['action_shift'] = (4 - action_size) * (hyps['env_type'] == "Pong-v0")
print("Obs Shape:", obs.shape)
print("Prep Shape:", prepped.shape)
print("State Shape:", hyps['state_shape'])
del env

# Make Network
net = hyps['model'](hyps['state_shape'],
                    action_size,
                    h_size=hyps['h_size'],
                    bnorm=hyps['use_bnorm'])
net.load_state_dict(torch.load(file_name))
net = cuda_if(net)

# Prepare Shared Variables
shared_len = hyps['n_tsteps']
shared_data = {
    'states': cuda_if(torch.zeros(shared_len, *hyps['state_shape']).share_memory_()),
    'deltas': cuda_if(torch.zeros(shared_len).share_memory_()),
    'rewards': cuda_if(torch.zeros(shared_len).share_memory_()),
    'actions': torch.zeros(shared_len).long().share_memory_(),
    'dones': cuda_if(torch.zeros(shared_len).share_memory_())
}
                    default='ep_reward.png',
                    help='path to save reward plot to')
parser.add_argument('--seed', type=int, default=0, help='random seed')
args = parser.parse_args()

set_seed(args.seed)
cuda = torch.cuda.is_available() and not args.no_cuda

test_env = make_env(args.env_id, 0, args.seed)
test_env = FrameStack(test_env, 4)

policy = {'cnn': AtariCNN}[args.arch](venv.action_space.n)
checkpoint = torch.load("./save/PPO_" + self.env_name + ".pt")
policy.load_check_point(checkpoint["policy"])
policy = cuda_if(policy, cuda)

ob = self.test_env.reset()
done = False
ep_reward = 0
last_action = np.array([-1])
action_repeat = 0
while not done:
    ob = np.array(ob)
    ob = torch.from_numpy(ob.transpose((2, 0, 1))).float().unsqueeze(0)
    ob = Variable(ob / 255., volatile=True)
    ob = cuda_if(ob, self.cuda)
    pi, v = policy(ob)
    _, action = torch.max(pi, dim=1)
def train(self, hyps):
    """
    hyps - dictionary of required hyperparameters
        type: dict
    """
    # Print Hyperparameters To Screen
    items = list(hyps.items())
    for k, v in sorted(items):
        print(k + ":", v)

    # Make Save Files
    if "save_folder" in hyps:
        save_folder = hyps['save_folder']
    else:
        save_folder = "./saved_data/"
    if not os.path.exists(save_folder):
        os.mkdir(save_folder)
    base_name = save_folder + hyps['exp_name']
    net_save_file = base_name + "_net.p"
    best_net_file = base_name + "_best.p"
    optim_save_file = base_name + "_optim.p"
    log_file = base_name + "_log.txt"
    if hyps['resume']:
        log = open(log_file, 'a')
    else:
        log = open(log_file, 'w')
    for k, v in sorted(items):
        log.write(k + ":" + str(v) + "\n")

    # Miscellaneous Variable Prep
    logger = Logger()
    shared_len = hyps['n_tsteps'] * hyps['n_rollouts']
    env = gym.make(hyps['env_type'])
    obs = env.reset()
    prepped = hyps['preprocess'](obs)
    hyps['state_shape'] = [hyps['n_frame_stack']] + [*prepped.shape[1:]]
    if hyps['env_type'] == "Pong-v0":
        action_size = 3
    else:
        action_size = env.action_space.n
    hyps['action_shift'] = (4 - action_size) * (hyps['env_type'] == "Pong-v0")
    print("Obs Shape:", obs.shape)
    print("Prep Shape:", prepped.shape)
    print("State Shape:", hyps['state_shape'])
    print("Num Samples Per Update:", shared_len)
    del env

    # Make Network
    net = hyps['model'](hyps['state_shape'],
                        action_size,
                        h_size=hyps['h_size'],
                        bnorm=hyps['use_bnorm'])
    if hyps['resume']:
        net.load_state_dict(torch.load(net_save_file))
    base_net = copy.deepcopy(net)
    net = cuda_if(net)
    net.share_memory()
    base_net = cuda_if(base_net)

    # Prepare Shared Variables
    shared_data = {
        'states': cuda_if(torch.zeros(shared_len, *hyps['state_shape']).share_memory_()),
        'deltas': cuda_if(torch.zeros(shared_len).share_memory_()),
        'rewards': cuda_if(torch.zeros(shared_len).share_memory_()),
        'actions': torch.zeros(shared_len).long().share_memory_(),
        'dones': cuda_if(torch.zeros(shared_len).share_memory_())
    }
    if net.is_recurrent:
        shared_data['h_states'] = cuda_if(
            torch.zeros(shared_len, net.h_size).share_memory_())
    n_rollouts = hyps['n_rollouts']
    gate_q = mp.Queue(n_rollouts)
    stop_q = mp.Queue(n_rollouts)
    reward_q = mp.Queue(1)
    reward_q.put(-1)

    # Make Runners
    runners = []
    for i in range(hyps['n_envs']):
        runner = Runner(shared_data, hyps, gate_q, stop_q, reward_q)
        runners.append(runner)

    # Start Data Collection
    print("Making New Processes")
    procs = []
    for i in range(len(runners)):
        proc = mp.Process(target=runners[i].run, args=(net,))
        procs.append(proc)
        proc.start()
        print(i, "/", len(runners), end='\r')
    for i in range(n_rollouts):
        gate_q.put(i)

    # Make Updater
    updater = Updater(base_net, hyps)
    if hyps['resume']:
        updater.optim.load_state_dict(torch.load(optim_save_file))
    updater.optim.zero_grad()
    updater.net.train(mode=True)
    updater.net.req_grads(True)

    # Prepare Decay Precursors
    entr_coef_diff = hyps['entr_coef'] - hyps['entr_coef_low']
    lr_diff = hyps['lr'] - hyps['lr_low']
    gamma_diff = hyps['gamma_high'] - hyps['gamma']

    # Training Loop
    past_rews = deque([0] * hyps['n_past_rews'])
    last_avg_rew = 0
    best_avg_rew = -100
    epoch = 0
    T = 0
    while T < hyps['max_tsteps']:
        basetime = time.time()
        epoch += 1

        # Collect data
        for i in range(n_rollouts):
            stop_q.get()
        T += shared_len

        # Reward Stats
        avg_reward = reward_q.get()
        reward_q.put(avg_reward)
        last_avg_rew = avg_reward
        if avg_reward > best_avg_rew:
            best_avg_rew = avg_reward
            updater.save_model(best_net_file, None)

        # Calculate the Loss and Update nets
        updater.update_model(shared_data)
        net.load_state_dict(updater.net.state_dict())  # update all collector nets

        # Resume Data Collection
        for i in range(n_rollouts):
            gate_q.put(i)

        # Decay HyperParameters
        if hyps['decay_lr']:
            decay_factor = max((1 - T / (hyps['max_tsteps'])), 0)
            new_lr = decay_factor * lr_diff + hyps['lr_low']
            updater.new_lr(new_lr)
            print("New lr:", new_lr)
        if hyps['decay_entr']:
            decay_factor = max((1 - T / (hyps['max_tsteps'])), 0)
            updater.entr_coef = entr_coef_diff * decay_factor + hyps['entr_coef_low']
            print("New Entr:", updater.entr_coef)

        # Periodically save model
        if epoch % 10 == 0:
            updater.save_model(net_save_file, optim_save_file)

        # Print Epoch Data
        past_rews.popleft()
        past_rews.append(avg_reward)
        max_rew, min_rew = deque_maxmin(past_rews)
        rew_avg, rew_std = np.mean(past_rews), np.std(past_rews)
        updater.print_statistics()
        avg_action = shared_data['actions'].float().mean().item()
        print("Epoch", epoch, "– T =", T)
        print("Grad Norm:", float(updater.norm),
              "– Avg Action:", avg_action,
              "– Best AvgRew:", best_avg_rew)
        print("Avg Rew:", avg_reward)
        print("Past " + str(hyps['n_past_rews']) + " Rews – High:", max_rew,
              "– Low:", min_rew, "– Avg:", rew_avg, "– StD:", rew_std)
        updater.log_statistics(log, T, avg_reward, avg_action, best_avg_rew)
        updater.info['AvgRew'] = avg_reward
        logger.append(updater.info, x_val=T)

        # Check for memory leaks
        gc.collect()
        max_mem_used = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
        print("Time:", time.time() - basetime)
        if 'hyp_search_count' in hyps and hyps['hyp_search_count'] > 0 \
                and hyps['search_id'] != None:
            print("Search:", hyps['search_id'], "/", hyps['hyp_search_count'])
        print("Memory Used: {:.2f} memory\n".format(max_mem_used / 1024))

    logger.make_plots(base_name)
    log.write("\nBestRew:" + str(best_avg_rew))
    log.close()

    # Close processes
    for p in procs:
        p.terminate()
    return best_avg_rew
prepped = hyps['preprocess'](obs)
hyps['state_shape'] = [hyps['n_frame_stack']] + [*prepped.shape[1:]]
if hyps['env_type'] == "Pong-v0":
    action_size = 3
else:
    action_size = env.action_space.n
hyps['action_shift'] = (4-action_size)*(hyps['env_type']=="Pong-v0")
print("Obs Shape:", obs.shape)
print("Prep Shape:", prepped.shape)
print("State Shape:", hyps['state_shape'])
del env

# Make Network
net = hyps['model'](hyps['state_shape'],
                    action_size,
                    h_size=hyps['h_size'],
                    bnorm=hyps['use_bnorm'])
net.load_state_dict(torch.load(file_name))
net = cuda_if(net)

# Prepare Shared Variables
shared_len = hyps['n_tsteps']
shared_data = {
    'states': cuda_if(torch.zeros(shared_len, *hyps['state_shape']).share_memory_()),
    'next_states': cuda_if(torch.zeros(shared_len, *hyps['state_shape']).share_memory_()),
    'rewards': cuda_if(torch.zeros(shared_len).share_memory_()),
    'actions': torch.zeros(shared_len).long().share_memory_(),
    'dones': cuda_if(torch.zeros(shared_len).share_memory_())}

gate_q = mp.Queue(1)
stop_q = mp.Queue(1)
reward_q = mp.Queue(1)
reward_q.put(-1)

# Make Runner
runner = Runner(shared_data, hyps, gate_q, stop_q, reward_q)