def learn(self, D, **kwargs):
    # Compute all metrics, D: list of Trajectory
    logprobs = [torch.cat(traj.get_all_info('action_logprob')) for traj in D]
    entropies = [torch.cat(traj.get_all_info('entropy')) for traj in D]
    Vs = [torch.cat(traj.get_all_info('V')) for traj in D]
    with torch.no_grad():
        last_observations = tensorify(np.concatenate([traj.last_observation for traj in D], 0), self.device)
        last_Vs = self.value(last_observations).squeeze(-1)
    Qs = [bootstrapped_returns(self.config['agent.gamma'], traj, last_V)
          for traj, last_V in zip(D, last_Vs)]
    As = [gae(self.config['agent.gamma'], self.config['agent.gae_lambda'], traj, V, last_V)
          for traj, V, last_V in zip(D, Vs, last_Vs)]

    # Metrics -> Tensor, device
    logprobs, entropies, Vs = map(lambda x: torch.cat(x).squeeze(), [logprobs, entropies, Vs])
    Qs, As = map(lambda x: tensorify(np.concatenate(x).copy(), self.device), [Qs, As])
    if self.config['agent.standardize_adv']:
        As = (As - As.mean()) / (As.std() + 1e-8)
    assert all([x.ndimension() == 1 for x in [logprobs, entropies, Vs, Qs, As]])

    dataset = Dataset(D, logprobs, entropies, Vs, Qs, As)
    dataloader = DataLoader(dataset, self.config['train.batch_size'], shuffle=True)
    for epoch in range(self.config['train.num_epochs']):
        logs = [self.learn_one_update(data) for data in dataloader]

    self.total_timestep += sum([len(traj) for traj in D])
    out = {}
    if self.config['agent.use_lr_scheduler']:
        out['current_lr'] = self.policy_lr_scheduler.get_lr()
    out['policy_grad_norm'] = np.mean([item['policy_grad_norm'] for item in logs])
    out['value_grad_norm'] = np.mean([item['value_grad_norm'] for item in logs])
    out['policy_loss'] = np.mean([item['policy_loss'] for item in logs])
    out['policy_entropy'] = np.mean([item['policy_entropy'] for item in logs])
    out['value_loss'] = np.mean([item['value_loss'] for item in logs])
    out['explained_variance'] = np.mean([item['explained_variance'] for item in logs])
    out['approx_kl'] = np.mean([item['approx_kl'] for item in logs])
    out['clip_frac'] = np.mean([item['clip_frac'] for item in logs])
    return out
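# The epoch loop above delegates each minibatch to `self.learn_one_update`, which is not
# shown here. Below is a minimal, self-contained sketch of a PPO clipped-surrogate update
# that produces the metric keys aggregated above. The function name, arguments, and default
# hyperparameters are assumptions for illustration, not the actual implementation; `policy`
# is assumed to return a torch distribution and `value` a state-value tensor.
import torch
import torch.nn as nn
import torch.nn.functional as F


def ppo_update_sketch(policy, value, policy_optimizer, value_optimizer,
                      observations, actions, old_logprobs, Qs, As,
                      clip_range=0.2, max_grad_norm=0.5):
    # Log-probabilities and entropy under the current policy
    action_dist = policy(observations)
    logprobs = action_dist.log_prob(actions)
    entropies = action_dist.entropy()

    # Clipped surrogate objective
    ratio = torch.exp(logprobs - old_logprobs)
    surr1 = ratio * As
    surr2 = torch.clamp(ratio, 1.0 - clip_range, 1.0 + clip_range) * As
    policy_loss = -torch.min(surr1, surr2).mean()
    policy_optimizer.zero_grad()
    policy_loss.backward()
    policy_grad_norm = float(nn.utils.clip_grad_norm_(policy.parameters(), max_grad_norm))
    policy_optimizer.step()

    # Value regression towards bootstrapped returns
    Vs = value(observations).squeeze(-1)
    value_loss = F.mse_loss(Vs, Qs)
    value_optimizer.zero_grad()
    value_loss.backward()
    value_grad_norm = float(nn.utils.clip_grad_norm_(value.parameters(), max_grad_norm))
    value_optimizer.step()

    # Diagnostics
    with torch.no_grad():
        approx_kl = (old_logprobs - logprobs).mean().item()
        clip_frac = ((ratio - 1.0).abs() > clip_range).float().mean().item()
        explained_variance = (1.0 - torch.var(Qs - Vs) / torch.var(Qs)).item()
    return {'policy_grad_norm': policy_grad_norm,
            'value_grad_norm': value_grad_norm,
            'policy_loss': policy_loss.item(),
            'policy_entropy': entropies.mean().item(),
            'value_loss': value_loss.item(),
            'explained_variance': explained_variance,
            'approx_kl': approx_kl,
            'clip_frac': clip_frac}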
def sample(self, batch_size):
    idx = np.random.randint(0, self.size, size=batch_size)
    return list(map(lambda x: tensorify(x, self.device),
                    [self.observations[idx], self.actions[idx], self.rewards[idx],
                     self.next_observations[idx], self.masks[idx]]))
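# The buffer attributes used above (`observations`, `actions`, `rewards`,
# `next_observations`, `masks`, `size`) are defined elsewhere. A minimal sketch of a
# circular replay buffer that such a `sample` method could pair with; the class name,
# constructor arguments, and storage layout are assumptions for illustration only.
import numpy as np


class ReplayBufferSketch:
    def __init__(self, capacity, obs_dim, action_dim, device):
        self.capacity = capacity
        self.device = device
        self.observations = np.zeros((capacity, obs_dim), dtype=np.float32)
        self.actions = np.zeros((capacity, action_dim), dtype=np.float32)
        self.rewards = np.zeros(capacity, dtype=np.float32)
        self.next_observations = np.zeros((capacity, obs_dim), dtype=np.float32)
        self.masks = np.zeros(capacity, dtype=np.float32)  # 0.0 at terminal transitions
        self.pointer = 0
        self.size = 0

    def add(self, observation, action, reward, next_observation, terminal):
        idx = self.pointer
        self.observations[idx] = observation
        self.actions[idx] = action
        self.rewards[idx] = reward
        self.next_observations[idx] = next_observation
        self.masks[idx] = 0.0 if terminal else 1.0
        # Overwrite the oldest entries once the buffer is full
        self.pointer = (self.pointer + 1) % self.capacity
        self.size = min(self.size + 1, self.capacity)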
def learn(self, D, **kwargs):
    logprobs = [torch.cat(traj.get_infos('action_logprob')) for traj in D]
    entropies = [torch.cat(traj.get_infos('entropy')) for traj in D]
    Vs = [torch.cat(traj.get_infos('V')) for traj in D]
    last_Vs = [traj.extra_info['last_info']['V'] for traj in D]
    Qs = [bootstrapped_returns(self.config['agent.gamma'], traj.rewards, last_V, traj.reach_terminal)
          for traj, last_V in zip(D, last_Vs)]
    As = [gae(self.config['agent.gamma'], self.config['agent.gae_lambda'], traj.rewards, V, last_V, traj.reach_terminal)
          for traj, V, last_V in zip(D, Vs, last_Vs)]

    # Metrics -> Tensor, device
    logprobs, entropies, Vs = map(lambda x: torch.cat(x).squeeze(), [logprobs, entropies, Vs])
    Qs, As = map(lambda x: tensorify(np.concatenate(x).copy(), self.device), [Qs, As])
    if self.config['agent.standardize_adv']:
        As = (As - As.mean()) / (As.std() + 1e-4)
    assert all([x.ndim == 1 for x in [logprobs, entropies, Vs, Qs, As]])

    # Loss
    policy_loss = -logprobs * As.detach()
    entropy_loss = -entropies
    value_loss = F.mse_loss(Vs, Qs, reduction='none')
    loss = policy_loss + self.config['agent.value_coef'] * value_loss + self.config['agent.entropy_coef'] * entropy_loss
    loss = loss.mean()

    self.optimizer.zero_grad()
    loss.backward()
    grad_norm = nn.utils.clip_grad_norm_(self.parameters(), self.config['agent.max_grad_norm'])
    self.optimizer.step()
    if self.config['agent.use_lr_scheduler']:
        self.lr_scheduler.step(self.total_timestep)
    self.total_timestep += sum([traj.T for traj in D])

    out = {}
    if self.config['agent.use_lr_scheduler']:
        out['current_lr'] = self.lr_scheduler.get_lr()
    out['loss'] = loss.item()
    out['grad_norm'] = grad_norm
    out['policy_loss'] = policy_loss.mean().item()
    out['entropy_loss'] = entropy_loss.mean().item()
    out['policy_entropy'] = -out['entropy_loss']
    out['value_loss'] = value_loss.mean().item()
    out['V'] = describe(numpify(Vs, 'float').squeeze(), axis=-1, repr_indent=1, repr_prefix='\n')
    out['explained_variance'] = ev(y_true=numpify(Qs, 'float'), y_pred=numpify(Vs, 'float'))
    return out
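# `bootstrapped_returns` above is provided by the surrounding library and is not shown.
# A minimal sketch of the quantity it computes -- discounted returns that bootstrap from
# the value estimate of the final observation when the trajectory did not terminate --
# assuming `rewards` is a sequence of floats and `last_V` a scalar value estimate.
import numpy as np


def bootstrapped_returns_sketch(gamma, rewards, last_V, reach_terminal):
    # Do not bootstrap beyond a true terminal state.
    R = 0.0 if reach_terminal else float(last_V)
    out = []
    for r in reversed(rewards):
        R = r + gamma * R
        out.append(R)
    return np.array(out[::-1], dtype=np.float32)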
def run(config, seed, device, logdir):
    set_global_seeds(seed)
    print('Initializing...')
    agent = Agent(config, make_env(config, seed), device)
    es = CMAES([config['train.mu0']] * agent.num_params, config['train.std0'],
               {'popsize': config['train.popsize'], 'seed': seed})
    train_logs = []
    checkpoint_count = 0
    with ProcessPoolExecutor(max_workers=config['train.popsize'],
                             initializer=initializer,
                             initargs=(config, seed, device)) as executor:
        print('Finish initialization. Training starts...')
        for generation in range(config['train.generations']):
            start_time = time.perf_counter()
            solutions = es.ask()
            out = list(executor.map(fitness, solutions, chunksize=2))
            Rs, Hs = zip(*out)
            es.tell(solutions, [-R for R in Rs])
            logger = Logger()
            logger('generation', generation + 1)
            logger('num_seconds', round(time.perf_counter() - start_time, 1))
            logger('Returns', describe(Rs, axis=-1, repr_indent=1, repr_prefix='\n'))
            logger('Horizons', describe(Hs, axis=-1, repr_indent=1, repr_prefix='\n'))
            logger('fbest', es.result.fbest)
            train_logs.append(logger.logs)
            if generation == 0 or (generation + 1) % config['log.freq'] == 0:
                logger.dump(keys=None, index=0, indent=0, border='-' * 50)
            if (generation + 1) >= int(config['train.generations'] * (checkpoint_count / (config['checkpoint.num'] - 1))):
                agent.from_vec(tensorify(es.result.xbest, 'cpu'))
                agent.checkpoint(logdir, generation + 1)
                checkpoint_count += 1
    pickle_dump(obj=train_logs, f=logdir / 'train_logs', ext='.pkl')
    return None
def choose_action(self, x, **kwargs):
    obs = tensorify(x.observation, self.device).unsqueeze(0)
    features = self.feature_network(obs)
    action_dist = self.action_head(features)
    action = action_dist.sample()
    out = {}
    out['raw_action'] = numpify(action, self.env.action_space.dtype).squeeze(0)
    return out
def choose_action(self, x, **kwargs):
    obs = tensorify(x.observation, self.device).unsqueeze(0)
    with torch.no_grad():
        if kwargs['mode'] == 'train':
            action = numpify(self.actor(obs).sample(), 'float')
        elif kwargs['mode'] == 'eval':
            action = numpify(torch.tanh(self.actor.mean_forward(obs)), 'float')
    out = {}
    out['raw_action'] = action.squeeze(0)
    return out
def choose_action(self, obs, **kwargs):
    obs = tensorify(obs, self.device)
    out = {}
    features = self.feature_network(obs)
    action_dist = self.action_head(features)
    out['entropy'] = action_dist.entropy()
    action = action_dist.sample()
    out['raw_action'] = numpify(action, 'float')
    return out
def test_tensorify():
    # tensor
    x = torch.tensor(2.43)
    y = tensorify(x, 'cpu')
    assert torch.equal(x, y)
    del x, y
    x = torch.randn(10)
    y = tensorify(x, 'cpu')
    assert torch.equal(x, y)
    del x, y
    x = torch.randn(10, 20, 30)
    y = tensorify(x, 'cpu')
    assert torch.equal(x, y)
    del x, y

    # ndarray
    x = np.array(2.43)
    y = tensorify(x, 'cpu')
    assert np.allclose(x, y.item())
    del x, y
    x = np.random.randn(10)
    y = tensorify(x, 'cpu')
    assert np.allclose(x, y)
    del x, y
    x = np.random.randn(10, 20, 30)
    y = tensorify(x, 'cpu')
    assert np.allclose(x, y)
    del x, y

    # raw list
    x = [2.43]
    y = tensorify(x, 'cpu')
    assert np.allclose(x, y.item())
    del x, y
    x = [1, 2, 3, 4, 5, 6]
    y = tensorify(x, 'cpu')
    assert np.allclose(x, y)
    del x, y
    x = [[1, 2], [3, 4], [5, 6]]
    y = tensorify(x, 'cpu')
    assert np.allclose(x, y)
    del x, y
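# `tensorify` itself is imported from the surrounding library. A minimal sketch of the
# contract the test above exercises -- tensors pass through (moved to the target device),
# ndarrays and nested lists become float tensors. Treat this as an illustration of the
# expected behavior, not the actual implementation.
import numpy as np
import torch


def tensorify_sketch(x, device):
    if torch.is_tensor(x):
        return x.to(device)
    return torch.from_numpy(np.asarray(x, dtype=np.float32)).to(device)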
def choose_action(self, obs, **kwargs):
    obs = tensorify(obs, self.device)
    with torch.no_grad():
        action = numpify(self.actor(obs), 'float')
    if kwargs['mode'] == 'train':
        eps = np.random.normal(0.0, self.config['agent.action_noise'], size=action.shape)
        action = np.clip(action + eps, self.env.action_space.low, self.env.action_space.high)
    out = {}
    out['action'] = action
    return out
def fitness(data):
    torch.set_num_threads(1)  # VERY IMPORTANT TO AVOID GETTING STUCK
    config, seed, device, param = data
    env = make_env(config, seed, 'train')
    agent = Agent(config, env, device)
    agent.from_vec(tensorify(param, 'cpu'))
    runner = EpisodeRunner()
    with torch.no_grad():
        D = runner(agent, env, 10)
    R = np.mean([sum(traj.rewards) for traj in D])
    H = np.mean([traj.T for traj in D])
    return R, H
def choose_action(self, x, **kwargs):
    obs = tensorify(x.observation, self.device).unsqueeze(0)
    features = self.feature_network(obs)
    action_dist = self.action_head(features)
    V = self.V_head(features)
    action = action_dist.sample()
    out = {}
    out['action_dist'] = action_dist
    out['V'] = V
    out['entropy'] = action_dist.entropy()
    out['action'] = action
    out['raw_action'] = numpify(action, self.env.action_space.dtype).squeeze(0)
    out['action_logprob'] = action_dist.log_prob(action.detach())
    return out
def run(config, seed, device, logdir):
    set_global_seeds(seed)
    torch.set_num_threads(1)  # VERY IMPORTANT TO AVOID GETTING STUCK
    print('Initializing...')
    agent = Agent(config, make_env(config, seed, 'eval'), device)
    es = OpenAIES([config['train.mu0']] * agent.num_params, config['train.std0'],
                  {'popsize': config['train.popsize'],
                   'seed': seed,
                   'sigma_scheduler_args': config['train.sigma_scheduler_args'],
                   'lr': config['train.lr'],
                   'lr_decay': config['train.lr_decay'],
                   'min_lr': config['train.min_lr'],
                   'antithetic': config['train.antithetic'],
                   'rank_transform': config['train.rank_transform']})
    train_logs = []
    checkpoint_count = 0
    with Pool(processes=config['train.popsize'] // config['train.worker_chunksize']) as pool:
        print('Finish initialization. Training starts...')
        for generation in range(config['train.generations']):
            t0 = time.perf_counter()
            solutions = es.ask()
            data = [(config, seed, device, solution) for solution in solutions]
            out = pool.map(CloudpickleWrapper(fitness), data, chunksize=config['train.worker_chunksize'])
            Rs, Hs = zip(*out)
            es.tell(solutions, [-R for R in Rs])
            logger = Logger()
            logger('generation', generation + 1)
            logger('num_seconds', round(time.perf_counter() - t0, 1))
            logger('Returns', describe(Rs, axis=-1, repr_indent=1, repr_prefix='\n'))
            logger('Horizons', describe(Hs, axis=-1, repr_indent=1, repr_prefix='\n'))
            logger('fbest', es.result.fbest)
            train_logs.append(logger.logs)
            if generation == 0 or (generation + 1) % config['log.freq'] == 0:
                logger.dump(keys=None, index=0, indent=0, border='-' * 50)
            if (generation + 1) >= int(config['train.generations'] * (checkpoint_count / (config['checkpoint.num'] - 1))):
                agent.from_vec(tensorify(es.result.xbest, 'cpu'))
                agent.checkpoint(logdir, generation + 1)
                checkpoint_count += 1
    pickle_dump(obj=train_logs, f=logdir / 'train_logs', ext='.pkl')
    return None
def fitness(param):
    agent.from_vec(tensorify(param, 'cpu'))
    R = []
    H = []
    with torch.no_grad():
        for i in range(10):
            observation = env.reset()
            for t in range(env.spec.max_episode_steps):
                action = agent.choose_action(observation)['raw_action']
                observation, reward, done, info = env.step(action)
                if done[0]:
                    R.append(info[0]['episode']['return'])
                    H.append(info[0]['episode']['horizon'])
                    break
    return np.mean(R), np.mean(H)
def choose_action(self, obs, **kwargs):
    obs = tensorify(obs, self.device)
    out = {}
    action_dist = self.policy(obs)
    out['action_dist'] = action_dist
    out['entropy'] = action_dist.entropy()
    action = action_dist.sample()
    out['action'] = action
    out['raw_action'] = numpify(action, 'float')
    out['action_logprob'] = action_dist.log_prob(action.detach())
    V = self.value(obs)
    out['V'] = V
    return out
def choose_action(self, obs, **kwargs):
    obs = tensorify(obs, self.device)
    out = {}
    features = self.feature_network(obs)
    action_dist = self.action_head(features)
    out['action_dist'] = action_dist
    out['entropy'] = action_dist.entropy()
    action = action_dist.sample()
    out['action'] = action
    out['raw_action'] = numpify(action, 'float')
    out['action_logprob'] = action_dist.log_prob(action.detach())
    V = self.V_head(features)
    out['V'] = V
    return out
def choose_action(self, obs, **kwargs):
    obs = tensorify(obs, self.device)
    out = {}
    if kwargs['mode'] == 'train':
        dist = self.actor(obs)
        action = dist.rsample()
        out['action'] = action
        out['action_logprob'] = dist.log_prob(action)
    elif kwargs['mode'] == 'stochastic':
        with torch.no_grad():
            out['action'] = numpify(self.actor(obs).sample(), 'float')
    elif kwargs['mode'] == 'eval':
        with torch.no_grad():
            out['action'] = numpify(torch.tanh(self.actor.mean_forward(obs)), 'float')
    else:
        raise NotImplementedError
    return out
def choose_action(self, x, **kwargs):
    if x.first():
        self.state = self.reset(1)
    obs = tensorify(x.observation, self.device).unsqueeze(0)
    obs = obs.unsqueeze(0)  # add seq_dim
    features, [next_state] = self.feature_network(obs, [self.state])
    if 'last_info' not in kwargs:
        self.state = next_state
    features = features.squeeze(0)  # squeeze seq_dim
    action_dist = self.action_head(features)
    V = self.V_head(features)
    action = action_dist.sample()
    out = {}
    out['action_dist'] = action_dist
    out['V'] = V
    out['entropy'] = action_dist.entropy()
    out['action'] = action
    out['raw_action'] = numpify(action, self.env.action_space.dtype).squeeze(0)
    out['action_logprob'] = action_dist.log_prob(action.detach())
    return out
def learn(self, D, **kwargs):
    # Compute all metrics, D: list of Trajectory
    logprobs = [torch.cat(traj.get_all_info('action_logprob')) for traj in D]
    entropies = [torch.cat(traj.get_all_info('entropy')) for traj in D]
    Vs = [torch.cat(traj.get_all_info('V')) for traj in D]
    with torch.no_grad():
        last_observations = tensorify(np.concatenate([traj.last_observation for traj in D], 0), self.device)
        last_Vs = self.V_head(self.feature_network(last_observations)).squeeze(-1)
    Qs = [bootstrapped_returns(self.config['agent.gamma'], traj, last_V)
          for traj, last_V in zip(D, last_Vs)]
    As = [gae(self.config['agent.gamma'], self.config['agent.gae_lambda'], traj, V, last_V)
          for traj, V, last_V in zip(D, Vs, last_Vs)]

    # Metrics -> Tensor, device
    logprobs, entropies, Vs = map(lambda x: torch.cat(x).squeeze(), [logprobs, entropies, Vs])
    Qs, As = map(lambda x: tensorify(np.concatenate(x).copy(), self.device), [Qs, As])
    if self.config['agent.standardize_adv']:
        As = (As - As.mean()) / (As.std() + 1e-8)
    assert all([x.ndimension() == 1 for x in [logprobs, entropies, Vs, Qs, As]])

    # Loss
    policy_loss = -logprobs * As
    entropy_loss = -entropies
    value_loss = F.mse_loss(Vs, Qs, reduction='none')
    loss = policy_loss + self.config['agent.value_coef'] * value_loss + self.config['agent.entropy_coef'] * entropy_loss
    loss = loss.mean()

    self.optimizer.zero_grad()
    loss.backward()
    grad_norm = nn.utils.clip_grad_norm_(self.parameters(), self.config['agent.max_grad_norm'])
    self.optimizer.step()
    # Step the LR scheduler after the optimizer update
    if self.config['agent.use_lr_scheduler']:
        self.lr_scheduler.step(self.total_timestep)
    self.total_timestep += sum([len(traj) for traj in D])

    out = {}
    if self.config['agent.use_lr_scheduler']:
        out['current_lr'] = self.lr_scheduler.get_lr()
    out['loss'] = loss.item()
    out['grad_norm'] = grad_norm
    out['policy_loss'] = policy_loss.mean().item()
    out['entropy_loss'] = entropy_loss.mean().item()
    out['policy_entropy'] = -entropy_loss.mean().item()
    out['value_loss'] = value_loss.mean().item()
    out['V'] = describe(numpify(Vs, 'float').squeeze(), axis=-1, repr_indent=1, repr_prefix='\n')
    out['explained_variance'] = ev(y_true=numpify(Qs, 'float'), y_pred=numpify(Vs, 'float'))
    return out
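# `gae` above refers to the library's generalized advantage estimation. A minimal sketch of
# GAE(gamma, lambda) for a single trajectory, assuming `rewards` and `Vs` are per-step floats
# of equal length and `last_V` is the value estimate of the final observation; the function
# name and signature are assumptions, not the library's API.
import numpy as np


def gae_sketch(gamma, lam, rewards, Vs, last_V, reach_terminal):
    # Zero out the bootstrap value if the trajectory reached a terminal state.
    next_V = 0.0 if reach_terminal else float(last_V)
    advantages = np.zeros(len(rewards), dtype=np.float32)
    gae_t = 0.0
    for t in reversed(range(len(rewards))):
        delta = rewards[t] + gamma * next_V - Vs[t]  # one-step TD error
        gae_t = delta + gamma * lam * gae_t          # exponentially-weighted sum of TD errors
        advantages[t] = gae_t
        next_V = Vs[t]
    return advantages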
def learn(self, D, **kwargs):
    # Compute all metrics, D: list of Trajectory
    Ts = [len(traj) for traj in D]
    behavior_logprobs = [torch.cat(traj.get_all_info('action_logprob')) for traj in D]
    out_agent = self.choose_action(np.concatenate([traj.numpy_observations[:-1] for traj in D], 0))
    logprobs = out_agent['action_logprob'].squeeze()
    entropies = out_agent['entropy'].squeeze()
    Vs = out_agent['V'].squeeze()
    with torch.no_grad():
        last_observations = tensorify(np.concatenate([traj.last_observation for traj in D], 0), self.device)
        last_Vs = self.V_head(self.feature_network(last_observations)).squeeze(-1)
    vs, As = [], []
    for traj, behavior_logprob, logprob, V, last_V in zip(
            D, behavior_logprobs, logprobs.detach().cpu().split(Ts), Vs.detach().cpu().split(Ts), last_Vs):
        v, A = vtrace(behavior_logprob, logprob, self.gamma, traj.rewards, V, last_V,
                      traj.reach_terminal, self.clip_rho, self.clip_pg_rho)
        vs.append(v)
        As.append(A)

    # Metrics -> Tensor, device
    vs, As = map(lambda x: tensorify(np.concatenate(x).copy(), self.device), [vs, As])
    if self.config['agent.standardize_adv']:
        As = (As - As.mean()) / (As.std() + 1e-8)
    assert all([x.ndimension() == 1 for x in [logprobs, entropies, Vs, vs, As]])

    # Loss
    policy_loss = -logprobs * As
    entropy_loss = -entropies
    value_loss = F.mse_loss(Vs, vs, reduction='none')
    loss = policy_loss + self.config['agent.value_coef'] * value_loss + self.config['agent.entropy_coef'] * entropy_loss
    loss = loss.mean()

    self.optimizer.zero_grad()
    loss.backward()
    grad_norm = nn.utils.clip_grad_norm_(self.parameters(), self.config['agent.max_grad_norm'])
    self.optimizer.step()
    if self.config['agent.use_lr_scheduler']:
        self.lr_scheduler.step(self.total_timestep)
    self.total_timestep += sum([len(traj) for traj in D])

    out = {}
    if self.config['agent.use_lr_scheduler']:
        out['current_lr'] = self.lr_scheduler.get_lr()
    out['loss'] = loss.item()
    out['grad_norm'] = grad_norm
    out['policy_loss'] = policy_loss.mean().item()
    out['entropy_loss'] = entropy_loss.mean().item()
    out['policy_entropy'] = -entropy_loss.mean().item()
    out['value_loss'] = value_loss.mean().item()
    out['V'] = describe(numpify(Vs, 'float').squeeze(), axis=-1, repr_indent=1, repr_prefix='\n')
    out['explained_variance'] = ev(y_true=numpify(vs, 'float'), y_pred=numpify(Vs, 'float'))
    return out
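# `vtrace` above refers to the library's V-trace implementation (Espeholt et al., IMPALA).
# A minimal sketch of the recursion it follows, assuming 1-D numpy arrays of behavior and
# target log-probabilities, per-step rewards and values, and a scalar bootstrap `last_V`;
# the function name, signature, and c-bar clipping at 1.0 are assumptions for illustration.
import numpy as np


def vtrace_sketch(behavior_logprobs, target_logprobs, gamma, rewards, Vs, last_V,
                  reach_terminal, clip_rho=1.0, clip_pg_rho=1.0):
    rewards = np.asarray(rewards, dtype=np.float32)
    Vs = np.asarray(Vs, dtype=np.float32)
    ratio = np.exp(np.asarray(target_logprobs) - np.asarray(behavior_logprobs))
    rhos = np.minimum(clip_rho, ratio)  # clipped importance sampling ratios
    cs = np.minimum(1.0, ratio)         # trace-cutting coefficients
    bootstrap = 0.0 if reach_terminal else float(last_V)

    # Backward recursion: v_t = V_t + delta_t + gamma * c_t * (v_{t+1} - V_{t+1})
    T = len(rewards)
    vs = np.zeros(T, dtype=np.float32)
    next_v, next_V = bootstrap, bootstrap
    for t in reversed(range(T)):
        delta = rhos[t] * (rewards[t] + gamma * next_V - Vs[t])
        vs[t] = Vs[t] + delta + gamma * cs[t] * (next_v - next_V)
        next_v, next_V = vs[t], Vs[t]

    # Policy-gradient advantages bootstrap from v_{t+1}
    vs_next = np.append(vs[1:], bootstrap).astype(np.float32)
    pg_rhos = np.minimum(clip_pg_rho, ratio)
    As = pg_rhos * (rewards + gamma * vs_next - Vs)
    return vs, As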