def test(env, actor_model):
    """
    Tests the model.

    Parameters:
        env - the environment to test the policy on
        actor_model - the actor model to load in

    Return:
        None
    """
    print(f"Testing {actor_model}", flush=True)

    # If the actor model is not specified, then exit
    if actor_model == '':
        print("Didn't specify model file. Exiting.", flush=True)
        sys.exit(0)

    # Extract out dimensions of observation and action spaces
    obs_dim = env.observation_space.shape[0]
    act_dim = env.action_space.shape[0]

    # Build our policy the same way we build our actor model in PPO
    policy = FeedForwardNN(obs_dim, act_dim)

    # Load in the actor model saved by the PPO algorithm
    policy.load_state_dict(torch.load(actor_model))

    # Evaluate our policy with a separate module, eval_policy, to demonstrate
    # that once we are done training the model/policy with ppo.py, we no longer need
    # ppo.py since it only contains the training algorithm. The model/policy itself exists
    # independently as a binary file that can be loaded in with torch.
    eval_policy(policy=policy, env=env, render=True)
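# A minimal usage sketch for test() above. It assumes the older Gym API this repo
# uses (env.step returning a 4-tuple) and a continuous-control task; the environment
# name 'Pendulum-v0' and the checkpoint path './ppo_actor.pth' are illustrative
# placeholders, not values the source defines.
import gym

env = gym.make('Pendulum-v0')
test(env=env, actor_model='./ppo_actor.pth')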
def __init__(self, env, max_steps=1000):
    self.env = env

    # Hyperparameters
    self.max_steps = max_steps
    # Discount factor
    self.gamma = 0.99
    # GAE lambda
    self.lamda = 0.95
    # Number of epochs per update
    self.epochs = 10
    # Number of steps collected per batch
    self.batch_size = self.max_steps
    # Size of a mini-batch
    self.mini_batch_size = self.batch_size // 10
    # Number of policy updates that will be performed
    self.updates = 10
    self.max_iterations = 16

    # Sum of rewards
    self.sum_rewards = 0
    self.obs = []

    # Initialize the neural network model
    self.model = FeedForwardNN(in_dim=1, out_dim=4, hidden_layer=16)
    self.optimizer = optim.Adam(self.model.parameters(), lr=2.5e-4)

    # List that stores the loss values
    self.loss = []
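# A quick sketch of what batch_size and mini_batch_size imply for the update loop
# later in this class (the same indexing scheme train() uses): with max_steps=1000,
# the batch of 1000 transitions is shuffled and split into 10 mini-batches of 100.
# The names below are illustrative only.
import torch

batch_size, mini_batch_size = 1000, 100
idx = torch.randperm(batch_size)
mini_batches = [idx[start:start + mini_batch_size]
                for start in range(0, batch_size, mini_batch_size)]
assert len(mini_batches) == 10 and all(len(m) == 100 for m in mini_batches)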
def __init__(self, env):
    # Initialize hyperparameters
    self._init_hyperparameters()

    # Extract environment information
    self.env = env
    self.obs_dim = env.observation_space.shape[0]
    self.act_dim = env.action_space.shape[0]

    # ALG STEP 1
    # Initialize the actor and critic networks
    self.actor = FeedForwardNN(self.obs_dim, self.act_dim)
    self.critic = FeedForwardNN(self.obs_dim, 1)

    # Initialize the optimizers for the actor and critic networks
    self.actor_optim = Adam(self.actor.parameters(), lr=self.lr)
    self.critic_optim = Adam(self.critic.parameters(), lr=self.lr)

    # Create the covariance matrix used to sample actions
    self.cov_var = torch.full(size=(self.act_dim,), fill_value=0.5)
    self.cov_mat = torch.diag(self.cov_var)
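# A small, self-contained sketch of what cov_var / cov_mat are for: actions are drawn
# from a multivariate Gaussian centered on the actor's output with a fixed diagonal
# covariance of 0.5. The act_dim value and mean vector below are made up for
# illustration only.
import torch
from torch.distributions import MultivariateNormal

act_dim = 2
cov_mat = torch.diag(torch.full(size=(act_dim,), fill_value=0.5))
mean = torch.tensor([0.1, -0.3])          # stands in for self.actor(obs)
dist = MultivariateNormal(mean, cov_mat)
action = dist.sample()                    # stochastic action near the mean
log_prob = dist.log_prob(action)          # log-probability used by the PPO ratio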
def eval_progress(list_dir, file_dir, actor_model, env, obs_dim, act_dim, render=False):
    for entry in list_dir:
        ep_num = entry
        # Build the checkpoint path in a local variable so it does not keep
        # accumulating onto actor_model across iterations
        model_path = file_dir + "/" + str(entry) + "/" + actor_model
        policy = FeedForwardNN(obs_dim, act_dim)
        # Load the saved actor weights; without this the rollout would run a
        # randomly initialized policy
        policy.load_state_dict(torch.load(model_path))
        ep_len, ep_ret = rollout(policy, env, render)
        _log_summary(ep_len=ep_len, ep_ret=ep_ret, ep_num=ep_num)
    env.close()
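# Illustrative call for eval_progress(): it assumes checkpoints are laid out as
# <file_dir>/<iteration>/<actor_model>, which matches how test() below builds its
# paths. The environment name, directory, and file name are placeholders.
import gym

env = gym.make('Pendulum-v0')
eval_progress(list_dir=[100, 200, 300], file_dir='./checkpoints',
              actor_model='ppo_actor.pth', env=env,
              obs_dim=env.observation_space.shape[0],
              act_dim=env.action_space.shape[0], render=False)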
def test(env, datapath, actor_model, mode):
    print(f"Testing {actor_model}", flush=True)

    # If the actor model is not specified, then exit
    if actor_model == '':
        print("Didn't specify model file. Exiting.", flush=True)
        sys.exit(0)

    obs_dim = env.observation_space.shape[0]
    act_dim = env.action_space.shape[0]

    # Checkpoint directories are named after the training iteration that produced them
    entries = os.listdir(datapath)
    entries = [int(x) for x in entries]
    entries.sort()

    if mode == 'test':
        # Evaluate only the most recent checkpoint
        policy = FeedForwardNN(obs_dim, act_dim)
        actor_model = datapath + "/" + str(entries[-1]) + "/" + actor_model
        policy.load_state_dict(torch.load(actor_model))
        print("Iteration " + str(entries[-1]))
        eval_policy(policy=policy, env=env, render=True)

    if mode == 'progress':
        # Evaluate every saved checkpoint in order
        eval_progress(list_dir=entries, file_dir=datapath, actor_model=actor_model,
                      obs_dim=obs_dim, act_dim=act_dim, env=env, render=True)
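# Hedged usage sketch for this variant of test(): 'mode' selects between evaluating
# only the latest checkpoint ('test') and replaying every checkpoint in order
# ('progress'). The environment name, datapath, and model file name are placeholders.
import gym

env = gym.make('Pendulum-v0')
test(env=env, datapath='./checkpoints', actor_model='ppo_actor.pth', mode='progress')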
# Imports this class needs; FeedForwardNN is the MLP defined elsewhere in this repo.
import numpy as np
import torch
import torch.nn as nn
from torch.optim import Adam
from torch.distributions import MultivariateNormal


class PPO:
    def __init__(self, env):
        # Initialize hyperparameters
        self._init_hyperparameters()

        # Extract environment information
        self.env = env
        self.obs_dim = env.observation_space.shape[0]
        self.act_dim = env.action_space.shape[0]

        # ALG STEP 1
        # Initialize the actor and critic networks
        self.actor = FeedForwardNN(self.obs_dim, self.act_dim)
        self.critic = FeedForwardNN(self.obs_dim, 1)

        # Initialize the optimizers for the actor and critic networks
        self.actor_optim = Adam(self.actor.parameters(), lr=self.lr)
        self.critic_optim = Adam(self.critic.parameters(), lr=self.lr)

        # Create the covariance matrix used to sample actions
        self.cov_var = torch.full(size=(self.act_dim,), fill_value=0.5)
        self.cov_mat = torch.diag(self.cov_var)

    def learn(self, total_timesteps):
        t_so_far = 0  # Timesteps simulated so far

        while t_so_far < total_timesteps:  # ALG STEP 2
            # ALG STEP 3
            batch_obs, batch_acts, batch_log_probs, batch_rtgs, batch_lens = self.rollout()

            # Count how many timesteps were collected for this batch
            t_so_far += np.sum(batch_lens)

            # Calculate V_{phi, k}
            V, _ = self.evaluate(batch_obs, batch_acts)

            # ALG STEP 5
            # Calculate and normalize the advantages
            A_k = batch_rtgs - V.detach()
            A_k = (A_k - A_k.mean()) / (A_k.std() + 1e-10)

            for _ in range(self.n_updates_per_iteration):
                # Calculate pi_theta(a_t | s_t)
                V, curr_log_probs = self.evaluate(batch_obs, batch_acts)

                # Calculate the ratios
                ratios = torch.exp(curr_log_probs - batch_log_probs)

                # Calculate the surrogate losses
                surr1 = ratios * A_k
                surr2 = torch.clamp(ratios, 1 - self.clip, 1 + self.clip) * A_k

                # Calculate the losses
                actor_loss = (-torch.min(surr1, surr2)).mean()
                critic_loss = nn.MSELoss()(V, batch_rtgs)

                # Calculate gradients and backpropagate through the actor network
                self.actor_optim.zero_grad()
                actor_loss.backward()
                self.actor_optim.step()

                # Calculate gradients and backpropagate through the critic network
                self.critic_optim.zero_grad()
                critic_loss.backward()
                self.critic_optim.step()

    def evaluate(self, batch_obs, batch_acts):
        # Query the critic network for a value V of each observation
        V = self.critic(batch_obs).squeeze()

        # Calculate the log probabilities of the batch actions using the latest actor network
        mean = self.actor(batch_obs)
        dist = MultivariateNormal(mean, self.cov_mat)
        log_probs = dist.log_prob(batch_acts)

        return V, log_probs

    def rollout(self):
        # Batch data
        batch_obs = []        # observations: (timesteps per batch, observation dimension)
        batch_acts = []       # actions: (timesteps per batch, action dimension)
        batch_log_probs = []  # log probabilities: (timesteps per batch)
        batch_rews = []       # rewards: (number of episodes, timesteps per episode)
        batch_rtgs = []       # rewards-to-go: (timesteps per batch)
        batch_lens = []       # episode lengths: (number of episodes)

        # Number of timesteps run so far in this batch
        t = 0
        while t < self.timesteps_per_batch:
            ep_rews = []

            obs = self.env.reset()
            done = False

            for ep_t in range(self.max_timesteps_per_episode):
                # Increment the number of timesteps run in this batch so far
                t += 1

                # Collect the observation
                batch_obs.append(obs)

                action, log_prob = self.get_action(obs)
                obs, rew, done, _ = self.env.step(action)

                # Collect the reward, action, and log probability
                ep_rews.append(rew)
                batch_acts.append(action)
                batch_log_probs.append(log_prob)

                # Break if the episode is done
                if done:
                    break

            # Collect the episode length and rewards
            batch_lens.append(ep_t + 1)
            batch_rews.append(ep_rews)

        # Reshape the data into tensors of the required shapes before returning
        batch_obs = torch.tensor(batch_obs, dtype=torch.float)
        batch_acts = torch.tensor(batch_acts, dtype=torch.float)
        batch_log_probs = torch.tensor(batch_log_probs, dtype=torch.float)

        # ALG STEP 4
        batch_rtgs = self.compute_rtgs(batch_rews)

        # Return the batch data
        return batch_obs, batch_acts, batch_log_probs, batch_rtgs, batch_lens

    def get_action(self, obs):
        # Query the actor network for a mean action
        # (same as calling self.actor.forward(obs))
        mean = self.actor(obs)

        # Define the multivariate normal distribution
        dist = MultivariateNormal(mean, self.cov_mat)

        # Sample an action from the distribution and get its log probability
        action = dist.sample()
        log_prob = dist.log_prob(action)

        # Return the sampled action and its log probability
        return action.detach().numpy(), log_prob.detach()

    def compute_rtgs(self, batch_rews):
        # The rewards-to-go (rtg) per episode per batch to return
        batch_rtgs = []

        # Iterate through each episode backwards to keep the same order in batch_rtgs
        for ep_rews in reversed(batch_rews):
            discounted_reward = 0  # The discounted reward so far

            for rew in reversed(ep_rews):
                discounted_reward = rew + discounted_reward * self.gamma
                batch_rtgs.insert(0, discounted_reward)

        # Convert the rewards-to-go to a tensor
        batch_rtgs = torch.tensor(batch_rtgs, dtype=torch.float)

        return batch_rtgs

    def _init_hyperparameters(self):
        self.timesteps_per_batch = 4800        # Timesteps per batch
        self.max_timesteps_per_episode = 1600  # Max timesteps per episode
        self.gamma = 0.95                      # Discount factor
        self.n_updates_per_iteration = 5       # Epochs per iteration
        self.clip = 0.2                        # Clip threshold
        self.lr = 0.005                        # Learning rate
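# A hedged usage sketch for the PPO class above. It assumes a Gym environment with a
# Box (continuous) action space and the older Gym API that rollout() relies on
# (env.reset() returning only the observation, env.step() returning a 4-tuple).
# 'Pendulum-v0' and the timestep budget are illustrative choices, not values fixed
# by the source.
import gym

env = gym.make('Pendulum-v0')
model = PPO(env)
model.learn(total_timesteps=200_000)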
# Imports this class needs; FeedForwardNN, obs_to_torch, and device are defined
# elsewhere in this repo.
import time
from typing import Dict

import numpy as np
import torch
import torch.optim as optim


class PPO:
    def __init__(self, env, max_steps=1000):
        self.env = env

        # Hyperparameters
        self.max_steps = max_steps
        # Discount factor
        self.gamma = 0.99
        # GAE lambda
        self.lamda = 0.95
        # Number of epochs per update
        self.epochs = 10
        # Number of steps collected per batch
        self.batch_size = self.max_steps
        # Size of a mini-batch
        self.mini_batch_size = self.batch_size // 10
        # Number of policy updates that will be performed
        self.updates = 10
        self.max_iterations = 16

        # Sum of rewards
        self.sum_rewards = 0
        self.obs = []

        # Initialize the neural network model
        self.model = FeedForwardNN(in_dim=1, out_dim=4, hidden_layer=16)
        self.optimizer = optim.Adam(self.model.parameters(), lr=2.5e-4)

        # List that stores the loss values
        self.loss = []

    def step(self, action):
        obs, reward, done, _ = self.env.step(action)
        obs = obs_to_torch(obs)
        # Turn tensor(x) into tensor([x]) so it can be fed to the neural network
        obs = obs.unsqueeze(dim=0)
        return obs, reward, done

    def reset(self):
        obs = self.env.reset()
        obs = obs_to_torch(obs)
        # Turn tensor(x) into tensor([x]) so it can be fed to the neural network
        obs = obs.unsqueeze(dim=0)
        self.sum_rewards = []
        return obs

    def _calc_advantages(self, done: np.ndarray, rewards: np.ndarray,
                         values: np.ndarray) -> np.ndarray:
        # Computes the advantages with Generalized Advantage Estimation (GAE)
        advantages = np.zeros(self.max_steps, dtype=np.float32)
        last_advantage = 0.

        _, last_value = self.model(obs_to_torch(self.obs))
        last_value = last_value.cpu().data

        # The loop runs in reverse order because each advantage depends on the
        # advantage of the following timestep (the GAE recursion)
        for t in reversed(range(self.max_steps)):
            mask = 1.0 - done[t]
            last_value = last_value * mask
            last_advantage = last_advantage * mask
            # TD error: delta_t = r_t + gamma * V(s_{t+1}) - V(s_t)
            delta = rewards[t] + self.gamma * last_value - values[t]
            last_advantage = delta + self.gamma * self.lamda * last_advantage
            advantages[t] = last_advantage
            last_value = values[t]

        return advantages

    def sample(self):
        # Arrays that hold the samples collected for one update
        rewards_array = np.zeros(self.max_steps, dtype=np.int32)
        actions_array = np.zeros(self.max_steps, dtype=np.int32)
        done_array = np.zeros(self.max_steps, dtype=bool)
        # 64 is the number of positions the agent can navigate
        obs_array = np.zeros(self.max_steps, dtype=np.float32)
        log_pis_array = np.zeros(self.max_steps, dtype=np.float32)  # log of the policy
        values_array = np.zeros(self.max_steps, dtype=np.float32)   # value function

        # Store the last observation
        self.obs = self.reset()

        # If the agent goes past a threshold of steps, reset the environment
        count_iterations = 0
        # Counts the episodes that ended with reward 1 (wins)
        deu_gg = 0

        # Each data-collection step
        for t in range(self.max_steps):
            with torch.no_grad():
                obs_array[t] = self.obs
                pi, v = self.model(self.obs)
                values_array[t] = v.cpu().numpy()
                a = pi.sample()
                actions_array[t] = a.cpu().numpy()
                action = int(a.cpu().numpy())
                log_pis_array[t] = pi.log_prob(a).cpu().numpy()

            # Get the step information for action a
            self.obs, new_reward, new_done = self.step(action)
            obs_array[t] = self.obs.numpy()
            rewards_array[t] = new_reward
            done_array[t] = new_done

            if new_done or count_iterations > self.max_iterations:
                self.obs = self.reset()
                count_iterations = 0

            if new_reward == 1:
                deu_gg += 1

            count_iterations += 1

        # Compute the advantage (Generalized Advantage Estimation)
        advantage = self._calc_advantages(done_array, rewards_array, values_array)

        # Dictionary with the information of each sample
        samples = {
            'obs': obs_array,
            'actions': actions_array,
            'values': values_array,
            'log_pis': log_pis_array,
            'advantage': advantage
        }

        # Dictionary that will hold the flattened samples
        samples_flat = {}
        for k, v in samples.items():
            v = v.reshape(-1, 1)
            if k == 'obs':
                samples_flat[k] = obs_to_torch(v)
            else:
                samples_flat[k] = torch.tensor(v, device=device)

        return samples_flat

    def train(self, samples: Dict[str, torch.Tensor], learning_rate: float,
              clip_range: float):
        for _ in range(self.epochs):
            # Shuffle the sample indices
            idx = torch.randperm(self.batch_size)

            # Policy update loop: mini_batch_size sets how many observations
            # go into each mini-batch
            for start in range(0, self.batch_size, self.mini_batch_size):
                end = start + self.mini_batch_size
                mini_batch_idx = idx[start:end]
                mini_batch = {}
                for k, v in samples.items():
                    mini_batch[k] = v[mini_batch_idx]

                loss = self._calc_loss(clip_range=clip_range, samples=mini_batch)
                self.loss.append(loss)

                for pg in self.optimizer.param_groups:
                    pg['lr'] = learning_rate
                self.optimizer.zero_grad()
                loss.backward()
                torch.nn.utils.clip_grad_norm_(self.model.parameters(), max_norm=0.5)
                self.optimizer.step()

    @staticmethod
    def _normalize(adv: torch.Tensor):
        """Normalize the advantage function."""
        # 1e-8 is added to guarantee there is no division by zero
        return (adv - adv.mean()) / (adv.std() + 1e-8)

    def _calc_loss(self, samples: Dict[str, torch.Tensor],
                   clip_range: float) -> torch.Tensor:
        # samples['log_pis'] holds log-probabilities under the old policy.
        # We want pi_new / pi_old; since we work with logs we use
        # log(a) - log(b) = log(a / b)
        sampled_return = samples['values'] + samples['advantage']
        # Normalization <- is it actually needed?
        sampled_normalized_advantage = self._normalize(samples['advantage'])

        # Query the model for the policy and the value function
        pi, value = self.model(samples['obs'])

        # Log-probability of the sampled actions under the current policy
        log_pi = pi.log_prob(samples['actions'])

        ratio = torch.exp(log_pi - samples['log_pis'])  # new policy / old policy
        clipped_ratio = ratio.clamp(min=1.0 - clip_range, max=1.0 + clip_range)

        # Clipped surrogate objective from the PPO paper
        policy_reward = torch.min(ratio * sampled_normalized_advantage,
                                  clipped_ratio * sampled_normalized_advantage)
        policy_reward = policy_reward.mean()

        # Entropy bonus encourages exploration
        entropy_bonus = pi.entropy()
        entropy_bonus = entropy_bonus.mean()

        # Value of the observation, clipped around the old value estimate
        clipped_value = samples['values'] + (value - samples['values']).clamp(
            min=-clip_range, max=clip_range)
        # Value function loss: take the maximum of the clipped and unclipped errors
        vf_loss = torch.max((value - sampled_return) ** 2,
                            (clipped_value - sampled_return) ** 2)
        vf_loss = 0.5 * vf_loss.mean()

        # Multiply by -1 so that minimizing the loss maximizes the objective
        loss = -(policy_reward - 0.5 * vf_loss + 0.01 * entropy_bonus)

        # Diagnostics (currently unused): approximate KL divergence and clip fraction
        approx_kl_divergence = .5 * ((samples['log_pis'] - log_pi) ** 2).mean()
        clip_fraction = (abs((ratio - 1.0)) > clip_range).to(torch.float).mean()

        return loss

    def run_training_loop(self):
        for update in range(self.updates):
            progress = update / self.updates
            # Anneal the learning rate and clip range as training progresses
            learning_rate = 2.5e-4 * (1 - progress)
            clip_range = 0.1 * (1 - progress)

            samples = self.sample()
            self.train(samples, learning_rate, clip_range)
            print('made it this far')

    def test_loop(self, number_it):
        for i in range(number_it):
            obs = obs_to_torch(self.env.reset())
            done = False
            while not done:
                self.env.render()
                pi, value = self.model(obs.reshape(1, -1))
                action = pi.sample()
                obs, reward, done, _ = self.env.step(int(action))
                obs = obs_to_torch(obs)
                time.sleep(1)
                if done:
                    print("GG WP")
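# Hedged usage sketch for this PPO variant. The network shape (in_dim=1, out_dim=4)
# suggests a scalar observation and four discrete actions, so something like
# FrozenLake; the environment name below is an assumption, and obs_to_torch /
# FeedForwardNN / device must already be importable.
import gym

env = gym.make('FrozenLake-v1', is_slippery=False)
agent = PPO(env, max_steps=1000)
agent.run_training_loop()     # collect samples and update the policy self.updates times
agent.test_loop(number_it=3)  # render a few rollouts of the trained policy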