Example #1
def test(env, actor_model):
    """
            Tests the model.

            Parameters:
                    env - the environment to test the policy on
                    actor_model - the actor model to load in

            Return:
                    None
    """
    print(f"Testing {actor_model}", flush=True)

    # If the actor model is not specified, then exit
    if actor_model == '':
        print(f"Didn't specify model file. Exiting.", flush=True)
        sys.exit(0)

    # Extract out dimensions of observation and action spaces
    obs_dim = env.observation_space.shape[0]
    act_dim = env.action_space.shape[0]

    # Build our policy the same way we build our actor model in PPO
    policy = FeedForwardNN(obs_dim, act_dim)

    # Load in the actor model saved by the PPO algorithm
    policy.load_state_dict(torch.load(actor_model))

    # Evaluate our policy with a separate module, eval_policy, to demonstrate
    # that once we are done training the model/policy with ppo.py, we no longer need
    # ppo.py since it only contains the training algorithm. The model/policy itself exists
    # independently as a binary file that can be loaded in with torch.
    eval_policy(policy=policy, env=env, render=True)
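
A minimal, hypothetical driver for this entry point, assuming a Gym environment with Box observation and action spaces and that FeedForwardNN and eval_policy come from the same repository; the environment id and the ppo_actor.pth file name below are placeholders, not part of the original example:

import gym

if __name__ == '__main__':
    env = gym.make('Pendulum-v1')                 # placeholder continuous-control environment
    test(env=env, actor_model='./ppo_actor.pth')  # path to the actor weights saved during training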
Example #2
    def __init__(self, env, max_steps=1000):

        self.env = env

        # Hyperparameters
        self.max_steps = max_steps
        # Discount factor
        self.gamma = 0.99
        self.lamda = 0.95

        # Number of epochs
        self.epochs = 10
        # Number of steps in one batch
        self.batch_size = self.max_steps
        # Size of a mini-batch
        self.mini_batch_size = self.batch_size // 10
        # Number of policy updates to perform
        self.updates = 10

        self.max_iterations = 16

        # Sum of rewards
        self.sum_rewards = 0
        self.obs = []

        # Initialize the neural network model
        self.model = FeedForwardNN(in_dim=1, out_dim=4, hidden_layer=16)

        self.optimizer = optim.Adam(self.model.parameters(), lr=2.5e-4)

        # List that stores the loss values
        self.loss = []
Example #3
    def __init__(self, env):
        # Initialize hyperparameters
        self._init_hyperparameters()

        # Extract environment information
        self.env = env
        self.obs_dim = env.observation_space.shape[0]
        self.act_dim = env.action_space.shape[0]

        # ALG STEP 1
        # Initialize the actor and critic networks
        self.actor = FeedForwardNN(self.obs_dim, self.act_dim)
        self.critic = FeedForwardNN(self.obs_dim, 1)

        # Initialize the optimizers
        self.actor_optim = Adam(self.actor.parameters(), lr=self.lr)
        self.critic_optim = Adam(self.critic.parameters(), lr=self.lr)

        # Create the covariance matrix for the action distribution
        self.cov_var = torch.full(size=(self.act_dim, ), fill_value=0.5)
        self.cov_mat = torch.diag(self.cov_var)
Example #4
def eval_progress(list_dir,
                  file_dir,
                  actor_model,
                  env,
                  obs_dim,
                  act_dim,
                  render=False):
    for entry in list_dir:
        ep_num = entry
        # Build the path without overwriting actor_model, otherwise it keeps accumulating across iterations
        model_path = file_dir + str(entry) + "/" + actor_model
        policy = FeedForwardNN(obs_dim, act_dim)
        # Load the actor weights saved at this training iteration
        policy.load_state_dict(torch.load(model_path))
        ep_len, ep_ret = rollout(policy, env, render)
        _log_summary(ep_len=ep_len, ep_ret=ep_ret, ep_num=ep_num)
    env.close()
Example #5
def test(env, datapath, actor_model, mode):
    print(f"Testing {actor_model}", flush=True)

    if actor_model == '':
        print(f"Didn't specify model file. Exiting.", flush=True)
        sys.exit(0)

    obs_dim = env.observation_space.shape[0]
    act_dim = env.action_space.shape[0]
    entries = os.listdir(datapath)
    entries = [int(x) for x in entries]
    entries.sort()

    if mode == 'test':
        policy = FeedForwardNN(obs_dim, act_dim)
        actor_model = datapath + "/" + str(entries[-1]) + "/" + actor_model
        policy.load_state_dict(torch.load(actor_model))
        print("Iteration " + str(entries[-1]))
        eval_policy(policy=policy, env=env, render=True)

    if mode == 'progress':
        eval_progress(list_dir=entries, file_dir=datapath, actor_model=actor_model,
                      obs_dim=obs_dim, act_dim=act_dim, env=env, render=True)
Example #6
class PPO:
    def __init__(self, env):
        # Initialize hyperparameters
        self._init_hyperparameters()

        # Extract environment information
        self.env = env
        self.obs_dim = env.observation_space.shape[0]
        self.act_dim = env.action_space.shape[0]

        # ALG STEP 1
        # Initialize the actor and critic networks
        self.actor = FeedForwardNN(self.obs_dim, self.act_dim)
        self.critic = FeedForwardNN(self.obs_dim, 1)

        # Initialize the optimizers
        self.actor_optim = Adam(self.actor.parameters(), lr=self.lr)
        self.critic_optim = Adam(self.critic.parameters(), lr=self.lr)

        # Create the covariance matrix for the action distribution
        self.cov_var = torch.full(size=(self.act_dim, ), fill_value=0.5)
        self.cov_mat = torch.diag(self.cov_var)

    def learn(self, total_timesteps):
        t_so_far = 0  # Timesteps simulated so far
        while t_so_far < total_timesteps:  # ALG STEP 2
            # ALG STEP 3
            batch_obs, batch_acts, batch_log_probs, batch_rtgs, batch_lens = self.rollout()

            # Count the timesteps collected in this batch
            t_so_far += np.sum(batch_lens)

            # Compute V_{phi_k}
            V, _ = self.evaluate(batch_obs, batch_acts)

            # ALG STEP 5
            # Compute and normalize the advantages
            A_k = batch_rtgs - V.detach()
            A_k = (A_k - A_k.mean()) / (A_k.std() + 1e-10)

            for _ in range(self.n_updates_per_iteration):
                # Compute pi_theta(a_t | s_t)
                V, curr_log_probs = self.evaluate(batch_obs, batch_acts)

                # Compute the probability ratios
                ratios = torch.exp(curr_log_probs - batch_log_probs)

                # Compute the surrogate (clipped) objectives
                surr1 = ratios * A_k
                surr2 = torch.clamp(ratios, 1 - self.clip, 1 + self.clip) * A_k

                # Compute the losses
                actor_loss = (-torch.min(surr1, surr2)).mean()
                critic_loss = nn.MSELoss()(V, batch_rtgs)

                # Compute gradients and backpropagate through the actor network
                self.actor_optim.zero_grad()
                actor_loss.backward()
                self.actor_optim.step()

                # Compute gradients and backpropagate through the critic network
                self.critic_optim.zero_grad()
                critic_loss.backward()
                self.critic_optim.step()

    def evaluate(self, batch_obs, batch_acts):
        # Query the critic network for the value V of each observation
        V = self.critic(batch_obs).squeeze()
        # Compute the log probabilities with the most recent actor network
        mean = self.actor(batch_obs)
        dist = MultivariateNormal(mean, self.cov_mat)
        log_probs = dist.log_prob(batch_acts)
        return V, log_probs

    def rollout(self):
        # Batch data
        batch_obs = []  # observations: (timesteps per batch, observation dimension)
        batch_acts = []  # actions: (timesteps per batch, action dimension)
        batch_log_probs = []  # log probabilities: (timesteps per batch)
        batch_rews = []  # rewards: (number of episodes, timesteps per episode)
        batch_rtgs = []  # rewards-to-go: (timesteps per batch)
        batch_lens = []  # episode lengths: (number of episodes)

        # Number of timesteps run so far in this batch
        t = 0
        while t < self.timesteps_per_batch:
            ep_rews = []
            obs = self.env.reset()
            done = False
            for ep_t in range(self.max_timesteps_per_episode):
                # Increment the count of timesteps run so far in this batch
                t += 1
                # Collect the observation
                batch_obs.append(obs)
                action, log_prob = self.get_action(obs)
                obs, rew, done, _ = self.env.step(action)
                # Collect the reward, action, and log probability
                ep_rews.append(rew)
                batch_acts.append(action)
                batch_log_probs.append(log_prob)
                # Stop if the episode has terminated
                if done:
                    break
            # Record the episode length and rewards
            batch_lens.append(ep_t + 1)
            batch_rews.append(ep_rews)

        # Reshape the collected data into tensors of the expected shape before returning
        batch_obs = torch.tensor(batch_obs, dtype=torch.float)
        batch_acts = torch.tensor(batch_acts, dtype=torch.float)
        batch_log_probs = torch.tensor(batch_log_probs, dtype=torch.float)
        # ALG STEP 4
        batch_rtgs = self.compute_rtgs(batch_rews)
        # Return the batch data
        return batch_obs, batch_acts, batch_log_probs, batch_rtgs, batch_lens

    def get_action(self, obs):
        # Query the actor network for the mean action
        # (equivalent to calling self.actor.forward(obs))
        mean = self.actor(obs)
        # Create the multivariate normal distribution
        dist = MultivariateNormal(mean, self.cov_mat)
        # Sample an action from the distribution and get its log probability
        action = dist.sample()
        log_prob = dist.log_prob(action)
        # Return the sampled action and its log probability
        return action.detach().numpy(), log_prob.detach()

    def compute_rtgs(self, batch_rews):
        # The rewards-to-go (rtg) per episode, per batch
        batch_rtgs = []
        # Iterate through each episode backwards to keep the same order in batch_rtgs
        for ep_rews in reversed(batch_rews):
            discounted_reward = 0  # The discounted reward so far
            for rew in reversed(ep_rews):
                discounted_reward = rew + discounted_reward * self.gamma
                batch_rtgs.insert(0, discounted_reward)
        # Convert the rewards-to-go into a tensor
        batch_rtgs = torch.tensor(batch_rtgs, dtype=torch.float)
        return batch_rtgs

    def _init_hyperparameters(self):
        self.timesteps_per_batch = 4800  # Timesteps per batch
        self.max_timesteps_per_episode = 1600  # Timesteps per episode
        self.gamma = 0.95  # Discount factor
        self.n_updates_per_iteration = 5  # Epochs per iteration
        self.clip = 0.2  # Clipping threshold
        self.lr = 0.005  # Learning rate
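
A minimal usage sketch for this class, assuming the imports used above (gym, numpy, torch, nn, Adam, MultivariateNormal, FeedForwardNN) are in place; the environment id and timestep budget are placeholders, not part of the original example:

import gym

env = gym.make('Pendulum-v1')         # any env with Box observation and action spaces
model = PPO(env)                      # builds the actor/critic networks and hyperparameters
model.learn(total_timesteps=200_000)  # repeats rollout + update until the timestep budget is reached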
Example #7
class PPO:
    def __init__(self, env, max_steps=1000):

        self.env = env

        # Hyperparameters
        self.max_steps = max_steps
        # Discount factor
        self.gamma = 0.99
        self.lamda = 0.95

        # Number of epochs
        self.epochs = 10
        # Number of steps in one batch
        self.batch_size = self.max_steps
        # Size of a mini-batch
        self.mini_batch_size = self.batch_size // 10
        # Number of policy updates to perform
        self.updates = 10

        self.max_iterations = 16

        # Sum of rewards
        self.sum_rewards = 0
        self.obs = []

        # Initialize the neural network model
        self.model = FeedForwardNN(in_dim=1, out_dim=4, hidden_layer=16)

        self.optimizer = optim.Adam(self.model.parameters(), lr=2.5e-4)

        # List that stores the loss values
        self.loss = []

    def step(self, action):

        obs, reward, done, _ = self.env.step(action)
        obs = obs_to_torch(obs)
        # Turn tensor(x) into tensor([x]) so the observation can be fed to the neural network
        obs = obs.unsqueeze(dim=0)
        return obs, reward, done

    def reset(self):
        obs = self.env.reset()
        obs = obs_to_torch(obs)
        # Turn tensor(x) into tensor([x]) so the observation can be fed to the neural network
        obs = obs.unsqueeze(dim=0)
        self.sum_rewards = []
        return obs

    def _calc_advantages(self, done: np.ndarray, rewards: np.ndarray,
                         values: np.ndarray) -> np.ndarray:
        # Compute the Generalized Advantage Estimates (GAE)
        advantages = np.zeros((self.max_steps), dtype=np.float32)
        last_advantage = 0.

        _, last_value = self.model(obs_to_torch(self.obs))
        last_value = last_value.cpu().data

        # Iterate in reverse: GAE is a discounted sum of future TD errors, so each
        # step can reuse the advantage already computed for the following step
        for t in reversed(range(self.max_steps)):
            mask = 1.0 - done[t]
            last_value = last_value * mask
            last_advantage = last_advantage * mask

            # TD error: r_t + gamma * V(s_{t+1}) - V(s_t)
            delta = rewards[t] + self.gamma * last_value - values[t]

            last_advantage = delta + self.gamma * self.lamda * last_advantage

            advantages[t] = last_advantage

            last_value = values[t]

        return advantages

    def sample(self):
        # Holds the samples collected during one sampling pass
        rewards_array = np.zeros((self.max_steps), dtype=np.int32)
        actions_array = np.zeros((self.max_steps), dtype=np.int32)
        done_array = np.zeros((self.max_steps), dtype=bool)
        # Observations (64 is the number of positions the agent can navigate)
        obs_array = np.zeros((self.max_steps), dtype=np.float32)
        log_pis_array = np.zeros((self.max_steps), dtype=np.float32)  # log of the policy
        values_array = np.zeros((self.max_steps), dtype=np.float32)  # value function

        self.obs = self.reset()  # Store the latest observation
        # Each training step
        count_iterations = 0  # Reset the environment if this exceeds the step threshold
        deu_gg = 0  # Counts episodes that ended with a reward of 1
        for t in range(self.max_steps):
            with torch.no_grad():
                obs_array[t] = self.obs

                pi, v = self.model(self.obs)

                values_array[t] = v.cpu().numpy()
                a = pi.sample()
                actions_array[t] = a.cpu().numpy()
                action = int(a.cpu().numpy())
                log_pis_array[t] = pi.log_prob(a).cpu().numpy()
                # Get the step information given the sampled action a
                self.obs, new_reward, new_done = self.step(action)

                obs_array[t] = self.obs.numpy()
                rewards_array[t] = new_reward
                done_array[t] = new_done

                if new_done or count_iterations > self.max_iterations:
                    self.obs = self.reset()
                    count_iterations = 0
                    if new_reward == 1:
                        deu_gg += 1
                count_iterations += 1

        # Compute the advantages (Generalized Advantage Estimation)
        advantage = self._calc_advantages(done_array, rewards_array,
                                          values_array)

        # Build a dictionary containing the information of each sample
        samples = {
            'obs': obs_array,
            'actions': actions_array,
            'values': values_array,
            'log_pis': log_pis_array,
            'advantage': advantage
        }

        # Build a dictionary with the flattened samples
        samples_flat = {}
        for k, v in samples.items():
            v = v.reshape(-1, 1)
            if k == 'obs':
                samples_flat[k] = obs_to_torch(v)
            else:
                samples_flat[k] = torch.tensor(v, device=device)

        return samples_flat

    def train(self, samples: Dict[str, torch.Tensor], learning_rate: float,
              clip_range: float):
        for _ in range(self.epochs):
            # Shuffle the sample indices
            idx = torch.randperm(self.batch_size)
            # Loop over mini-batches to update the policy;
            # mini_batch_size dictates how many observations go into each mini-batch
            for start in range(0, self.batch_size, self.mini_batch_size):
                end = start + self.mini_batch_size
                mini_batch_idx = idx[start:end]
                mini_batch = {}

                for k, v in samples.items():
                    mini_batch[k] = v[mini_batch_idx]

                loss = self._calc_loss(clip_range=clip_range,
                                       samples=mini_batch)

                self.loss.append(loss)

                for pg in self.optimizer.param_groups:
                    pg['lr'] = learning_rate

                self.optimizer.zero_grad()
                loss.backward()
                torch.nn.utils.clip_grad_norm_(self.model.parameters(),
                                               max_norm=0.5)
                self.optimizer.step()

    @staticmethod
    def _normalize(adv: torch.Tensor):
        """#### Normalize advantage function"""
        return (adv - adv.mean()) / (
            adv.std() + 1e-8
        )  # Normalizando, foi adicionado 1e-8 para garantir que não haja divisao por zero

    def _calc_loss(self, samples: Dict[str, torch.Tensor],
                   clip_range: float) -> torch.Tensor:
        # samples['log_pis'] and log_pi are log probabilities. We want pi_new / pi_old;
        # since these are logs, we use log(a / b) = log a - log b, i.e. a / b = exp(log a - log b).
        sampled_return = samples['values'] + samples['advantage']

        # Normalize the advantages
        sampled_normalized_advantage = self._normalize(samples['advantage'])

        # Query the model for the policy distribution and the value function
        pi, value = self.model(samples['obs'])

        # Log probability of the sampled actions under the current policy
        log_pi = pi.log_prob(samples['actions'])

        # Probability ratio: exp(log pi_new - log pi_old) = pi_new / pi_old
        ratio = torch.exp(log_pi - samples['log_pis'])

        # Clip the ratio to [1 - clip_range, 1 + clip_range]
        clipped_ratio = ratio.clamp(min=1.0 - clip_range, max=1.0 + clip_range)

        # Clipped surrogate objective from the PPO paper, averaged over the mini-batch
        policy_reward = torch.min(ratio * sampled_normalized_advantage,
                                  clipped_ratio * sampled_normalized_advantage)
        policy_reward = policy_reward.mean()

        # Entropy bonus (encourages exploration), averaged over the mini-batch
        entropy_bonus = pi.entropy()
        entropy_bonus = entropy_bonus.mean()

        # Clipped value estimate: old value plus a clipped difference to the new value
        clipped_value = samples['values'] + (value - samples['values']).clamp(
            min=-clip_range, max=clip_range)

        # Value function loss: take the maximum (more pessimistic) of the two squared errors
        vf_loss = torch.max((value - sampled_return)**2,
                            (clipped_value - sampled_return)**2)

        # Half of the mean squared error over the mini-batch
        vf_loss = 0.5 * vf_loss.mean()

        # Negate so that minimizing the loss maximizes the PPO objective
        loss = -(policy_reward - 0.5 * vf_loss + 0.01 * entropy_bonus)

        # Approximate KL divergence between the old and new policies (diagnostic only)
        approx_kl_divergence = .5 * ((samples['log_pis'] - log_pi)**2).mean()

        # Fraction of samples where the ratio was clipped (diagnostic only)
        clip_fraction = (abs(
            (ratio - 1.0)) > clip_range).to(torch.float).mean()

        return loss

    def run_training_loop(self):
        for update in range(self.updates):
            progress = update / self.updates
            learning_rate = 2.5e-4 * (1 - progress)
            clip_range = 0.1 * (1 - progress)

            samples = self.sample()

            self.train(samples, learning_rate, clip_range)
            print('finished update', update)

    def test_loop(self, number_it):
        for i in range(number_it):
            obs = obs_to_torch(self.env.reset())
            done = False
            while not done:
                self.env.render()
                pi, value = self.model(obs.reshape(1, -1))
                action = pi.sample()
                obs, reward, done, _ = self.env.step(int(action))
                obs = obs_to_torch(obs)
                time.sleep(1)
                if done:
                    print("GG WP")