Example 1
    def sample_and_update(self):
        av_loss = 0
        # update old model
        self.network_old.load_state_dict(self.network.state_dict())
        for step in tqdm(range(self.policy_update_epochs)):
            # cf https://github.com/openai/baselines/blob/master/baselines/pposgd/pposgd_simple.py
            batch_states, batch_actions, batch_rewards, batch_states1, batch_terminals, extra = \
                self.replay_memory.sample_and_split(self.batch_size)
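            # extra[:, 0] holds the precomputed returns, extra[:, 1] the GAE advantages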


            batch_returns = to_tensor(extra[:, 0])
            batch_advantages = to_tensor(extra[:, 1])
            batch_states = to_tensor(batch_states)
            batch_actions = to_tensor(batch_actions)

            # old probas
            actions_old, v_pred_old = self.network_old(batch_states.detach())
            probs_old = self.network_old.log_prob(batch_actions)

            # new probabilities
            actions, v_pred = self.network(batch_states)
            probs = self.network.log_prob(batch_actions)

            # probability ratio r = pi_new / pi_old, computed from log-probabilities
            ratio = tor.exp(probs - probs_old)
            # clipped surrogate objective (surrogate from conservative policy iteration)
            stacked_advantages = tor.stack([batch_advantages] * batch_actions.shape[1], 1)
            surr1 = ratio * stacked_advantages
            surr2 = ratio.clamp(1 - self.epsilon, 1 + self.epsilon) * stacked_advantages
            loss_clip = -tor.mean(tor.min(surr1, surr2))
            # value loss, clipped around the old value prediction
            vfloss1 = (v_pred - batch_returns) ** 2
            v_pred_clipped = v_pred_old + (v_pred - v_pred_old).clamp(-self.epsilon, self.epsilon)
            vfloss2 = (v_pred_clipped - batch_returns) ** 2
            loss_value = 0.5 * tor.mean(tor.max(vfloss1, vfloss2))
            # entropy term: probs are log-probabilities, so exp(probs) * probs = p * log(p)
            loss_ent = -self.ent_coeff * tor.mean(tor.exp(probs) * probs)
            # total
            total_loss = (loss_clip + loss_value + loss_ent)
            av_loss += total_loss.item() / float(self.num_episodes)
            # step
            self.optimizer.zero_grad()
            # model.zero_grad()
            total_loss.backward()
            # print(list(self.network.parameters())[0].grad)
            self.optimizer.step()
        self.num_episodes = 0
        self.replay_memory.clear()
Example 2
    def gather_experience_and_calc_advantages(self):
        acc_reward = 0
        episode_length = 0
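        # Roll out memory_fill_steps environment steps with the current policy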
        for i in tqdm(range(self.memory_fill_steps)):
            self.adv_states[i] = self.state
            action, value = self.network(to_tensor(self.state).view(1,-1))
            self.adv_actions[i] = action.data.numpy()
            self.adv_values[i] = value.data.numpy()

            env_action = action.data.squeeze().numpy()
            state, reward, done, _ = self.env.step(env_action)
            episode_length += 1
            acc_reward += reward

            self.state = state
            done = done or episode_length >= self.max_episode_len
            self.adv_terminal[i] = done

            self._episode_step(state, env_action, reward, done)

            if done:
                self.state = self.env.reset()
                self.num_episodes += 1
                episode_length = 0

            #reward = max(min(reward, 1), -1)
            self.adv_rewards[i] = reward

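            # Bootstrap value: value of the successor state, or zero if the episode ended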
            R = np.zeros((1,1))
            if not done:
                action, value = self.network(to_tensor(state).view(1,-1))
                R = value.data.numpy()
            self.adv_values[i] = R

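        # Generalized Advantage Estimation (GAE): accumulate discounted TD residuals backwards through the rollout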
        A = np.zeros((1,1))
        for j in reversed(range(i)):
            td = self.adv_rewards[j] + self.gamma * self.adv_values[j + 1] - self.adv_values[j]
            A = td + self.gamma * self.lmda * A
            self.adv_advantages[j] = A
            R = A + self.adv_values[j]
            self.adv_returns[j] = R

            self.replay_memory.append(self.adv_states[j], self.adv_actions[j], self.adv_rewards[j],
                                      self.adv_terminal[j], [self.adv_returns[j], self.adv_advantages[j]])
Example 3
    def value(self, *args, **kwargs):
        """
        To be called with a single (non-batched) input.
        :param args: List of inputs to the network
        :return: Output of the network in non-batch shape
        """
        # Stack multiple inputs into one feature vector, then add a batch dimension
        x = np.hstack(args) if len(args) > 1 else args[0]
        x = np.expand_dims(x, 0)
        x = to_tensor(x, **kwargs)
        out = self.critic_forward(x)
        self.out = out[0]
        return out[0]
Example 4
    def actions(self, *args, **kwargs):
        """
        To be called with batched inputs.
        :param args: List of inputs to the network
        :return: Output of the network in batch shape
        """
        if len(args) > 1:
            # Concatenate multiple inputs along the feature dimension
            if not tor.is_tensor(args[0]):
                x = np.hstack(args)
                x = to_tensor(x, **kwargs)
            else:
                x = tor.cat(args, 1)
        else:
            x = args[0]
            x = to_tensor(x, **kwargs)
        out = self.forward(x)
        self.out = out
        return out
Example 5
    def forward(self, x):
        if self.activation_functions:
            for i, func in enumerate(self.activation_functions):
                x = func(self.layer_list[i](x))
        else:
            for i, layer in enumerate(self.layer_list[:-1]):
                x = self.relu(layer(x))

        x = self.layer_list[-1](x)

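        # The last output unit is the state value; the remaining units parameterize the action means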
        self.means = self.tanh(x[:, :-1])
        # Sigmas are constant, derived from the log standard deviation sigma_log
        self.sigmas = to_tensor(
            np.full((x.shape[0], x.shape[1] - 1), np.e**self.sigma_log))
        self.dist = Normal(self.means, self.sigmas)
        self.value = x[:, -1]
        self.sampled = self.dist.rsample()
        x = self.sampled
        self.out = x
        return x
Example 6
    def policy_forward(self, x):

        # Policy network

        if self.activation_functions:
            for i, func in enumerate(self.activation_functions):
                x = func(self.layer_list[i](x))
        else:
            for i, layer in enumerate(self.layer_list[:-1]):
                x = self.relu(layer(x))

        x = self.layer_list[-1](x)

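        # Squash the outputs to [-1, 1] to get the action means; sigmas are fixed via sigma_log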
        self.means = self.tanh(x)
        self.sigmas = to_tensor(
            np.full((x.shape[0], x.shape[1]), np.e**self.sigma_log))
        self.dist = Normal(self.means, self.sigmas)

        self.sampled = self.dist.rsample()
        x = self.sampled

        return x
Example 7
    def gather_experience_and_calc_advantages(self, network, q, reward_q, max_episode_len, gamma, lmda, env, step, pid, T, sigma_log):
        np.random.seed(pid)
        tor.manual_seed(pid)
        env.seed(pid)
        state = env.reset()
        episode_length = 0
        acc_reward = 0
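        # Pre-allocate rollout buffers for one horizon of T steps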
        adv_states = np.zeros((T, env.observation_space.shape[0]))
        adv_actions = np.zeros((T, env.action_space.shape[0]))
        adv_rewards = np.zeros(T)
        adv_values = np.zeros(T)
        adv_returns = np.zeros(T)
        adv_advantages = np.zeros(T)
        while True:

            counter = 0
            for i in range(T):
                counter += 1
                adv_states[i] = state
                action, value = network(to_tensor(state).view(1,-1), sigma_log)
                adv_actions[i] = action.data.numpy()
                adv_values[i] = value.data.numpy()

                env_action = action.data.squeeze().numpy()
                state, reward, done, _ = env.step(np.clip(env_action, -1, 1))
                # reward /= 100.
                # print(reward)

                self._async_step(state=state, reward=reward)

                episode_length += 1
                acc_reward += reward
                done = done or episode_length >= max_episode_len

                # step(state, env_action, reward, done)

                #reward = max(min(reward, 1), -1)
                adv_rewards[i] = reward

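                # Bootstrap with the last predicted state value, or zero if the episode ended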
                R = np.zeros((1,1))
                if not done:
                    R = value.data.numpy()
                adv_values[i] = R

                if done:
                    state = env.reset()
                    episode_length = 0
                    reward_q.put(acc_reward)
                    self._async_episode_step(acc_reward=acc_reward)
                    #print("Acc reward in episode: ", acc_reward)
                    acc_reward = 0
                    break


            if done:
                continue

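            # Generalized Advantage Estimation (GAE) over the collected steps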
            A = np.zeros((1, 1))
            for j in reversed(range(counter-1)):
                td = adv_rewards[j] + gamma * adv_values[j + 1] - adv_values[j]
                A = td + gamma * lmda * A
                adv_advantages[j] = A
                R = A + adv_values[j]
                adv_returns[j] = R
                q.put([adv_states[j], adv_actions[j], adv_rewards[j],
                       False, [adv_returns[j], adv_advantages[j]]])
Example 8
    goal_occurances[tuple(env.goal)] = 1
    state_and_goal = np.zeros((1, num_bits * 2))

    for j in range(episode_length):

        # Normal step
        state = env.get_observation()
        goal = env.goal

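        # Track how often each visited state has been seen (stored as a hashable tuple)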
        hgoal = tuple(state)
        goal_occurances[hgoal] = goal_occurances.get(hgoal, 0) + 1
        state_and_goal[0][0:num_bits] = state
        state_and_goal[0][num_bits::] = goal

        x = to_tensor(state_and_goal, 0)

        action_distribution = policy.forward(x)
        action = policy.sample_action()

        episode_steps[j] = (state, action)

        state, reward, done, _ = env.step(action)

        acc_reward += reward

    mvr_tracker.append(acc_reward)

    if i % 200 == 0:
        print(i, ". Moving Average Reward:", np.mean(mvr_tracker),
              "Acc reward:", acc_reward)