Example #1
    def update_policy(self, dataset, epoch=1):
        """ Update the policy network.

        Args:
            dataset: a tuple of (actions, advantage, observation, rewards,
                hidden, mask) numpy arrays collected from sampled trajectories.
            epoch: number of update passes over the dataset.

        Returns:
            None
        """
        actions, advantage, observation, rewards, hidden, mask = dataset

        observation = convert_numpy_to_tensor(observation)
        actions = convert_numpy_to_tensor(actions)
        advantage = convert_numpy_to_tensor(advantage)
        rewards = convert_numpy_to_tensor(rewards)

        for _ in range(epoch):
            # update policy network
            self.policy_optimizer.zero_grad()
            # compute log prob, assume observation is small.
            if not self.recurrent:
                distribution, _, raw_baselines = self.policy_net.forward(
                    observation, None)
                log_prob = distribution.log_prob(actions)
            else:
                log_prob = []
                raw_baselines = []
                zero_index = np.where(mask == 0)[0] + 1
                zero_index = zero_index.tolist()
                zero_index.insert(0, 0)
                for i in range(len(zero_index) - 1):
                    start_index = zero_index[i]
                    end_index = zero_index[i + 1]
                    current_obs = observation[start_index:end_index]
                    current_actions = actions[start_index:end_index]
                    current_hidden = convert_numpy_to_tensor(
                        np.expand_dims(self.init_hidden_unit, axis=0))
                    current_dist, _, current_baseline = self.policy_net.forward(
                        current_obs, current_hidden)
                    log_prob.append(current_dist.log_prob(current_actions))
                    raw_baselines.append(current_baseline)

                log_prob = torch.cat(log_prob, dim=0)
                raw_baselines = torch.cat(raw_baselines, dim=0)

            assert log_prob.shape == advantage.shape, 'log_prob shape {}, advantage shape {}'.format(
                log_prob.shape, advantage.shape)

            action_loss = torch.mean(-log_prob * advantage)
            loss = action_loss

            if self.nn_baseline:
                value_loss = self.get_baseline_loss(raw_baselines, rewards)
                loss = loss + value_loss * self.value_coef

            loss.backward()

            # clip gradients after backward() so the freshly computed gradients are clipped
            nn.utils.clip_grad_norm_(self.policy_net.parameters(),
                                     self.max_grad_norm)

            self.policy_optimizer.step()
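Every example on this page goes through a convert_numpy_to_tensor helper. A minimal sketch of what such a helper typically looks like, assuming it simply wraps torch.from_numpy and moves the result to a globally configured device (the real helper in this code base may differ):

import numpy as np
import torch

# assumed global device; the surrounding code uses a FloatTensor alias for the same purpose
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


def convert_numpy_to_tensor(array):
    # wrap the numpy data in a torch tensor and move it to the configured device
    return torch.from_numpy(np.asarray(array)).to(device)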
Example #2
    def predict_next_state(self, state, action):
        states = np.expand_dims(state, axis=0)
        actions = np.expand_dims(action, axis=0)
        states = convert_numpy_to_tensor(states)
        actions = convert_numpy_to_tensor(actions)
        with torch.no_grad():
            next_state = self.predict_next_states(states,
                                                  actions).cpu().numpy()[0]
        return next_state
Example #3
    def predict(self, history_state, history_actions, current_state):
        """

        Args:
            history_state: (T - 1, 6)
            history_actions: (T - 1, 4)
            current_state: (6,)

        Returns: best action (4,)

        """
        states = np.expand_dims(history_state, axis=0)  # (1, T - 1, 6)
        states = np.tile(
            states, (self.num_random_action_selection, 1, 1))  # (N, T - 1, 6)
        states = convert_numpy_to_tensor(states)

        next_states = np.expand_dims(current_state, axis=0)  # (1, 6)
        next_states = np.tile(next_states,
                              (self.num_random_action_selection, 1))  # (N, 6)
        next_states = convert_numpy_to_tensor(next_states)

        actions = self.action_sampler.sample(
            (self.horizon, self.num_random_action_selection))  # (H, N, 4)
        actions = convert_numpy_to_tensor(actions)

        history_actions = np.expand_dims(history_actions,
                                         axis=0)  # (1, T - 1, 4)
        current_action = np.tile(
            history_actions,
            (self.num_random_action_selection, 1, 1))  # (N, T - 1, 4)
        current_action = convert_numpy_to_tensor(current_action)

        with torch.no_grad():
            cost = torch.zeros(
                size=(self.num_random_action_selection, )).type(FloatTensor)
            for i in range(self.horizon):
                states = torch.cat(
                    (states, torch.unsqueeze(next_states, dim=1)),
                    dim=1)  # (N, T, 6)
                current_action = torch.cat(
                    (current_action, torch.unsqueeze(actions[i], dim=1)),
                    dim=1)  # (N, T, 4)
                next_states = self.model.predict_next_states(
                    states, current_action)  # (N, 6)
                cost += self.cost_fn(states[:, -1, :], actions[i],
                                     next_states) * self.gamma_inverse
                current_action = current_action[:, 1:, :]  # (N, T - 1, 4)
                states = states[:, 1:, :]

            best_action = actions[0, torch.argmin(cost, dim=0)]
            best_action = best_action.cpu().numpy()
            return best_action
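The last two lines of predict implement the usual model-predictive-control convention: accumulate cost over the horizon, then execute only the first action of the lowest-cost candidate sequence. A self-contained toy illustration of that final indexing (made-up numbers, not taken from the example):

import torch

# 3 candidate action sequences (N = 3), horizon H = 2, action dimension 4
actions = torch.arange(2 * 3 * 4, dtype=torch.float32).reshape(2, 3, 4)  # (H, N, 4)
cost = torch.tensor([5.0, 1.0, 7.0])  # accumulated cost per candidate sequence
best_action = actions[0, torch.argmin(cost, dim=0)]  # first action of candidate 1
print(best_action)  # tensor([4., 5., 6., 7.])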
Example #4
    def predict(self, history_state, history_action, state):
        state = np.expand_dims(state, axis=0)
        history_state = np.expand_dims(history_state, axis=0)
        history_action = np.expand_dims(history_action, axis=0)
        with torch.no_grad():
            state = convert_numpy_to_tensor(state)
            history_state = convert_numpy_to_tensor(history_state)
            history_action = convert_numpy_to_tensor(history_action)
            state = (state - self.state_mean.squeeze(dim=1)
                     ) / self.state_std.squeeze(dim=1)
            history_state = (history_state - self.state_mean) / self.state_std
            action = self.model.forward(history_state, history_action, state)
        return action.cpu().numpy()[0]
Example #5
    def predict(self, state):
        state = np.expand_dims(state, axis=0)
        with torch.no_grad():
            state = convert_numpy_to_tensor(state)
            state = (state - self.state_mean) / self.state_std
            action = self.model.forward(state)
        return action.cpu().numpy()[0]
Example #6
    def __init__(self, env, temperature_center):
        super(EnergyPlusObsWrapper, self).__init__(env=env)
        self.obs_mean = np.array([
            temperature_center, temperature_center, temperature_center, 1e5,
            5000.
        ],
                                 dtype=np.float32)
        self.obs_max = np.array([30., 30., 30., 1e5, 1e4], dtype=np.float32)
        self.obs_mean_tensor = convert_numpy_to_tensor(
            self.obs_mean).unsqueeze(dim=0)
        self.obs_max_tensor = convert_numpy_to_tensor(
            self.obs_max).unsqueeze(dim=0)

        self.observation_space = spaces.Box(
            low=np.array([-1., -1., -1., -10., -10.]),
            high=np.array([1., 1., 1., 10.0, 10.0]),
            dtype=np.float32)
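The __init__ above only stores the normalization statistics; in a gym ObservationWrapper the scaling itself lives in an observation method, which is not shown here. A sketch of the normalization these statistics suggest (an assumption, not the wrapper's confirmed code):

    def observation(self, observation):
        # center by obs_mean and scale by obs_max so the result roughly fits the Box defined above
        observation = np.asarray(observation, dtype=np.float32)
        return (observation - self.obs_mean) / self.obs_max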
Example #7
    def predict(self, state):
        states = np.expand_dims(state, axis=0)
        actions = self.action_sampler.sample(
            (self.horizon, self.num_random_action_selection))
        states = np.tile(states, (self.num_random_action_selection, 1))
        states = convert_numpy_to_tensor(states)
        actions = convert_numpy_to_tensor(actions)

        with torch.no_grad():
            cost = torch.zeros(
                size=(self.num_random_action_selection, )).type(FloatTensor)
            for i in range(self.horizon):
                next_states = self.model.predict_next_states(
                    states, actions[i])
                cost += self.cost_fn(states, actions[i],
                                     next_states) * self.gamma_inverse
                states = next_states

            best_action = actions[0, torch.argmin(cost, dim=0)]
            best_action = best_action.cpu().numpy()
            return best_action
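Both random-shooting planners treat cost_fn as a black box that maps batches of (states, actions, next_states) to one scalar cost per candidate, so it can be accumulated into the (N,) cost tensor. A hypothetical cost function with that shape contract, purely for illustration (the examples' real cost_fn is environment-specific):

import torch


def example_cost_fn(states, actions, next_states):
    # distance of the next state from the origin plus a small action penalty,
    # returned as one cost per candidate in the batch
    return torch.sum(next_states ** 2, dim=-1) + 0.1 * torch.sum(actions.float() ** 2, dim=-1)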
Example #8
    def set_statistics(self, dataset):
        self.state_mean = convert_numpy_to_tensor(
            dataset.state_mean).unsqueeze(dim=0)
        self.state_std = convert_numpy_to_tensor(
            dataset.state_std).unsqueeze(dim=0)
        if self.dynamics_model.discrete:
            self.action_mean = None
            self.action_std = None
        else:
            self.action_mean = convert_numpy_to_tensor(
                dataset.action_mean).unsqueeze(dim=0)
            self.action_std = convert_numpy_to_tensor(
                dataset.action_std).unsqueeze(dim=0)
        self.delta_state_mean = convert_numpy_to_tensor(
            dataset.delta_state_mean).unsqueeze(dim=0)
        self.delta_state_std = convert_numpy_to_tensor(
            dataset.delta_state_std).unsqueeze(dim=0)
Example #9
    def predict(self, state):
        """ The model must be in evaluation mode and turn off gradient update

        Args:
            state: (ob_dim)

        Returns: optimal action (ac_dim)

        """
        action_module = TanhActionModule(
            init_action=self.action_sampler.sample((self.horizon, )))
        optimizer = torch.optim.Adam(action_module.parameters(), lr=1e-3)
        # t = tqdm(range(self.num_iterations), desc='Planning')
        t = range(self.num_iterations)
        for iteration in t:
            optimizer.zero_grad()
            cost = []
            current_state = convert_numpy_to_tensor(
                np.expand_dims(state, axis=0))
            for h in range(self.horizon):
                current_action = action_module.forward(h)
                next_states = self.model.predict_next_states(
                    current_state, current_action)
                cost.append(
                    self.cost_fn(current_state, current_action, next_states) *
                    self.gamma_inverse)
                current_state = next_states
            cost = torch.mean(torch.cat(cost))
            cost.backward()

            nn.utils.clip_grad_norm_(action_module.parameters(), max_norm=1.0)

            optimizer.step()

            # t.set_description('Iter {}/{}, Cost {:.4f}'.format(iteration + 1, self.num_iterations, cost.item()))

        return action_module.forward(0)[0].cpu().detach().numpy()
Example #10
    def __init__(self, init_action):
        super(TanhActionModule, self).__init__()
        init_action = convert_numpy_to_tensor(init_action)
        self.action = nn.Parameter(data=init_action, requires_grad=True)
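Example #9 calls action_module.forward(h) and then indexes the result with [0], which suggests a forward method along these lines (inferred from that usage and labelled as an assumption, not the module's confirmed implementation):

    def forward(self, h):
        # return the h-th planned action squashed into (-1, 1), keeping a leading batch dimension
        return torch.tanh(self.action[h:h + 1])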
Example #11
    def set_state_stats(self, state_mean, state_std):
        self.state_mean = convert_numpy_to_tensor(state_mean).unsqueeze(dim=0)
        self.state_std = convert_numpy_to_tensor(state_std).unsqueeze(dim=0)
Example #12
def compute_reward_to_go_gae(paths, gamma, policy_net, lam, value_mean,
                             value_std):
    rewards = []
    gaes = []
    for path in paths:
        # compute last state value
        if path['mask'][-1] == 1:
            with torch.no_grad():
                last_obs = convert_numpy_to_tensor(
                    np.expand_dims(path['last_obs'], axis=0)).type(FloatTensor)
                last_hidden = convert_numpy_to_tensor(
                    np.expand_dims(path['last_hidden'],
                                   axis=0)).type(FloatTensor)
                last_state_value = policy_net.forward(
                    last_obs, last_hidden)[-1].cpu().numpy()[0]
                last_state_value = last_state_value * value_std + value_mean
        else:
            last_state_value = 0.

        # clip last_state_value to [-max_abs_value / (1 - gamma), max_abs_value / (1 - gamma)].
        # Otherwise, a large bootstrap value can create a positive feedback loop and make the rewards explode.
        max_abs_value = np.max(np.abs(path['reward']))
        last_state_value = np.clip(last_state_value,
                                   a_min=-max_abs_value / (1 - gamma),
                                   a_max=max_abs_value / (1 - gamma))

        # calculate reward-to-go
        path['reward'].append(last_state_value)
        current_rewards = discount(path['reward'], gamma).astype(np.float32)

        rewards.append(current_rewards[:-1])

        # compute gae
        with torch.no_grad():
            observation = path['observation']
            hidden = path['hidden']
            data_loader = create_data_loader((observation, hidden),
                                             batch_size=32,
                                             shuffle=False,
                                             drop_last=False)
            values = []
            for obs, hid in data_loader:
                obs = move_tensor_to_gpu(obs)
                hid = move_tensor_to_gpu(hid)
                values.append(policy_net.forward(obs, hid)[-1])
            values = torch.cat(values, dim=0).cpu().numpy()
            values = values * value_std + value_mean
            values = np.append(values, last_state_value)

        # compute temporal differences; `values` already includes the bootstrap value of the
        # last observation for truncated trajectories
        temporal_difference = path[
            'reward'][:-1] + values[1:] * gamma - values[:-1]
        # compute GAE as the discounted sum of temporal differences
        gae = discount(temporal_difference, gamma * lam).astype(np.float32)
        gaes.append(gae)

    rewards = np.concatenate(rewards)
    new_values_mean, new_values_std = np.mean(rewards), np.std(rewards)
    rewards = (rewards - new_values_mean) / (new_values_std + eps)

    gaes = np.concatenate(gaes)
    gaes = (gaes - np.mean(gaes)) / (np.std(gaes) + eps)

    return rewards, gaes, new_values_mean, new_values_std
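Both the reward-to-go and the GAE computation rely on a discount helper. A standard implementation of such a discounted cumulative sum, assuming the usual convention discount(x, gamma)[t] = sum_k gamma**k * x[t + k] (the project's own helper may differ):

import numpy as np


def discount(x, gamma):
    # discounted cumulative sum, computed backwards over the sequence
    x = np.asarray(x, dtype=np.float64)
    out = np.zeros_like(x)
    running = 0.0
    for t in reversed(range(len(x))):
        running = x[t] + gamma * running
        out[t] = running
    return out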