Exemple #1
0
    def _restore_one_channel(self, sp, data, start_index):
        """Extract the data of one channel from the flattened data.

        The channel is read from the columns
        `[start_index, start_index + dim)` of `data` and cast to the dtype
        associated with the space type (float32 for Box, int32 otherwise).

        Arguments:
            sp (obj): basic `gym.spaces.space` object.
            data (obj): a numpy array of flattened observations, indexed as `data[:, columns]`.
            start_index (int): indicating the starting index of this channel.
        Returns:
            selected_data (obj): a numpy array of this channel's data.
            dim (int): the dimensionality of this channel's data.
        Raises:
            TypeError: if `sp` is not a Box, Discrete, MultiDiscrete or MultiBinary space.
        """

        if isinstance(sp, gym.spaces.Box):
            dtype = np.float32
            dim = prod(sp.shape)
        elif isinstance(sp, gym.spaces.Discrete):
            dtype = np.int32
            # a Discrete channel occupies `n` columns of the flattened data
            dim = sp.n
        elif isinstance(sp,
                        (gym.spaces.MultiDiscrete, gym.spaces.MultiBinary)):
            dtype = np.int32
            dim = prod(sp.shape)
        else:
            # Fail explicitly instead of hitting an UnboundLocalError below;
            # same error as `_basic_space_to_ph_spec` raises.
            raise TypeError(
                "specified an unsupported space type {}".format(sp))

        end_index = start_index + dim
        selected_data = np.asarray(data[:, start_index:end_index]).astype(
            dtype)

        return selected_data, dim
Exemple #2
0
def ips_eval_old(batch_weights, batch_rewards, gamma):
    """Compute IPS and weighted-IPS scores for a batch of trajectories.

    For every trajectory two discounted returns are computed:

    * trajectory-wise: each step's reward is scaled by the product of all
      the trajectory's weights;
    * step-wise: each step's reward is scaled by the running product of
      the weights up to and including that step.

    The weighted ("wips") variants divide each trajectory's return by the
    batch mean of the full-trajectory weight product.

    Arguments:
        batch_weights (list): one sequence of per-step weights per trajectory (presumably importance ratios -- confirm with caller).
        batch_rewards (list): one sequence of per-step rewards per trajectory.
        gamma (float): discount factor; step `t` is scaled by `gamma**t`.
    Returns:
        a tuple `(ips, ips_stepwise, wips, wips_stepwise)` of batch means.
    """
    assert len(batch_weights) == len(
        batch_rewards), "length of weights must be same with length of rewards"

    traj_weights = []
    final_cum_weights = []
    traj_returns = []
    traj_returns_stepwise = []

    for weights, rewards in zip(batch_weights, batch_rewards):
        traj_weight = prod(weights)
        traj_weights.append(traj_weight)

        # all-ones per-step multiplier of the trajectory-wise estimator
        unit_steps = np.ones_like(weights)

        # running product of the weights, one entry per step
        running = 1.0
        cum_weights = []
        for w in weights:
            running *= w
            cum_weights.append(running)
        final_cum_weights.append(running)

        traj_returns.append(
            np.sum([
                traj_weight * one * (gamma**t) * r
                for t, (one, r) in enumerate(zip(unit_steps, rewards))
            ]))
        traj_returns_stepwise.append(
            np.sum([
                1.0 * cum_w * (gamma**t) * r
                for t, (cum_w, r) in enumerate(zip(cum_weights, rewards))
            ]))

    mean_weight = np.mean(traj_weights)
    mean_final_cum_weight = np.mean(final_cum_weights)

    ips = np.mean(traj_returns)
    ips_stepwise = np.mean(traj_returns_stepwise)
    wips = np.mean([ret / mean_weight for ret in traj_returns])
    wips_stepwise = np.mean(
        [ret / mean_final_cum_weight for ret in traj_returns_stepwise])

    return ips, ips_stepwise, wips, wips_stepwise
Exemple #3
0
    def _basic_space_to_ph_spec(self, sp):
        """Map a basic gym space to a `(dtype, shape)` placeholder spec.

        Arguments:
            sp (obj): basic space object of gym interface.
        Returns:
            a tuple used for building TensorFlow placeholders where the first element specifies `dtype` and the second one specifies `shape` (always with a leading `None` dimension).
        Raises:
            TypeError: if `sp` is not a Box, Discrete, MultiDiscrete or MultiBinary space.
        """

        spaces = gym.spaces

        # (jones.wz) TO DO: handle gym Atari input
        if isinstance(sp, spaces.Box):
            if len(sp.shape) != 3:
                # every non rank-3 box is described by its flattened size
                return (tf.float32, (None, prod(sp.shape)))
            # rank-3 boxes keep their full shape and use uint8
            return (tf.uint8, (None, ) + sp.shape)

        if isinstance(sp, spaces.Discrete):
            return (tf.int32, (None, sp.n))

        if isinstance(sp, (spaces.MultiDiscrete, spaces.MultiBinary)):
            return (tf.int32, (None, prod(sp.shape)))

        raise TypeError("specified an unsupported space type {}".format(sp))
Exemple #4
0
    def _prepare_ph_spec(self):
        """Build the placeholder specs according to the `observation_space` and `action_space`.

        Sets `self._is_single_channel`, `self.ob_ph_spec`,
        `self.flattened_ob_shape` and `self.action_ph_spec`.

        Forbid multi-channel observations where any individual channel is recursively defined as a multi-channel observation.
        `_prepare_ph_spec()` can easily handle recursively defined observations, but they introduce unnecessary complexity to the TensorFlow FIFOqueues used for data exchanging in distributed setting.

        Raises:
            AssertionError: if a channel of a Tuple/Dict observation space is itself a Tuple or Dict.
            TypeError: propagated from `_basic_space_to_ph_spec` for an unsupported channel space.
            ValueError: if `action_space` is neither Discrete nor Box.
        """

        def channel_spec(sp):
            # Validate one channel of a multi-channel space and convert it.
            assert type(sp) not in [
                gym.spaces.Tuple, gym.spaces.Dict
            ], "forbidden type {}".format(self.observation_space)
            return self._basic_space_to_ph_spec(sp)

        def flat_dim(spec):
            # Number of flattened columns of one channel: the product of
            # all non-batch dims. Equals `spec[1][1]` for `(None, d)` specs
            # and also counts rank-3 Box channels in full, consistent with
            # the `prod(sp.shape)` width used by `_restore_one_channel`.
            return prod(spec[1][1:])

        if isinstance(self.observation_space, gym.spaces.Tuple):
            self._is_single_channel = False
            self.ob_ph_spec = [
                channel_spec(sp) for sp in self.observation_space.spaces
            ]
            self.flattened_ob_shape = (np.sum(
                [flat_dim(s) for s in self.ob_ph_spec]), )
        elif isinstance(self.observation_space, gym.spaces.Dict):
            self._is_single_channel = False
            self.ob_ph_spec = OrderedDict()
            for sp_name, sp in self.observation_space.spaces.items():
                self.ob_ph_spec[sp_name] = channel_spec(sp)
            self.flattened_ob_shape = (np.sum(
                [flat_dim(s) for s in self.ob_ph_spec.values()]), )
        else:
            self._is_single_channel = True
            self.ob_ph_spec = self._basic_space_to_ph_spec(
                self.observation_space)
            # drop the leading `None` (batch) dimension
            self.flattened_ob_shape = self.ob_ph_spec[1][1:]

        if isinstance(self.action_space, gym.spaces.Discrete):
            self.action_ph_spec = (self.action_space.n, "Categorical")
        elif isinstance(self.action_space, gym.spaces.Box):
            self.action_ph_spec = (prod(self.action_space.shape),
                                   "DiagGaussian")
        else:
            raise ValueError("specified an unsupported action space {}".format(
                self.action_space))
Exemple #5
0
def ips_eval(batch_weights, batch_rewards, gamma, max_len=200):
    """Compute IPS and weighted-IPS scores for a batch of trajectories.

    For every trajectory two discounted returns are computed:

    * trajectory-wise: each step's reward is scaled by the product of all
      the trajectory's weights;
    * step-wise: each step's reward is scaled by the running product of
      the weights up to and including that step.

    The weighted trajectory-wise score divides each return by the batch
    mean of the trajectory weights. The weighted step-wise scores divide
    each step's score by the mean running product at that step, averaged
    over the trajectories that reach the step.

    Arguments:
        batch_weights (list): one sequence of per-step weights per trajectory (presumably importance ratios -- confirm with caller).
        batch_rewards (list): one sequence of per-step rewards per trajectory.
        gamma (float): discount factor; step `t` is scaled by `gamma**t`.
        max_len (int): minimum number of step columns allocated for the per-step weight table; the table is widened automatically when a trajectory is longer, so longer trajectories no longer fail.
    Returns:
        a tuple `(ips, ips_stepwise, wips, wips_stepwise, wips_stepwise_mean)` of batch means, where `wips_stepwise_mean` averages (rather than sums) the normalized step scores within each trajectory.
    """
    assert len(batch_weights) == len(
        batch_rewards), "length of weights must be same with length of rewards"

    batch_size = len(batch_weights)
    longest = max((len(weights) for weights in batch_weights), default=0)
    n_steps = max(max_len, longest)
    # NaN marks steps a trajectory never reached so `nanmean` skips them.
    cum_weights_arr = np.full((batch_size, n_steps), np.nan)

    traj_weights = []
    batch_ips_scores = []
    batch_ips_scores_stepwise = []

    for i_traj, (weights,
                 rewards) in enumerate(zip(batch_weights, batch_rewards)):
        traj_weight = prod(weights)
        traj_weights.append(traj_weight)

        # running product of the weights, one entry per step
        running = 1.0
        cum_weights = []
        for i_step, w in enumerate(weights):
            running *= w
            cum_weights_arr[i_traj, i_step] = running
            cum_weights.append(running)

        # zip with `weights` so surplus rewards are ignored, as before
        batch_ips_scores.append([
            traj_weight * (gamma**idx) * r
            for idx, (_, r) in enumerate(zip(weights, rewards))
        ])
        batch_ips_scores_stepwise.append([
            cum_w * (gamma**idx) * r
            for idx, (cum_w, r) in enumerate(zip(cum_weights, rewards))
        ])

    avg_weights = np.mean(traj_weights)

    # per-step mean of the running products; drop never-reached steps
    step_mean_weights = np.nanmean(cum_weights_arr, 0)
    step_mean_weights = step_mean_weights[~np.isnan(step_mean_weights)]

    output_per_traj = [np.sum(scores) for scores in batch_ips_scores]
    output_per_traj_stepwise = [
        np.sum(scores) for scores in batch_ips_scores_stepwise
    ]
    output_norm_per_traj = [ret / avg_weights for ret in output_per_traj]

    output_norm_per_traj_stepwise = []
    output_norm_per_traj_stepwise_mean = []
    for scores in batch_ips_scores_stepwise:
        normalized = [
            score / step_weight
            for score, step_weight in zip(scores, step_mean_weights)
        ]
        total = np.sum(normalized)
        output_norm_per_traj_stepwise.append(total)
        output_norm_per_traj_stepwise_mean.append(total / len(normalized))

    ips = np.mean(output_per_traj)
    ips_stepwise = np.mean(output_per_traj_stepwise)
    wips = np.mean(output_norm_per_traj)
    wips_stepwise = np.mean(output_norm_per_traj_stepwise)
    wips_stepwise_mean = np.mean(output_norm_per_traj_stepwise_mean)

    return ips, ips_stepwise, wips, wips_stepwise, wips_stepwise_mean