def _restore_one_channel(self, sp, data, start_index):
    """Extract the data of one channel from the flattened data.

    Arguments:
        sp (obj): basic `gym.spaces.space` object.
        data (obj): a numpy array of flattened observations.
        start_index (int): the starting index of this channel in the
            flattened data.

    Returns:
        selected_data (obj): a numpy array of this channel's data.
        dim (int): the dimensionality of this channel's data.
    """
    if isinstance(sp, gym.spaces.Box):
        dtype = np.float32
        dim = prod(sp.shape)
    elif isinstance(sp, gym.spaces.Discrete):
        dtype = np.int32
        dim = sp.n
    elif isinstance(sp, gym.spaces.MultiDiscrete):
        dtype = np.int32
        dim = prod(sp.shape)
    elif isinstance(sp, gym.spaces.MultiBinary):
        dtype = np.int32
        dim = prod(sp.shape)
    else:
        raise TypeError(
            "specified an unsupported space type {}".format(sp))
    selected_data = np.asarray(
        data[:, start_index:start_index + dim]).astype(dtype)
    return selected_data, dim
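# A minimal, hedged sketch (not part of the original code) of how the slicing above
# can be used to split a flattened multi-channel observation batch.  The free
# function below only mirrors the dtype/width rules of `_restore_one_channel`; the
# layout assumption (channels concatenated along axis 1, in Tuple order) is the same
# one the method relies on.
import gym
import numpy as np


def _split_flat_obs(spaces, flat_obs):
    """Split a (batch, total_dim) array into one array per basic sub-space."""
    out, start = [], 0
    for sp in spaces:
        if isinstance(sp, gym.spaces.Discrete):
            dim, dtype = sp.n, np.int32                       # one-hot width
        elif isinstance(sp, gym.spaces.Box):
            dim, dtype = int(np.prod(sp.shape)), np.float32   # flattened Box
        else:
            dim, dtype = int(np.prod(sp.shape)), np.int32     # MultiDiscrete / MultiBinary
        out.append(flat_obs[:, start:start + dim].astype(dtype))
        start += dim
    return out


# Example: a Box(4,) channel followed by a Discrete(3) one-hot channel, 7 columns total.
_spaces = [gym.spaces.Box(low=-1.0, high=1.0, shape=(4,)), gym.spaces.Discrete(3)]
_box_part, _onehot_part = _split_flat_obs(_spaces, np.random.rand(2, 7))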
def ips_eval_old(batch_weights, batch_rewards, gamma):
    assert len(batch_weights) == len(
        batch_rewards), "length of weights must match length of rewards"
    all_weights = []
    all_weights_stepwise = []
    batch_ips_scores = []
    batch_ips_scores_stepwise = []
    for weights, rewards in zip(batch_weights, batch_rewards):
        # Trajectory-wise weight: product of all per-step importance ratios.
        w_init = prod(weights)
        all_weights.append(w_init)
        # Stepwise weights: cumulative product of the ratios up to each step.
        w_init_stepwise = 1.0
        w_steps = np.ones_like(weights)
        w_cum_tmp = 1.0
        w_steps_stepwise = []
        for w in weights:
            w_cum_tmp *= w
            w_steps_stepwise.append(w_cum_tmp)
        all_weights_stepwise.append(w_cum_tmp)
        ips_score = [
            w_init * w_step * (gamma**idx) * r
            for idx, (w_step, r) in enumerate(zip(w_steps, rewards))
        ]
        ips_score_stepwise = [
            w_init_stepwise * w_step * (gamma**idx) * r
            for idx, (w_step, r) in enumerate(zip(w_steps_stepwise, rewards))
        ]
        batch_ips_scores.append(ips_score)
        batch_ips_scores_stepwise.append(ips_score_stepwise)
    avg_weights = np.mean(all_weights)
    avg_weights_stepwise = np.mean(all_weights_stepwise)
    output_per_traj = [np.sum(ips_score) for ips_score in batch_ips_scores]
    output_per_traj_stepwise = [
        np.sum(ips_score) for ips_score in batch_ips_scores_stepwise
    ]
    output_norm_per_traj = [
        np.sum(ips_score) / avg_weights for ips_score in batch_ips_scores
    ]
    output_norm_per_traj_stepwise = [
        np.sum(ips_score) / avg_weights_stepwise
        for ips_score in batch_ips_scores_stepwise
    ]
    ips = np.mean(output_per_traj)
    ips_stepwise = np.mean(output_per_traj_stepwise)
    wips = np.mean(output_norm_per_traj)
    wips_stepwise = np.mean(output_norm_per_traj_stepwise)
    return ips, ips_stepwise, wips, wips_stepwise
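# A small, hedged numerical illustration (not from the original code) of the two
# weighting schemes used in `ips_eval_old`: the trajectory-wise estimator multiplies
# every reward by the full product of the per-step importance ratios, while the
# stepwise estimator uses the cumulative product up to each step only.  Numbers are
# made up purely for illustration.
import numpy as np

_weights = [0.5, 2.0, 1.0]        # per-step importance ratios
_rewards = [1.0, 1.0, 1.0]
_gamma = 0.9

_w_full = np.prod(_weights)       # 1.0, applied at every step (trajectory-wise)
_w_cum = np.cumprod(_weights)     # [0.5, 1.0, 1.0], applied per step (stepwise)

_traj_wise = sum(_w_full * (_gamma**t) * r for t, r in enumerate(_rewards))    # 2.71
_step_wise = sum(_w_cum[t] * (_gamma**t) * r for t, r in enumerate(_rewards))  # 2.21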
def _basic_space_to_ph_spec(self, sp):
    """Translate a gym space object to a tuple specifying data type and shape.

    Arguments:
        sp (obj): basic space object of the gym interface.

    Returns:
        A tuple used for building TensorFlow placeholders, where the first
        element specifies `dtype` and the second one specifies `shape`.
    """
    # (jones.wz) TO DO: handle gym Atari input
    if isinstance(sp, gym.spaces.Box):
        if len(sp.shape) == 3:
            return (tf.uint8, (None, ) + sp.shape)
        return (tf.float32, (None, prod(sp.shape)))
    elif isinstance(sp, gym.spaces.Discrete):
        return (tf.int32, (None, sp.n))
    elif isinstance(sp, gym.spaces.MultiDiscrete):
        return (tf.int32, (None, prod(sp.shape)))
    elif isinstance(sp, gym.spaces.MultiBinary):
        return (tf.int32, (None, prod(sp.shape)))
    else:
        raise TypeError(
            "specified an unsupported space type {}".format(sp))
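# Hedged sanity check (not part of the original code) of the shape rule above:
# non-image Boxes, MultiDiscrete and MultiBinary spaces are flattened to
# prod(shape) columns, while Discrete spaces get n (one-hot) columns and 3-D
# Boxes keep their image shape.  Only the widths are recomputed here, not the
# TensorFlow dtypes.
import gym
import numpy as np

assert int(np.prod(gym.spaces.Box(low=-1, high=1, shape=(3, 2)).shape)) == 6
assert gym.spaces.Discrete(5).n == 5
assert int(np.prod(gym.spaces.MultiBinary(4).shape)) == 4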
def _prepare_ph_spec(self):
    """Build the TensorFlow placeholders according to the `observation_space`.

    Multi-channel observations in which any individual channel is itself
    recursively defined as a multi-channel observation are forbidden.
    `_prepare_ph_spec()` could easily handle recursively defined
    observations, but they introduce unnecessary complexity to the
    TensorFlow FIFO queues used for data exchange in the distributed
    setting.
    """
    if isinstance(self.observation_space, gym.spaces.Tuple):
        self._is_single_channel = False
        self.ob_ph_spec = list()
        for sp in self.observation_space.spaces:
            assert type(sp) not in [
                gym.spaces.Tuple, gym.spaces.Dict
            ], "forbidden type {}".format(self.observation_space)
            self.ob_ph_spec.append(self._basic_space_to_ph_spec(sp))
        self.flattened_ob_shape = (np.sum(
            [s[1][1] for s in self.ob_ph_spec]), )
    elif isinstance(self.observation_space, gym.spaces.Dict):
        self._is_single_channel = False
        self.ob_ph_spec = OrderedDict()
        for sp_name, sp in self.observation_space.spaces.items():
            assert type(sp) not in [
                gym.spaces.Tuple, gym.spaces.Dict
            ], "forbidden type {}".format(self.observation_space)
            self.ob_ph_spec[sp_name] = self._basic_space_to_ph_spec(sp)
        self.flattened_ob_shape = (np.sum(
            [s[1][1] for s in self.ob_ph_spec.values()]), )
    else:
        self._is_single_channel = True
        self.ob_ph_spec = self._basic_space_to_ph_spec(
            self.observation_space)
        self.flattened_ob_shape = self.ob_ph_spec[1][1:]

    if isinstance(self.action_space, gym.spaces.Discrete):
        self.action_ph_spec = (self.action_space.n, "Categorical")
    elif isinstance(self.action_space, gym.spaces.Box):
        self.action_ph_spec = (prod(self.action_space.shape),
                               "DiagGaussian")
    else:
        raise ValueError(
            "specified an unsupported action space {}".format(
                self.action_space))
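# Hedged illustration (not from the original code) of the flattened-shape
# bookkeeping above: for a Tuple observation space the per-channel placeholder
# widths are summed, e.g. Tuple(Box(shape=(4,)), Discrete(3)) flattens to 7
# columns, while a single-channel Box(shape=(4,)) keeps (None, 4).
import gym
import numpy as np

_ob_space = gym.spaces.Tuple((gym.spaces.Box(low=-1, high=1, shape=(4,)),
                              gym.spaces.Discrete(3)))
_widths = [int(np.prod(sp.shape)) if isinstance(sp, gym.spaces.Box) else sp.n
           for sp in _ob_space.spaces]
assert (int(np.sum(_widths)), ) == (7, )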
def ips_eval(batch_weights, batch_rewards, gamma, max_len=200):
    assert len(batch_weights) == len(
        batch_rewards), "length of weights must match length of rewards"
    batch_size = len(batch_weights)
    MAX_LEN = max_len
    # Per-step cumulative weights for every trajectory, padded with NaN so that
    # trajectories shorter than MAX_LEN can be averaged column-wise later.
    all_weights_stepwise_arr = np.empty((batch_size, MAX_LEN))
    all_weights_stepwise_arr[:] = np.nan
    all_weights = []
    batch_ips_scores = []
    batch_ips_scores_stepwise = []
    for i_traj, (weights,
                 rewards) in enumerate(zip(batch_weights, batch_rewards)):
        # Trajectory-wise weight: product of all per-step importance ratios.
        w_init = prod(weights)
        all_weights.append(w_init)
        w_init_stepwise = 1.0
        w_steps = np.ones_like(weights)
        w_cum_tmp = 1.0
        w_steps_stepwise = []
        for i_step, w in enumerate(weights):
            w_cum_tmp *= w
            all_weights_stepwise_arr[i_traj, i_step] = w_cum_tmp
            w_steps_stepwise.append(w_cum_tmp)
        ips_score = [
            w_init * w_step * (gamma**idx) * r
            for idx, (w_step, r) in enumerate(zip(w_steps, rewards))
        ]
        ips_score_stepwise = [
            w_init_stepwise * w_step * (gamma**idx) * r
            for idx, (w_step, r) in enumerate(zip(w_steps_stepwise, rewards))
        ]
        batch_ips_scores.append(ips_score)
        batch_ips_scores_stepwise.append(ips_score_stepwise)
    avg_weights = np.mean(all_weights)
    # Column-wise mean of the cumulative weights, dropping steps that no
    # trajectory in the batch reached.
    avg_weights_stepwise_mean = np.nanmean(all_weights_stepwise_arr, 0)
    avg_weights_stepwise_mean = avg_weights_stepwise_mean[
        ~np.isnan(avg_weights_stepwise_mean)]
    output_per_traj = [np.sum(ips_score) for ips_score in batch_ips_scores]
    output_per_traj_stepwise = [
        np.sum(ips_score) for ips_score in batch_ips_scores_stepwise
    ]
    output_norm_per_traj = [
        np.sum(ips_score) / avg_weights for ips_score in batch_ips_scores
    ]
    output_norm_per_traj_stepwise = []
    output_norm_per_traj_stepwise_mean = []
    for ips_score in batch_ips_scores_stepwise:
        # Normalize each step's score by the batch-average cumulative weight at
        # that step.
        batch_ips = []
        for score, step_weight in zip(ips_score, avg_weights_stepwise_mean):
            batch_ips.append(score / step_weight)
        output_norm_per_traj_stepwise.append(np.sum(batch_ips))
        output_norm_per_traj_stepwise_mean.append(
            np.sum(batch_ips) / len(batch_ips))
    ips = np.mean(output_per_traj)
    ips_stepwise = np.mean(output_per_traj_stepwise)
    wips = np.mean(output_norm_per_traj)
    wips_stepwise = np.mean(output_norm_per_traj_stepwise)
    wips_stepwise_mean = np.mean(output_norm_per_traj_stepwise_mean)
    return ips, ips_stepwise, wips, wips_stepwise, wips_stepwise_mean
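# A hedged usage sketch of `ips_eval` (toy data, not from the original code): two
# trajectories of different lengths with made-up importance ratios and rewards.
# `max_len` only needs to be at least the length of the longest trajectory.
import numpy as np

_batch_weights = [np.array([1.2, 0.8, 1.0]), np.array([0.9, 1.1])]
_batch_rewards = [np.array([1.0, 0.0, 1.0]), np.array([0.0, 1.0])]
_ips, _ips_stepwise, _wips, _wips_stepwise, _wips_stepwise_mean = ips_eval(
    _batch_weights, _batch_rewards, gamma=0.99, max_len=10)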