def optimize_policy(self, itr, samples_data):
    """Optimize network using experiences from replay buffer.

    Args:
        itr (int): Iterations.
        samples_data (list): Processed batch data.

    Returns:
        numpy.float64: Loss of policy.

    """
    del itr
    del samples_data
    transitions = self.replay_buffer.sample(self.buffer_batch_size)
    observations = transitions['observation']
    rewards = transitions['reward']
    actions = transitions['action']
    next_observations = transitions['next_observation']
    dones = transitions['terminal']

    # normalize pixel to range [0, 1] since the samples stored in the
    # replay buffer are of type uint8 and not normalized, for memory
    # optimization
    observations = normalize_pixel_batch(self.env_spec, observations)
    next_observations = normalize_pixel_batch(self.env_spec,
                                              next_observations)

    loss, _ = self._train_qf(observations, actions, rewards, dones,
                             next_observations)

    return loss
def optimize_policy(self, itr, observations, rewards, actions,
                    next_observations, dones, jole_obs, jole_actions):
    """Optimize network using experiences from replay buffer."""
    # normalize pixel to range [0, 1] since the samples stored in the
    # replay buffer are of type uint8 and not normalized, for memory
    # optimization
    observations = normalize_pixel_batch(self.env_spec, observations)
    next_observations = normalize_pixel_batch(self.env_spec,
                                              next_observations)

    loss, _, qval, y = self._train_qf(observations, actions, rewards,
                                      dones, next_observations, jole_obs,
                                      jole_actions, self.use_jole_qf,
                                      self.jole_clip_return_max,
                                      self.jole_clip_return_min)

    return loss, qval, y
def optimize_policy(self, itr, sample_data):
    """Optimize network using experiences from replay buffer."""
    transitions = self.replay_buffer.sample(self.buffer_batch_size)
    observations = transitions['observation']
    rewards = transitions['reward']
    actions = transitions['action']
    next_observations = transitions['next_observation']
    dones = transitions['terminal']

    # normalize pixel to range [0, 1] since the samples stored in the
    # replay buffer are of type uint8 and not normalized, for memory
    # optimization
    observations = normalize_pixel_batch(self.env_spec, observations)
    next_observations = normalize_pixel_batch(self.env_spec,
                                              next_observations)

    loss, _ = self._train_qf(observations, actions, rewards, dones,
                             next_observations)

    return loss
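# For reference, a minimal sketch of what the two normalize_pixel_batch
# call signatures used in this file are assumed to do: the one-argument
# form rescales uint8 pixel observations to [0, 1], and the env_spec-aware
# form only does so when the observation space is an image space. This is
# an assumption inferred from the call sites and tests below, not the
# library's exact implementation; the "_sketch" names are hypothetical.
import akro
import numpy as np


def normalize_pixel_batch_sketch(observations):
    """Rescale a batch of uint8 pixel observations to the range [0, 1]."""
    return [np.asarray(obs).astype(np.float32) / 255.0
            for obs in observations]


def normalize_pixel_batch_spec_sketch(env_spec, observations):
    """Rescale only when the observation space is an akro.Image space."""
    if isinstance(env_spec.observation_space, akro.Image):
        return normalize_pixel_batch_sketch(observations)
    return observations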
def fit(self, paths):
    """Fit regressor based on paths.

    Args:
        paths (list[dict]): Sample paths.

    """
    observations = np.concatenate([p['observations'] for p in paths])
    if isinstance(self.env_spec.observation_space, akro.Image):
        observations = normalize_pixel_batch(observations)
    returns = np.concatenate([p['returns'] for p in paths])
    self._regressor.fit(observations, returns.reshape((-1, 1)))
def predict(self, path):
    """Predict value based on a path.

    Args:
        path (dict[numpy.ndarray]): A sample path.

    Returns:
        numpy.ndarray: Predicted value.

    """
    observations = path['observations']
    if isinstance(self.env_spec.observation_space, akro.Image):
        observations = normalize_pixel_batch(observations)
    return self._regressor.predict(observations).flatten()
def predict(self, paths):
    """Predict ys based on input xs.

    Args:
        paths (dict[numpy.ndarray]): Sample paths.

    Returns:
        numpy.ndarray: The predicted ys.

    """
    xs = paths['observations']
    if isinstance(self.env_spec.observation_space, akro.Image):
        xs = normalize_pixel_batch(xs)
    return self._f_predict(xs).flatten()
def fit(self, paths):
    """Fit regressor based on paths.

    Args:
        paths (list[dict]): Sample paths.

    """
    xs = np.concatenate([p['observations'] for p in paths])
    if isinstance(self.env_spec.observation_space, akro.Image):
        xs = normalize_pixel_batch(xs)
    ys = np.concatenate([p['returns'] for p in paths])
    ys = ys.reshape((-1, 1))

    if self._subsample_factor < 1:
        num_samples_tot = xs.shape[0]
        idx = np.random.randint(
            0, num_samples_tot,
            int(num_samples_tot * self._subsample_factor))
        xs, ys = xs[idx], ys[idx]

    if self._normalize_inputs:
        # recompute normalizing constants for inputs
        self._x_mean.load(np.mean(xs, axis=0, keepdims=True))
        self._x_std.load(np.std(xs, axis=0, keepdims=True) + 1e-8)
        self._old_network.x_mean.load(np.mean(xs, axis=0, keepdims=True))
        self._old_network.x_std.load(
            np.std(xs, axis=0, keepdims=True) + 1e-8)
    if self._normalize_outputs:
        # recompute normalizing constants for outputs
        self._y_mean.load(np.mean(ys, axis=0, keepdims=True))
        self._y_std.load(np.std(ys, axis=0, keepdims=True) + 1e-8)
        self._old_network.y_mean.load(np.mean(ys, axis=0, keepdims=True))
        self._old_network.y_std.load(
            np.std(ys, axis=0, keepdims=True) + 1e-8)

    inputs = [xs, ys]
    loss_before = self._optimizer.loss(inputs)
    tabular.record('{}/LossBefore'.format(self._name), loss_before)
    self._optimizer.optimize(inputs)
    loss_after = self._optimizer.loss(inputs)
    tabular.record('{}/LossAfter'.format(self._name), loss_after)
    if self._use_trust_region:
        tabular.record('{}/MeanKL'.format(self._name),
                       self._optimizer.constraint_val(inputs))
    tabular.record('{}/dLoss'.format(self._name), loss_before - loss_after)
    self._old_model.parameters = self.parameters
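# A self-contained sketch of the fit/predict contract used by the
# regressors above, with a trivial least-squares model standing in for the
# real network. The class and its internals are hypothetical; only the
# path keys 'observations'/'returns' and the fit/predict signatures mirror
# the code above.
import numpy as np


class LinearBaselineSketch:
    def __init__(self):
        self._w = None

    def fit(self, paths):
        # stack observations and returns from every path, as fit() does above
        xs = np.concatenate([p['observations'] for p in paths])
        ys = np.concatenate([p['returns'] for p in paths]).reshape(-1, 1)
        xs = np.hstack([xs, np.ones((xs.shape[0], 1))])  # bias column
        self._w, *_ = np.linalg.lstsq(xs, ys, rcond=None)

    def predict(self, path):
        # predict values for a single path, as predict() does above
        xs = path['observations']
        xs = np.hstack([xs, np.ones((xs.shape[0], 1))])
        return (xs @ self._w).flatten()


paths = [{'observations': np.random.rand(20, 4),
          'returns': np.random.rand(20)} for _ in range(3)]
baseline = LinearBaselineSketch()
baseline.fit(paths)
assert baseline.predict(paths[0]).shape == (20,)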
def test_normalize_pixel_batch(self):
    env = GarageEnv(DummyDiscretePixelEnv(), is_image=True)
    obs = env.reset()
    obs_normalized = normalize_pixel_batch(obs)
    expected = [ob / 255.0 for ob in obs]
    assert np.allclose(obs_normalized, expected)
def test_normalize_pixel_batch_not_trigger(self):
    env = TfEnv(DummyBoxEnv())
    obs = env.reset()
    obs_normalized = normalize_pixel_batch(env, obs)
    assert np.array_equal(obs, obs_normalized)
def test_normalize_pixel_batch(self):
    env = TfEnv(DummyDiscretePixelEnv())
    obs = env.reset()
    obs_normalized = normalize_pixel_batch(env, obs)
    expected = [ob / 255.0 for ob in obs]
    assert np.allclose(obs_normalized, expected)
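# Back-of-the-envelope arithmetic behind the "for memory optimization"
# comments above: storing frames as uint8 in the replay buffer and
# normalizing only when a batch is sampled is 4x cheaper than storing
# float32 frames. The frame shape and buffer size below are illustrative,
# not taken from the code above.
frame_bytes_uint8 = 84 * 84 * 3 * 1      # 1 byte per channel value
frame_bytes_float32 = 84 * 84 * 3 * 4    # 4 bytes per channel value
buffer_size = 1_000_000                  # transitions in the buffer
# two frames per transition (observation and next_observation)
gib = 1024 ** 3
print(2 * buffer_size * frame_bytes_uint8 / gib)    # ~39 GiB as uint8
print(2 * buffer_size * frame_bytes_float32 / gib)  # ~158 GiB as float32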
def obtain_samples(self, itr, batch_size=None, whole_paths=True):
    """Collect samples for the given iteration number.

    Args:
        itr (int): Iteration number.
        batch_size (int): Number of environment interactions in one batch.
        whole_paths (bool): Not effective. Only kept here to comply with
            the base class.

    Returns:
        list: A list of paths.

    """
    assert batch_size is not None
    paths = []
    if not self._no_reset or self._last_obses is None:
        obses = self._vec_env.reset()
    else:
        obses = self._last_obses
    completes = np.asarray([True] * self._vec_env.num_envs)
    running_paths = [None] * self._vec_env.num_envs

    n_samples = 0
    policy = self.algo.policy
    if self.algo.es:
        self.algo.es.reset()

    while n_samples < batch_size:
        policy.reset(completes)

        if self.algo.input_include_goal:
            obs = [obs['observation'] for obs in obses]
            d_g = [obs['desired_goal'] for obs in obses]
            a_g = [obs['achieved_goal'] for obs in obses]
            input_obses = np.concatenate((obs, d_g), axis=-1)
        else:
            input_obses = obses
        obs_normalized = tensor_utils.normalize_pixel_batch(
            self._env_spec, input_obses)

        if self.algo.es:
            actions, agent_infos = self.algo.es.get_actions(
                itr, obs_normalized, self.algo.policy)
        else:
            actions, agent_infos = self.algo.policy.get_actions(
                obs_normalized)

        next_obses, rewards, dones, env_infos = \
            self._vec_env.step(actions)
        completes = env_infos['vec_env_executor.complete']
        self._last_obses = next_obses
        agent_infos = tensor_utils.split_tensor_dict_list(agent_infos)
        env_infos = tensor_utils.split_tensor_dict_list(env_infos)
        n_samples += len(next_obses)

        if agent_infos is None:
            agent_infos = [dict() for _ in range(self._vec_env.num_envs)]
        if env_infos is None:
            env_infos = [dict() for _ in range(self._vec_env.num_envs)]

        if self.algo.input_include_goal:
            self.algo.replay_buffer.add_transitions(
                observation=obs,
                action=actions,
                goal=d_g,
                achieved_goal=a_g,
                terminal=dones,
                next_observation=[
                    next_obs['observation'] for next_obs in next_obses
                ],
                next_achieved_goal=[
                    next_obs['achieved_goal'] for next_obs in next_obses
                ],
            )
        else:
            self.algo.replay_buffer.add_transitions(
                observation=obses,
                action=actions,
                reward=rewards,
                terminal=dones,
                next_observation=next_obses,
            )

        for idx, reward, env_info, done in zip(itertools.count(), rewards,
                                               env_infos, dones):
            if running_paths[idx] is None:
                running_paths[idx] = dict(
                    rewards=[],
                    env_infos=[],
                    dones=[],
                    undiscounted_return=self._last_uncounted_discount[idx],
                    # running_length: Length of path up to now
                    # Note that running_length is not len(rewards)
                    # Because a path may not be complete in one batch
                    running_length=self._last_running_length[idx],
                    success_count=self._last_success_count[idx])

            running_paths[idx]['rewards'].append(reward)
            running_paths[idx]['env_infos'].append(env_info)
            running_paths[idx]['dones'].append(done)
            running_paths[idx]['running_length'] += 1
            running_paths[idx]['undiscounted_return'] += reward
            running_paths[idx]['success_count'] += env_info.get(
                'is_success') or 0

            self._last_uncounted_discount[idx] += reward
            self._last_success_count[idx] += env_info.get(
                'is_success') or 0
            self._last_running_length[idx] += 1

            if done or n_samples >= batch_size:
                paths.append(
                    dict(
                        rewards=np.asarray(running_paths[idx]['rewards']),
                        dones=np.asarray(running_paths[idx]['dones']),
                        env_infos=tensor_utils.stack_tensor_dict_list(
                            running_paths[idx]['env_infos']),
                        running_length=running_paths[idx]
                        ['running_length'],
                        undiscounted_return=running_paths[idx]
                        ['undiscounted_return'],
                        success_count=running_paths[idx]['success_count']))
                running_paths[idx] = None

                if done:
                    self._last_running_length[idx] = 0
                    self._last_success_count[idx] = 0
                    self._last_uncounted_discount[idx] = 0

                if self.algo.es:
                    self.algo.es.reset()
        obses = next_obses

    return paths
def obtain_samples_for_evaluation(self, num_paths=20):
    """Collect evaluation rollouts.

    Args:
        num_paths (int): Number of evaluation paths to collect.

    Returns:
        list: A list of paths.

    """
    paths = []
    policy = self.algo.policy

    for i in range(num_paths):
        obses = self.evaluate_env.reset()
        dones = np.asarray([True] * self.evaluate_env.num_envs)
        running_paths = [None] * self.evaluate_env.num_envs
        policy.reset(dones)
        end_of_path = False

        for j in range(500):
            input_obses = obses
            obs_normalized = tensor_utils.normalize_pixel_batch(
                self.env_spec, input_obses)
            obses = obs_normalized
            actions = self.algo.policy.get_actions(obs_normalized)
            if len(actions) > 1:
                actions = actions[0]
            agent_infos = None

            next_obses, rewards, dones, env_infos = self.evaluate_env.step(
                actions)
            original_next_obses = next_obses
            next_obses = tensor_utils.normalize_pixel_batch(
                self.env_spec, next_obses)

            env_infos = tensor_utils.split_tensor_dict_list(env_infos)
            if agent_infos is None:
                agent_infos = [
                    dict() for _ in range(self.evaluate_env.num_envs)
                ]
            if env_infos is None:
                env_infos = [
                    dict() for _ in range(self.evaluate_env.num_envs)
                ]

            for idx, reward, env_info, done in zip(itertools.count(),
                                                   rewards, env_infos,
                                                   dones):
                if running_paths[idx] is None:
                    running_paths[idx] = dict(
                        rewards=[],
                        env_infos=[],
                        dones=[],
                        undiscounted_return=0,
                        # running_length: Length of path up to now
                        # Note that running_length is not len(rewards)
                        # Because a path may not be complete in one batch
                        running_length=0,
                        success_count=0)

                running_paths[idx]['rewards'].append(reward)
                running_paths[idx]['env_infos'].append(env_info)
                running_paths[idx]['dones'].append(done)
                running_paths[idx]['running_length'] += 1
                running_paths[idx]['undiscounted_return'] += reward
                running_paths[idx]['success_count'] += env_info.get(
                    'is_success') or 0

                if done or j == 499:
                    paths.append(
                        dict(rewards=tensor_utils.stack_tensor_list(
                            running_paths[idx]['rewards']),
                             dones=tensor_utils.stack_tensor_list(
                                 running_paths[idx]['dones']),
                             env_infos=tensor_utils.stack_tensor_dict_list(
                                 running_paths[idx]['env_infos']),
                             running_length=running_paths[idx]
                             ['running_length'],
                             undiscounted_return=running_paths[idx]
                             ['undiscounted_return'],
                             success_count=running_paths[idx]
                             ['success_count']))
                    running_paths[idx] = None
                    end_of_path = True

            if end_of_path:
                break
            obses = original_next_obses

    return paths
def obtain_samples(self, itr, batch_size, is_evaluate=False):
    """Collect samples for the given iteration number.

    Args:
        itr (int): Iteration number.
        batch_size (int): Number of environment interactions in one batch.
        is_evaluate (bool): Whether samples are collected for evaluation,
            in which case the exploration strategy is not used.

    Returns:
        tuple: A list of paths, followed by the element-wise upper and
            lower bounds of the observations and actions seen so far.

    """
    paths = []
    if not self.no_reset or self._last_obses is None:
        obses = self.vec_env.reset()
    else:
        obses = self._last_obses
    dones = np.asarray([True] * self.vec_env.num_envs)
    running_paths = [None] * self.vec_env.num_envs

    n_samples = 0
    policy = self.algo.policy
    if self.algo.es:
        self.algo.es.reset()

    while n_samples < batch_size:
        policy.reset(dones)

        if self.algo.input_include_goal:
            obs = [obs['observation'] for obs in obses]
            d_g = [obs['desired_goal'] for obs in obses]
            a_g = [obs['achieved_goal'] for obs in obses]
            input_obses = np.concatenate((obs, d_g), axis=-1)
        else:
            input_obses = obses
        obs_normalized = tensor_utils.normalize_pixel_batch(
            self.env_spec, input_obses)
        obses = obs_normalized

        if self.algo.es and not is_evaluate:
            actions, agent_infos = self.algo.es.get_actions(
                itr, obs_normalized, self.algo.policy)
            agent_infos = tensor_utils.split_tensor_dict_list(agent_infos)
        else:
            actions = self.algo.policy.get_actions(obs_normalized)
            if len(actions) > 1:
                actions = actions[0]
            agent_infos = None

        next_obses, rewards, dones, env_infos = self.vec_env.step(actions)
        original_next_obses = next_obses
        next_obses = tensor_utils.normalize_pixel_batch(
            self.env_spec, next_obses)
        self._last_obses = next_obses
        env_infos = tensor_utils.split_tensor_dict_list(env_infos)
        n_samples += len(next_obses)

        if agent_infos is None:
            agent_infos = [dict() for _ in range(self.vec_env.num_envs)]
        if env_infos is None:
            env_infos = [dict() for _ in range(self.vec_env.num_envs)]

        if self.algo.input_include_goal:
            self.algo.replay_buffer.add_transitions(
                observation=obs,
                action=actions,
                goal=d_g,
                achieved_goal=a_g,
                terminal=dones,
                next_observation=[
                    next_obs['observation'] for next_obs in next_obses
                ],
                next_achieved_goal=[
                    next_obs['achieved_goal'] for next_obs in next_obses
                ],
            )
        else:
            self.algo.replay_buffer.add_transitions(
                observation=obs_normalized,
                action=actions,
                reward=rewards * self.algo.reward_scale,
                terminal=dones,
                next_observation=next_obses,
            )

        # track element-wise bounds of the observations and actions
        # collected so far
        if not self._bound_start:
            self._bound_start = True
            self._obs_upper = obses[0]
            self._obs_lower = obses[0]
            self._action_upper = actions[0]
            self._action_lower = actions[0]
        for obs in obses:
            self._obs_upper = np.maximum(self._obs_upper, obs)
            self._obs_lower = np.minimum(self._obs_lower, obs)
        for action in actions:
            self._action_upper = np.maximum(self._action_upper, action)
            self._action_lower = np.minimum(self._action_lower, action)

        for idx, reward, env_info, done in zip(itertools.count(), rewards,
                                               env_infos, dones):
            if running_paths[idx] is None:
                running_paths[idx] = dict(
                    rewards=[],
                    env_infos=[],
                    dones=[],
                    undiscounted_return=self._last_uncounted_discount[idx],
                    # running_length: Length of path up to now
                    # Note that running_length is not len(rewards)
                    # Because a path may not be complete in one batch
                    running_length=self._last_running_length[idx],
                    success_count=self._last_success_count[idx])

            running_paths[idx]['rewards'].append(reward)
            running_paths[idx]['env_infos'].append(env_info)
            running_paths[idx]['dones'].append(done)
            running_paths[idx]['running_length'] += 1
            running_paths[idx]['undiscounted_return'] += reward
            running_paths[idx]['success_count'] += env_info.get(
                'is_success') or 0

            self._last_uncounted_discount[idx] += reward
            self._last_success_count[idx] += env_info.get(
                'is_success') or 0
            self._last_running_length[idx] += 1

            if done or n_samples >= batch_size:
                paths.append(
                    dict(
                        rewards=tensor_utils.stack_tensor_list(
                            running_paths[idx]['rewards']),
                        dones=tensor_utils.stack_tensor_list(
                            running_paths[idx]['dones']),
                        env_infos=tensor_utils.stack_tensor_dict_list(
                            running_paths[idx]['env_infos']),
                        running_length=running_paths[idx]
                        ['running_length'],
                        undiscounted_return=running_paths[idx]
                        ['undiscounted_return'],
                        success_count=running_paths[idx]['success_count']))
                running_paths[idx] = None

                if done:
                    self._last_running_length[idx] = 0
                    self._last_success_count[idx] = 0
                    self._last_uncounted_discount[idx] = 0

                if self.algo.es:
                    self.algo.es.reset()
        obses = original_next_obses

    return (paths, self._obs_upper, self._obs_lower, self._action_upper,
            self._action_lower)
def obtain_samples(self, itr, batch_size):
    """Collect samples for the given iteration number.

    Args:
        itr (int): Iteration number.
        batch_size (int): Number of environment interactions in one batch.

    Returns:
        list: A list of paths.

    """
    paths = []
    if not self.no_reset or self._last_obses is None:
        obses = self.vec_env.reset()
    else:
        obses = self._last_obses
    dones = np.asarray([True] * self.vec_env.num_envs)
    running_paths = [None] * self.vec_env.num_envs

    n_samples = 0
    policy = self.algo.policy
    if self.algo.es:
        self.algo.es.reset()

    while n_samples < batch_size:
        policy.reset(dones)

        if self.algo.input_include_goal:
            obs = [obs['observation'] for obs in obses]
            d_g = [obs['desired_goal'] for obs in obses]
            a_g = [obs['achieved_goal'] for obs in obses]
            input_obses = np.concatenate((obs, d_g), axis=-1)
        else:
            input_obses = obses
        obs_normalized = tensor_utils.normalize_pixel_batch(
            self.env_spec, input_obses)

        if self.algo.es:
            actions, agent_infos = self.algo.es.get_actions(
                itr, obs_normalized, self.algo.policy)
        else:
            actions, agent_infos = self.algo.policy.get_actions(
                obs_normalized)

        next_obses, rewards, dones, env_infos = self.vec_env.step(actions)

        # 'reset_new_obs', if present, presumably carries the first
        # observation of the freshly reset episode for auto-resetting envs
        new_episode_obs = None
        if 'reset_new_obs' in env_infos:
            new_episode_obs = next_obses.copy()
            for i, reset_new_obs in env_infos['reset_new_obs'][0]:
                new_episode_obs[i] = reset_new_obs
            del env_infos['reset_new_obs']

        env_infos = tensor_utils.split_tensor_dict_list(env_infos)
        n_samples += len(next_obses)

        if agent_infos is None:
            agent_infos = [dict() for _ in range(self.vec_env.num_envs)]
        if env_infos is None:
            env_infos = [dict() for _ in range(self.vec_env.num_envs)]

        if self.algo.input_include_goal:
            self.algo.replay_buffer.add_transitions(
                observation=obs,
                action=actions,
                goal=d_g,
                achieved_goal=a_g,
                terminal=dones,
                next_observation=[
                    next_obs['observation'] for next_obs in next_obses
                ],
                next_achieved_goal=[
                    next_obs['achieved_goal'] for next_obs in next_obses
                ],
            )
        else:
            payload = {
                'observation': obses,
                'action': actions,
                'reward': rewards * self.algo.reward_scale,
                'terminal': dones,
                'next_observation': next_obses
            }
            if env_infos and env_infos[0].get(
                    'ground_truth_state') is not None:
                payload['ground_truth_state'] = [
                    env_info.get('ground_truth_state')
                    for env_info in env_infos
                ]
            self.algo.replay_buffer.add_transitions(**payload)

        for idx, reward, env_info, q_val, done in zip(
                itertools.count(), rewards, env_infos,
                agent_infos['q_vals'], dones):
            if running_paths[idx] is None:
                running_paths[idx] = dict(
                    rewards=[],
                    env_infos=[],
                    dones=[],
                    q_vals=self._last_q_vals[idx].copy(),
                    undiscounted_return=self._last_uncounted_discount[idx],
                    # running_length: Length of path up to now
                    # Note that running_length is not len(rewards)
                    # Because a path may not be complete in one batch
                    running_length=self._last_running_length[idx],
                    success_count=self._last_success_count[idx])

            running_paths[idx]['rewards'].append(reward)
            running_paths[idx]['env_infos'].append(env_info)
            running_paths[idx]['dones'].append(done)
            running_paths[idx]['q_vals'].append(q_val)
            running_paths[idx]['running_length'] += 1
            running_paths[idx]['undiscounted_return'] += reward
            running_paths[idx]['success_count'] += env_info.get(
                'is_success') or 0

            self._last_q_vals[idx].append(q_val)
            self._last_uncounted_discount[idx] += reward
            self._last_success_count[idx] += env_info.get(
                'is_success') or 0
            self._last_running_length[idx] += 1

            if done or n_samples >= batch_size:
                paths.append(
                    dict(
                        rewards=np.asarray(running_paths[idx]['rewards']),
                        dones=np.asarray(running_paths[idx]['dones']),
                        env_infos=tensor_utils.stack_tensor_dict_list(
                            running_paths[idx]['env_infos']),
                        q_vals=np.asarray(running_paths[idx]['q_vals']),
                        running_length=running_paths[idx]
                        ['running_length'],
                        undiscounted_return=running_paths[idx]
                        ['undiscounted_return'],
                        success_count=running_paths[idx]['success_count']))
                running_paths[idx] = None

                if done:
                    self._last_q_vals[idx] = []
                    self._last_running_length[idx] = 0
                    self._last_success_count[idx] = 0
                    self._last_uncounted_discount[idx] = 0

                if self.algo.es:
                    self.algo.es.reset()

        if new_episode_obs is not None:
            obses = new_episode_obs
        else:
            obses = next_obses
        self._last_obses = obses

    return paths
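# A sketch of how the paths returned by the obtain_samples() variants
# above might be summarized for logging. It relies only on the fields
# built above (undiscounted_return, success_count, running_length); the
# helper name and the example data are hypothetical.
import numpy as np


def summarize_paths_sketch(paths):
    """Aggregate per-path statistics into a flat dict of scalars."""
    returns = [p['undiscounted_return'] for p in paths]
    success = [p['success_count'] for p in paths]
    lengths = [p['running_length'] for p in paths]
    return {
        'AverageReturn': float(np.mean(returns)),
        'SuccessRate': float(np.mean([s > 0 for s in success])),
        'AveragePathLength': float(np.mean(lengths)),
        'NumPaths': len(paths),
    }


example_paths = [
    {'undiscounted_return': 10.0, 'success_count': 1, 'running_length': 200},
    {'undiscounted_return': 0.0, 'success_count': 0, 'running_length': 500},
]
print(summarize_paths_sketch(example_paths))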