def obtain_samples(self, itr, num_samples):
    """Sample the policy for new trajectories.

    Args:
        itr (int): Iteration number.
        num_samples (int): Number of steps the sampler should collect.

    """
    self._active_workers = []
    self._active_worker_ids = []
    pbar = ProgBarCounter(num_samples)
    completed_samples = 0
    traj = []
    updating_workers = []
    # update the policy params of each worker before sampling
    # for the current iteration
    self._idle_worker_ids = list(range(self._num_workers))
    curr_policy_params = self._algo.policy.get_param_values()
    params_id = ray.put(curr_policy_params)
    while self._idle_worker_ids:
        worker_id = self._idle_worker_ids.pop()
        worker = self._all_workers[worker_id]
        updating_workers.append(worker.set_agent.remote(params_id))
    while completed_samples < num_samples:
        # if there are workers still being updated, check
        # which ones are still updating and take the workers that
        # are done updating, and start collecting trajectories on
        # those workers.
        if updating_workers:
            updated, updating_workers = ray.wait(updating_workers,
                                                 num_returns=1,
                                                 timeout=0.1)
            upd = [ray.get(up) for up in updated]
            self._idle_worker_ids.extend(upd)
        # if there are idle workers, use them to collect trajectories
        # and mark the newly busy workers as active
        while self._idle_worker_ids:
            idle_worker_id = self._idle_worker_ids.pop()
            self._active_worker_ids.append(idle_worker_id)
            worker = self._all_workers[idle_worker_id]
            self._active_workers.append(worker.rollout.remote())
        # check which workers are done/not done collecting a sample;
        # if any are done, send them to process the collected trajectory,
        # otherwise keep checking whether they are done
        ready, not_ready = ray.wait(self._active_workers,
                                    num_returns=1,
                                    timeout=0.001)
        self._active_workers = not_ready
        for result in ready:
            trajectory, num_returned_samples = self._process_trajectory(
                result)
            completed_samples += num_returned_samples
            pbar.inc(num_returned_samples)
            traj.append(trajectory)
    pbar.stop()
    return traj
def eval(self,
         policy,
         n_episodes=20,
         greedy=True,
         load_from_file=False,
         save_replay=False):
    if load_from_file:
        logger.add_output(dowel.StdOutput())
    logger.log('Evaluating policy, {} episodes, greedy = {} ...'.format(
        n_episodes, greedy))
    n_won = 0
    episode_rewards = []
    pbar = ProgBarCounter(n_episodes)
    for e in range(n_episodes):
        obs = self.reset()
        policy.reset([True])
        info = {'battle_won': False}
        terminated = False
        episode_rewards.append(0)
        while not terminated:
            obs = np.array([obs])  # add [.] for vec_env
            avail_actions = np.array([self.get_avail_actions()])
            actions, agent_infos = policy.get_actions(obs,
                                                      avail_actions,
                                                      greedy=greedy)
            obs, reward, terminated, info = self.step(actions[0])
            if not self.centralized:
                terminated = all(terminated)
            episode_rewards[-1] += np.mean(reward)
        pbar.inc(1)
        if save_replay:
            self.save_replay()
        # In case SC2 restarts during eval, KeyError: 'battle_won' can
        # happen, so take precaution.
        if type(info) == dict:
            if 'battle_won' in info.keys():
                n_won += 1 if info['battle_won'] else 0
    pbar.stop()
    policy.reset([True])
    win_rate = n_won / n_episodes
    avg_return = np.mean(episode_rewards)
    logger.log('EvalWinRate: {}'.format(win_rate))
    logger.log('EvalAvgReturn: {}'.format(avg_return))
    if not load_from_file:
        tabular.record('EvalWinRate', win_rate)
        tabular.record('EvalAvgReturn', avg_return)
def obtain_samples(self, itr, batch_size=None, whole_paths=True):
    """Obtain samples."""
    logger.log('Obtaining samples for iteration %d...' % itr)
    if not batch_size:
        batch_size = self.algo.max_path_length * self.n_envs
    paths = []
    n_samples = 0
    obses = self.vec_env.reset()
    dones = np.asarray([True] * self.vec_env.num_envs)
    running_paths = [None] * self.vec_env.num_envs
    pbar = ProgBarCounter(batch_size)
    policy_time = 0
    env_time = 0
    process_time = 0
    policy = self.algo.policy
    import time
    while n_samples < batch_size:
        t = time.time()
        policy.reset(dones)
        actions, agent_infos = policy.get_actions(obses)
        policy_time += time.time() - t
        t = time.time()
        next_obses, rewards, dones, env_infos = self.vec_env.step(actions)
        env_time += time.time() - t
        t = time.time()
        agent_infos = tensor_utils.split_tensor_dict_list(agent_infos)
        env_infos = tensor_utils.split_tensor_dict_list(env_infos)
        if env_infos is None:
            env_infos = [dict() for _ in range(self.vec_env.num_envs)]
        if agent_infos is None:
            agent_infos = [dict() for _ in range(self.vec_env.num_envs)]
        for idx, observation, action, reward, env_info, agent_info, done in zip(  # noqa: E501
                itertools.count(), obses, actions, rewards, env_infos,
                agent_infos, dones):
            if running_paths[idx] is None:
                running_paths[idx] = dict(
                    observations=[],
                    actions=[],
                    rewards=[],
                    env_infos=[],
                    agent_infos=[],
                )
            running_paths[idx]['observations'].append(observation)
            running_paths[idx]['actions'].append(action)
            running_paths[idx]['rewards'].append(reward)
            running_paths[idx]['env_infos'].append(env_info)
            running_paths[idx]['agent_infos'].append(agent_info)
            if done:
                paths.append(
                    dict(observations=self.env_spec.observation_space.
                         flatten_n(running_paths[idx]['observations']),
                         actions=self.env_spec.action_space.flatten_n(
                             running_paths[idx]['actions']),
                         rewards=tensor_utils.stack_tensor_list(
                             running_paths[idx]['rewards']),
                         env_infos=tensor_utils.stack_tensor_dict_list(
                             running_paths[idx]['env_infos']),
                         agent_infos=tensor_utils.stack_tensor_dict_list(
                             running_paths[idx]['agent_infos'])))
                n_samples += len(running_paths[idx]['rewards'])
                running_paths[idx] = None
        process_time += time.time() - t
        pbar.inc(len(obses))
        obses = next_obses
    pbar.stop()
    tabular.record('PolicyExecTime', policy_time)
    tabular.record('EnvExecTime', env_time)
    tabular.record('ProcessExecTime', process_time)
    if whole_paths:
        return paths
    else:
        paths_truncated = truncate_paths(paths, batch_size)
        return paths_truncated
def obtain_samples(self, itr, batch_size=None, whole_paths=True):
    """Sample the policy for new trajectories.

    Args:
        itr (int): Iteration number.
        batch_size (int): Number of samples to be collected. If None,
            it will default to algo.max_path_length * n_envs.
        whole_paths (bool): Whether to return all the paths or not. True
            by default. It's possible for the paths to have a total
            actual sample size larger than batch_size; they will be
            truncated if this flag is True.

    Returns:
        list[dict]: Sample paths, each path with key
            * observations: (numpy.ndarray)
            * actions: (numpy.ndarray)
            * rewards: (numpy.ndarray)
            * agent_infos: (dict)
            * env_infos: (dict)

    """
    logger.log('Obtaining samples for iteration %d...' % itr)
    if not batch_size:
        batch_size = self.algo.max_path_length * self._n_envs
    paths = []
    n_samples = 0
    obses = self._vec_env.reset()
    dones = np.asarray([True] * self._vec_env.num_envs)
    running_paths = [None] * self._vec_env.num_envs
    pbar = ProgBarCounter(batch_size)
    policy_time = 0
    env_time = 0
    process_time = 0
    policy = self.algo.policy
    while n_samples < batch_size:
        t = time.time()
        policy.reset(dones)
        actions, agent_infos = policy.get_actions(obses)
        policy_time += time.time() - t
        t = time.time()
        next_obses, rewards, dones, env_infos = self._vec_env.step(actions)
        env_time += time.time() - t
        t = time.time()
        agent_infos = tensor_utils.split_tensor_dict_list(agent_infos)
        env_infos = tensor_utils.split_tensor_dict_list(env_infos)
        if env_infos is None:
            env_infos = [dict() for _ in range(self._vec_env.num_envs)]
        if agent_infos is None:
            agent_infos = [dict() for _ in range(self._vec_env.num_envs)]
        for idx, observation, action, reward, env_info, agent_info, done in zip(  # noqa: E501
                itertools.count(), obses, actions, rewards, env_infos,
                agent_infos, dones):
            if running_paths[idx] is None:
                running_paths[idx] = dict(
                    observations=[],
                    actions=[],
                    rewards=[],
                    env_infos=[],
                    agent_infos=[],
                )
            running_paths[idx]['observations'].append(observation)
            running_paths[idx]['actions'].append(action)
            running_paths[idx]['rewards'].append(reward)
            running_paths[idx]['env_infos'].append(env_info)
            running_paths[idx]['agent_infos'].append(agent_info)
            if done:
                obs = np.asarray(running_paths[idx]['observations'])
                actions = np.asarray(running_paths[idx]['actions'])
                paths.append(
                    dict(observations=obs,
                         actions=actions,
                         rewards=np.asarray(running_paths[idx]['rewards']),
                         env_infos=tensor_utils.stack_tensor_dict_list(
                             running_paths[idx]['env_infos']),
                         agent_infos=tensor_utils.stack_tensor_dict_list(
                             running_paths[idx]['agent_infos'])))
                n_samples += len(running_paths[idx]['rewards'])
                running_paths[idx] = None
        process_time += time.time() - t
        pbar.inc(len(obses))
        obses = next_obses
    pbar.stop()
    tabular.record('PolicyExecTime', policy_time)
    tabular.record('EnvExecTime', env_time)
    tabular.record('ProcessExecTime', process_time)
    return paths if whole_paths else truncate_paths(paths, batch_size)
def obtain_samples(self, itr, num_samples, agent_update, env_update=None):
    """Sample the policy for new trajectories.

    Args:
        itr (int): Iteration number.
        num_samples (int): Number of steps the sampler should collect.
        agent_update (object): Value which will be passed into the
            `agent_update_fn` before doing rollouts. If a list is passed
            in, it must have length exactly `factory.n_workers`, and will
            be spread across the workers.
        env_update (object): Value which will be passed into the
            `env_update_fn` before doing rollouts. If a list is passed
            in, it must have length exactly `factory.n_workers`, and will
            be spread across the workers.

    Returns:
        list[dict]: Sample paths, each path with key
            * observations: (numpy.ndarray)
            * actions: (numpy.ndarray)
            * rewards: (numpy.ndarray)
            * agent_infos: (dict)
            * env_infos: (dict)

    """
    active_workers = []
    active_worker_ids = []
    pbar = ProgBarCounter(num_samples)
    completed_samples = 0
    traj = []
    updating_workers = []
    # update the policy params of each worker before sampling
    # for the current iteration
    idle_worker_ids = list(range(self._num_workers))
    param_ids = self._worker_factory.prepare_worker_messages(
        agent_update, ray.put)
    env_ids = self._worker_factory.prepare_worker_messages(
        env_update, ray.put)
    while idle_worker_ids:
        worker_id = idle_worker_ids.pop()
        worker = self._all_workers[worker_id]
        updating_workers.append(
            worker.update.remote(param_ids[worker_id], env_ids[worker_id]))
    while completed_samples < num_samples:
        # if there are workers still being updated, check
        # which ones are still updating and take the workers that
        # are done updating, and start collecting trajectories on
        # those workers.
        if updating_workers:
            updated, updating_workers = ray.wait(updating_workers,
                                                 num_returns=1,
                                                 timeout=0.1)
            upd = [ray.get(up) for up in updated]
            idle_worker_ids.extend(upd)
        # if there are idle workers, use them to collect trajectories
        # and mark the newly busy workers as active
        while idle_worker_ids:
            idle_worker_id = idle_worker_ids.pop()
            active_worker_ids.append(idle_worker_id)
            worker = self._all_workers[idle_worker_id]
            active_workers.append(worker.rollout.remote())
        # check which workers are done/not done collecting a sample;
        # if any are done, send them to process the collected trajectory,
        # otherwise keep checking whether they are done
        ready, not_ready = ray.wait(active_workers,
                                    num_returns=1,
                                    timeout=0.001)
        active_workers = not_ready
        for result in ready:
            trajectory, num_returned_samples = _process_trajectory(
                result, active_worker_ids, idle_worker_ids)
            completed_samples += num_returned_samples
            pbar.inc(num_returned_samples)
            traj.append(trajectory)
    pbar.stop()
    return traj
def obtain_samples(self, itr, batch_size=None, whole_paths=True):
    """Sample the policy for new trajectories.

    Args:
        itr (int): Iteration number.
        batch_size (int): Number of samples to be collected. If None,
            it will default to algo.max_path_length * n_envs.
        whole_paths (bool): Whether to return all the paths or not. True
            by default. It's possible for the paths to have a total
            actual sample size larger than batch_size; they will be
            truncated if this flag is True.

    Returns:
        list[dict]: Sample paths.

    Note:
        Each path is a dictionary, with keys and values as following:
            * observations: numpy.ndarray with shape [Batch, *obs_dims]
            * actions: numpy.ndarray with shape [Batch, *act_dims]
            * rewards: numpy.ndarray with shape [Batch, ]
            * env_infos: A dictionary with each key representing one
              environment info, value being a numpy.ndarray with shape
              [Batch, ?]. One example is "ale.lives" for atari
              environments.
            * agent_infos: A dictionary with each key representing one
              agent info, value being a numpy.ndarray with shape
              [Batch, ?]. One example is "prev_action", which is used
              for recurrent policy as previous action input, merged with
              the observation input as the state input.
            * dones: numpy.ndarray with shape [Batch, ]

    """
    logger.log('Obtaining samples for iteration %d...' % itr)
    if not batch_size:
        batch_size = self.algo.max_path_length * self._n_envs
    paths = []
    n_samples = 0
    obses = self._vec_env.reset()
    dones = np.asarray([True] * self._vec_env.num_envs)
    running_paths = [None] * self._vec_env.num_envs
    pbar = ProgBarCounter(batch_size)
    policy_time = 0
    env_time = 0
    process_time = 0
    policy = self.algo.policy
    while n_samples < batch_size:
        t = time.time()
        policy.reset(dones)
        actions, agent_infos = policy.get_actions(obses)
        policy_time += time.time() - t
        t = time.time()
        next_obses, rewards, dones, env_infos = \
            self._vec_env.step(actions)
        env_time += time.time() - t
        t = time.time()
        agent_infos = tensor_utils.split_tensor_dict_list(agent_infos)
        env_infos = tensor_utils.split_tensor_dict_list(env_infos)
        if env_infos is None:
            env_infos = [dict() for _ in range(self._vec_env.num_envs)]
        if agent_infos is None:
            agent_infos = [dict() for _ in range(self._vec_env.num_envs)]
        for idx, observation, action, reward, env_info, agent_info, done in zip(  # noqa: E501
                itertools.count(), obses, actions, rewards, env_infos,
                agent_infos, dones):
            if running_paths[idx] is None:
                running_paths[idx] = dict(observations=[],
                                          actions=[],
                                          rewards=[],
                                          env_infos=[],
                                          agent_infos=[],
                                          dones=[])
            running_paths[idx]['observations'].append(observation)
            running_paths[idx]['actions'].append(action)
            running_paths[idx]['rewards'].append(reward)
            running_paths[idx]['env_infos'].append(env_info)
            running_paths[idx]['agent_infos'].append(agent_info)
            running_paths[idx]['dones'].append(done)
            if done:
                obs = np.asarray(running_paths[idx]['observations'])
                actions = np.asarray(running_paths[idx]['actions'])
                paths.append(
                    dict(observations=obs,
                         actions=actions,
                         rewards=np.asarray(running_paths[idx]['rewards']),
                         env_infos=tensor_utils.stack_tensor_dict_list(
                             running_paths[idx]['env_infos']),
                         agent_infos=tensor_utils.stack_tensor_dict_list(
                             running_paths[idx]['agent_infos']),
                         dones=np.asarray(running_paths[idx]['dones'])))
                n_samples += len(running_paths[idx]['rewards'])
                running_paths[idx] = None
        process_time += time.time() - t
        pbar.inc(len(obses))
        obses = next_obses
    pbar.stop()
    tabular.record('PolicyExecTime', policy_time)
    tabular.record('EnvExecTime', env_time)
    tabular.record('ProcessExecTime', process_time)
    return paths if whole_paths else truncate_paths(paths, batch_size)
def run_collect(self, collect_once, threshold, args=None, show_prog_bar=True):
    """Run the collector method using the worker pool.

    The collect_once method will receive 'g' as its first argument,
    followed by the provided args, if any. The method should return a
    pair of values. The first should be the object to be collected, and
    the second is the increment to be added. This will continue until the
    total increment reaches or exceeds the given threshold.

    Sample script:

        def collect_once(g):
            return 'a', 1

        stateful_pool.run_collect(collect_once, threshold=3)
        # should return ['a', 'a', 'a']

    :param collect_once: Function with prototype 'def foo(g, ...)' that
        returns an (object, increment) pair.
    :param threshold: Total increment at which collection stops.
    :return: List of collected objects.
    """
    assert not inspect.ismethod(collect_once), (
        'run_collect() cannot run a class method. Please ensure that '
        "collect_once is a function with the prototype 'def foo(g, ...)', "
        'where g is an object of type '
        'garage.sampler.stateful_pool.SharedGlobal')

    if args is None:
        args = tuple()
    if self.pool:
        counter = self.manager.Value('i', 0)
        lock = self.manager.RLock()
        results = self.pool.map_async(_worker_run_collect, [
            (collect_once, counter, lock, threshold, args)
        ] * self.n_parallel)
        if show_prog_bar:
            pbar = ProgBarCounter(threshold)
        last_value = 0
        while True:
            time.sleep(0.1)
            with lock:
                if counter.value >= threshold:
                    if show_prog_bar:
                        pbar.stop()
                    break
                if show_prog_bar:
                    pbar.inc(counter.value - last_value)
                last_value = counter.value
        return sum(results.get(), [])
    else:
        count = 0
        results = []
        if show_prog_bar:
            pbar = ProgBarCounter(threshold)
        while count < threshold:
            result, inc = collect_once(self.G, *args)
            results.append(result)
            count += inc
            if show_prog_bar:
                pbar.inc(inc)
        if show_prog_bar:
            pbar.stop()
        return results
def obtain_exact_trajectories(self,
                              n_traj_per_worker,
                              agent_update,
                              env_update=None):
    """Sample an exact number of trajectories per worker.

    Args:
        n_traj_per_worker (int): Exact number of trajectories to gather
            for each worker.
        agent_update (object): Value which will be passed into the
            `agent_update_fn` before doing rollouts. If a list is passed
            in, it must have length exactly `factory.n_workers`, and will
            be spread across the workers.
        env_update (object): Value which will be passed into the
            `env_update_fn` before doing rollouts. If a list is passed
            in, it must have length exactly `factory.n_workers`, and will
            be spread across the workers.

    Returns:
        TrajectoryBatch: Batch of gathered trajectories. Always in worker
            order. In other words, first all trajectories from worker 0,
            then all trajectories from worker 1, etc.

    """
    active_workers = []
    pbar = ProgBarCounter(self._worker_factory.n_workers)
    trajectories = defaultdict(list)
    # update the policy params of each worker before sampling
    # for the current iteration
    idle_worker_ids = []
    updating_workers = self._update_workers(agent_update, env_update)
    while any(
            len(trajectories[i]) < n_traj_per_worker
            for i in range(self._worker_factory.n_workers)):
        # if there are workers still being updated, check
        # which ones are still updating and take the workers that
        # are done updating, and start collecting trajectories on
        # those workers.
        if updating_workers:
            updated, updating_workers = ray.wait(updating_workers,
                                                 num_returns=1,
                                                 timeout=0.1)
            upd = [ray.get(up) for up in updated]
            idle_worker_ids.extend(upd)
        # if there are idle workers, use them to collect trajectories
        # and mark the newly busy workers as active
        while idle_worker_ids:
            idle_worker_id = idle_worker_ids.pop()
            worker = self._all_workers[idle_worker_id]
            active_workers.append(worker.rollout.remote())
        # check which workers are done/not done collecting a sample;
        # if any are done, send them to process the collected trajectory,
        # otherwise keep checking whether they are done
        ready, not_ready = ray.wait(active_workers,
                                    num_returns=1,
                                    timeout=0.001)
        active_workers = not_ready
        for result in ready:
            ready_worker_id, trajectory_batch = ray.get(result)
            pbar.inc(1)
            trajectories[ready_worker_id].append(trajectory_batch)
            if len(trajectories[ready_worker_id]) < n_traj_per_worker:
                idle_worker_ids.append(ready_worker_id)
    pbar.stop()
    ordered_trajectories = list(
        itertools.chain(*[
            trajectories[i] for i in range(self._worker_factory.n_workers)
        ]))
    return TrajectoryBatch.concatenate(*ordered_trajectories)
def obtain_samples(self, itr, num_samples, agent_update, env_update=None):
    """Sample the policy for new trajectories.

    Args:
        itr (int): Iteration number.
        num_samples (int): Number of steps the sampler should collect.
        agent_update (object): Value which will be passed into the
            `agent_update_fn` before doing rollouts. If a list is passed
            in, it must have length exactly `factory.n_workers`, and will
            be spread across the workers.
        env_update (object): Value which will be passed into the
            `env_update_fn` before doing rollouts. If a list is passed
            in, it must have length exactly `factory.n_workers`, and will
            be spread across the workers.

    Returns:
        TrajectoryBatch: Batch of gathered trajectories.

    """
    active_workers = []
    pbar = ProgBarCounter(num_samples)
    completed_samples = 0
    batches = []
    # update the policy params of each worker before sampling
    # for the current iteration
    idle_worker_ids = []
    updating_workers = self._update_workers(agent_update, env_update)
    while completed_samples < num_samples:
        # if there are workers still being updated, check
        # which ones are still updating and take the workers that
        # are done updating, and start collecting trajectories on
        # those workers.
        if updating_workers:
            updated, updating_workers = ray.wait(updating_workers,
                                                 num_returns=1,
                                                 timeout=0.1)
            upd = [ray.get(up) for up in updated]
            idle_worker_ids.extend(upd)
        # if there are idle workers, use them to collect trajectories
        # and mark the newly busy workers as active
        while idle_worker_ids:
            idle_worker_id = idle_worker_ids.pop()
            worker = self._all_workers[idle_worker_id]
            active_workers.append(worker.rollout.remote())
        # check which workers are done/not done collecting a sample;
        # if any are done, send them to process the collected trajectory,
        # otherwise keep checking whether they are done
        ready, not_ready = ray.wait(active_workers,
                                    num_returns=1,
                                    timeout=0.001)
        active_workers = not_ready
        for result in ready:
            ready_worker_id, trajectory_batch = ray.get(result)
            idle_worker_ids.append(ready_worker_id)
            num_returned_samples = trajectory_batch.lengths.sum()
            completed_samples += num_returned_samples
            pbar.inc(num_returned_samples)
            batches.append(trajectory_batch)
    pbar.stop()
    return TrajectoryBatch.concatenate(*batches)
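# Illustrative usage sketch (an assumption, not part of the original source):
# one way the Ray-based obtain_samples above might be driven from a training
# loop. `sampler` and `algo` are hypothetical arguments; get_param_values()
# is taken from the earlier sampler variant, and TrajectoryBatch.lengths is
# used exactly as in the method body above.
def collect_training_batch(sampler, algo, itr, num_samples):
    # Broadcast the current policy parameters to every worker, then gather
    # at least `num_samples` environment steps as a single TrajectoryBatch.
    agent_update = algo.policy.get_param_values()
    batch = sampler.obtain_samples(itr, num_samples, agent_update)
    # `lengths` holds one entry per trajectory, so its sum is the number of
    # steps actually collected (it may slightly exceed num_samples).
    return batch, batch.lengths.sum()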
def obtain_exact_trajectories(self,
                              n_traj_per_worker,
                              agent_update,
                              env_update=None):
    """Sample an exact number of trajectories per worker.

    Args:
        n_traj_per_worker (int): Exact number of trajectories to gather
            for each worker.
        agent_update (object): Value which will be passed into the
            `agent_update_fn` before doing rollouts. If a list is passed
            in, it must have length exactly `factory.n_workers`, and will
            be spread across the workers.
        env_update (object): Value which will be passed into the
            `env_update_fn` before doing rollouts. If a list is passed
            in, it must have length exactly `factory.n_workers`, and will
            be spread across the workers.

    Returns:
        TrajectoryBatch: Batch of gathered trajectories. Always in worker
            order. In other words, first all trajectories from worker 0,
            then all trajectories from worker 1, etc.

    Raises:
        AssertionError: On internal errors.

    """
    pbar = ProgBarCounter(self._factory.n_workers)
    self._agent_version += 1
    updated_workers = set()
    agent_ups = self._factory.prepare_worker_messages(
        agent_update, cloudpickle.dumps)
    env_ups = self._factory.prepare_worker_messages(env_update)
    trajectories = defaultdict(list)
    while any(
            len(trajectories[i]) < n_traj_per_worker
            for i in range(self._factory.n_workers)):
        self._push_updates(updated_workers, agent_ups, env_ups)
        tag, contents = self._to_sampler.get()
        if tag == 'trajectory':
            batch, version, worker_n = contents
            if version == self._agent_version:
                if len(trajectories[worker_n]) < n_traj_per_worker:
                    trajectories[worker_n].append(batch)
                if len(trajectories[worker_n]) == n_traj_per_worker:
                    pbar.inc(1)
                    try:
                        self._to_worker[worker_n].put_nowait(('stop', ()))
                    except queue.Full:
                        pass
        else:
            raise AssertionError('Unknown tag {} with contents {}'.format(
                tag, contents))
    for q in self._to_worker:
        try:
            q.put_nowait(('stop', ()))
        except queue.Full:
            pass
    pbar.stop()
    ordered_trajectories = list(
        itertools.chain(
            *[trajectories[i] for i in range(self._factory.n_workers)]))
    return TrajectoryBatch.concatenate(*ordered_trajectories)
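# Illustrative usage sketch (an assumption, not part of the original source):
# requesting a fixed number of trajectories from every worker of the sampler
# above. `sampler` and `policy_params` are hypothetical arguments; the
# worker-ordered layout of the returned batch is documented in the method.
def collect_per_worker(sampler, policy_params, n_traj_per_worker=2):
    batch = sampler.obtain_exact_trajectories(n_traj_per_worker,
                                              agent_update=policy_params)
    # The returned TrajectoryBatch is ordered by worker: all trajectories
    # from worker 0 first, then worker 1, and so on.
    return batch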
def obtain_samples(self, itr, num_samples, agent_update, env_update=None):
    """Collect at least a given number of transitions (timesteps).

    Args:
        itr (int): The current iteration number. Using this argument is
            deprecated.
        num_samples (int): Minimum number of transitions / timesteps to
            sample.
        agent_update (object): Value which will be passed into the
            `agent_update_fn` before doing rollouts. If a list is passed
            in, it must have length exactly `factory.n_workers`, and will
            be spread across the workers.
        env_update (object): Value which will be passed into the
            `env_update_fn` before doing rollouts. If a list is passed
            in, it must have length exactly `factory.n_workers`, and will
            be spread across the workers.

    Returns:
        garage.TrajectoryBatch: The batch of collected trajectories.

    Raises:
        AssertionError: On internal errors.

    """
    del itr
    pbar = ProgBarCounter(num_samples)
    batches = []
    completed_samples = 0
    self._agent_version += 1
    updated_workers = set()
    agent_ups = self._factory.prepare_worker_messages(
        agent_update, cloudpickle.dumps)
    env_ups = self._factory.prepare_worker_messages(env_update)
    while completed_samples < num_samples:
        self._push_updates(updated_workers, agent_ups, env_ups)
        for _ in range(self._factory.n_workers):
            try:
                tag, contents = self._to_sampler.get_nowait()
                if tag == 'trajectory':
                    batch, version, worker_n = contents
                    del worker_n
                    if version == self._agent_version:
                        batches.append(batch)
                        num_returned_samples = batch.lengths.sum()
                        completed_samples += num_returned_samples
                        pbar.inc(num_returned_samples)
                    else:
                        # Receiving paths from previous iterations is
                        # normal. Potentially, we could gather them here,
                        # if an off-policy method wants them.
                        pass
                else:
                    raise AssertionError(
                        'Unknown tag {} with contents {}'.format(
                            tag, contents))
            except queue.Empty:
                pass
    for q in self._to_worker:
        try:
            q.put_nowait(('stop', ()))
        except queue.Full:
            pass
    pbar.stop()
    return TrajectoryBatch.concatenate(*batches)
def standard_eval(env,
                  policy,
                  n_episodes=20,
                  greedy=True,
                  load_from_file=False,
                  render=False,
                  recorder=None,
                  max_steps=10000):
    if recorder is not None:
        render = False  # force off
    if load_from_file:
        logger.add_output(dowel.StdOutput())
    logger.log('Evaluating policy, {} episodes, greedy = {} ...'.format(
        n_episodes, greedy))
    episode_rewards = []
    pbar = ProgBarCounter(n_episodes)
    for e in range(n_episodes):
        obs = env.reset()
        policy.reset([True])
        terminated = False
        t = 0
        episode_rewards.append(0)
        while not terminated:
            if render:
                env.render()
                # time.sleep(0.05)
            if recorder is not None:
                recorder.capture_frame()
            if not env.centralized:
                # obs.shape = (n_agents, n_envs, obs_dim)
                obs = torch.Tensor(obs).unsqueeze(1)  # add n_envs dim
                avail_actions = torch.Tensor(
                    env.get_avail_actions()).unsqueeze(1)
                actions, agent_infos = policy.get_actions(obs,
                                                          avail_actions,
                                                          greedy=greedy)
                if len(actions.shape) == 3:  # n-d action
                    actions = actions[:, 0, :]
                elif len(actions.shape) == 2:  # 1-d action
                    actions = actions[:, 0]
                obs, reward, terminated, info = env.step(
                    actions)  # n_env = 1
                terminated = all(terminated)
            else:
                # obs.shape = (n_envs, n_agents * obs_dim)
                obs = np.array([obs])
                avail_actions = np.array([env.get_avail_actions()])
                actions, agent_infos = policy.get_actions(obs,
                                                          avail_actions,
                                                          greedy=greedy)
                obs, reward, terminated, info = env.step(
                    actions[0])  # n_env = 1
            t += 1
            if t > max_steps:
                terminated = True
            episode_rewards[-1] += np.mean(reward)
        pbar.inc(1)
    pbar.stop()
    policy.reset([True])
    avg_return = np.mean(episode_rewards)
    logger.log('EvalAvgReturn: {}'.format(avg_return))
    if not load_from_file:
        tabular.record('EvalAvgReturn', avg_return)
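# Illustrative usage sketch (an assumption, not part of the original source):
# a typical call to standard_eval above. `env` and `policy` are hypothetical
# objects exposing the interfaces the function relies on (reset, step,
# get_avail_actions, centralized, and policy.get_actions / policy.reset).
def evaluate_policy(env, policy):
    # Greedy evaluation over 10 episodes, no rendering or video recording,
    # with a 1000-step cap per episode.
    standard_eval(env, policy, n_episodes=10, greedy=True, render=False,
                  max_steps=1000)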
def obtain_samples(self, itr):
    """Obtain samples for the given iteration."""
    logger.log("Obtaining samples for iteration %d..." % itr)
    paths = []
    n_samples = 0
    obses = self.vec_env.reset()
    dones = np.asarray([True] * self.vec_env.num_envs)
    running_paths = [None] * self.vec_env.num_envs
    pbar = ProgBarCounter(self.algo.batch_size)
    policy_time = 0
    env_time = 0
    process_time = 0
    policy = self.algo.policy
    import time
    while n_samples < self.algo.batch_size:
        t = time.time()
        policy.reset(dones)
        actions, agent_infos = policy.get_actions(obses)
        policy_time += time.time() - t
        t = time.time()
        next_obses, rewards, dones, env_infos = self.vec_env.step(actions)
        env_time += time.time() - t
        t = time.time()
        agent_infos = tensor_utils.split_tensor_dict_list(agent_infos)
        env_infos = tensor_utils.split_tensor_dict_list(env_infos)
        if env_infos is None:
            env_infos = [dict() for _ in range(self.vec_env.num_envs)]
        if agent_infos is None:
            agent_infos = [dict() for _ in range(self.vec_env.num_envs)]
        for idx, observation, action, reward, next_observation, env_info, agent_info, done in zip(  # noqa: E501
                itertools.count(), obses, actions, rewards, next_obses,
                env_infos, agent_infos, dones):
            if running_paths[idx] is None:
                running_paths[idx] = dict(
                    observations=[],
                    actions=[],
                    rewards=[],
                    next_observations=[],
                    env_infos=[],
                    agent_infos=[],
                )
            running_paths[idx]["observations"].append(observation)
            running_paths[idx]["actions"].append(action)
            running_paths[idx]["rewards"].append(reward)
            running_paths[idx]["next_observations"].append(
                next_observation)
            running_paths[idx]["env_infos"].append(env_info)
            running_paths[idx]["agent_infos"].append(agent_info)
            if done:
                paths.append(
                    dict(observations=self.env_spec.observation_space.
                         flatten_n(running_paths[idx]["observations"]),
                         actions=self.env_spec.action_space.flatten_n(
                             running_paths[idx]["actions"]),
                         rewards=tensor_utils.stack_tensor_list(
                             running_paths[idx]["rewards"]),
                         next_observation=tensor_utils.stack_tensor_list(
                             running_paths[idx]["next_observations"]),
                         env_infos=tensor_utils.stack_tensor_dict_list(
                             running_paths[idx]["env_infos"]),
                         agent_infos=tensor_utils.stack_tensor_dict_list(
                             running_paths[idx]["agent_infos"])))
                n_samples += len(running_paths[idx]["rewards"])
                running_paths[idx] = None
        process_time += time.time() - t
        pbar.inc(len(obses))
        obses = next_obses
    pbar.stop()
    logger.record_tabular("PolicyExecTime", policy_time)
    logger.record_tabular("EnvExecTime", env_time)
    logger.record_tabular("ProcessExecTime", process_time)
    return paths
def eval(self,
         policy,
         n_episodes=20,
         greedy=True,
         load_from_file=False,
         max_steps=60):
    import dowel
    from dowel import logger, tabular
    from garage.misc.prog_bar_counter import ProgBarCounter
    if load_from_file:
        logger.add_output(dowel.StdOutput())
    logger.log('Evaluating policy, {} episodes, greedy = {} ...'.format(
        n_episodes, greedy))
    episode_rewards = []
    success = 0
    pbar = ProgBarCounter(n_episodes)
    for e in range(n_episodes):
        obs = self.reset()
        policy.reset([True])
        terminated = False
        t = 0
        episode_rewards.append(0)
        while not terminated:
            if not self.centralized:
                # obs.shape = (n_agents, n_envs, obs_dim)
                obs = torch.Tensor(obs).unsqueeze(1)  # add n_envs dim
                avail_actions = torch.Tensor(
                    self.get_avail_actions()).unsqueeze(1)
                actions, agent_infos = policy.get_actions(obs,
                                                          avail_actions,
                                                          greedy=greedy)
                if len(actions.shape) == 3:  # n-d action
                    actions = actions[:, 0, :]
                elif len(actions.shape) == 2:  # 1-d action
                    actions = actions[:, 0]
                obs, reward, terminated, info = self.step(
                    actions)  # n_env = 1
                terminated = all(terminated)
            else:
                # obs.shape = (n_envs, n_agents * obs_dim)
                obs = np.array([obs])
                avail_actions = np.array([self.get_avail_actions()])
                actions, agent_infos = policy.get_actions(obs,
                                                          avail_actions,
                                                          greedy=greedy)
                obs, reward, terminated, info = self.step(
                    actions[0])  # n_env = 1
            t += 1
            if t >= max_steps:
                terminated = True
            episode_rewards[-1] += np.mean(reward)
        # episode end
        success += self.stat['success']
        pbar.inc(1)
    pbar.stop()
    policy.reset([True])
    avg_return = np.mean(episode_rewards)
    success = success / n_episodes
    logger.log('EvalAvgReturn: {}'.format(avg_return))
    logger.log('EvalSuccessRate: {}'.format(success))
    if not load_from_file:
        tabular.record('EvalAvgReturn', avg_return)
        tabular.record('EvalSuccessRate', success)
def obtain_samples(self, itr, batch_size=None, whole_paths=True):
    """Sample the policy for new trajectories.

    If batch_size is not specified, episodes per task default to 1, so
    batch size will be meta_batch_size * max_path_length.

    When the number of workers is less than the meta batch size, sampling
    will be performed for each of self._vec_envs_indices in series. The
    i-th value of self._vec_envs_indices represents the indices of the
    environments/tasks to be sampled for the i-th iteration.

    Args:
        itr (int): Iteration number.
        batch_size (int): Number of samples to be collected. If None,
            it will default to algo.max_path_length * n_envs.
        whole_paths (bool): Whether to return all the paths or not. True
            by default. It's possible for the paths to have a total
            actual sample size larger than batch_size; they will be
            truncated if this flag is True.

    Returns:
        OrderedDict: Sample paths. Key represents the index of the
            environment/task and value represents all the paths sampled
            from that particular environment/task.

    Note:
        Each path is a dictionary, with keys and values as following:
            * observations: numpy.ndarray with shape :math:`[N, S^*]`
            * actions: numpy.ndarray with shape :math:`[N, S^*]`
            * rewards: numpy.ndarray with shape :math:`[N, S^*]`
            * dones: numpy.ndarray with shape :math:`[N, S^*]`
            * env_infos: A dictionary with each key representing one
              environment info, value being a numpy.ndarray with shape
              :math:`[N, S^*]`. One example is "ale.lives" for atari
              environments.
            * agent_infos: A dictionary with each key representing one
              agent info, value being a numpy.ndarray with shape
              :math:`[N, S^*]`. One example is "prev_action", which is
              used for recurrent policy as previous action input, merged
              with the observation input as the state input.

    """
    logger.log('Obtaining samples for iteration %d...' % itr)

    if batch_size is None:
        batch_size = self.algo.max_path_length * self._meta_batch_size

    paths = []
    tasks = self.env.sample_tasks(self._meta_batch_size)

    # Start main loop
    batch_size_per_loop = batch_size // len(self._vec_envs_indices)
    for vec_envs_indices in self._vec_envs_indices:
        self._setup_worker(vec_envs_indices, tasks)

        n_samples = 0
        obses = self._vec_env.reset()
        dones = np.asarray([True] * self._vec_env.num_envs)
        running_paths = [None] * self._vec_env.num_envs

        pbar = ProgBarCounter(batch_size)
        policy_time = 0
        env_time = 0
        process_time = 0

        policy = self.algo.policy
        # Only reset policies at the beginning of a meta batch
        policy.reset(dones)

        while n_samples < batch_size_per_loop:
            t = time.time()
            actions, agent_infos = policy.get_actions(obses)
            policy_time += time.time() - t
            t = time.time()
            next_obses, rewards, dones, env_infos = self._vec_env.step(
                actions)
            env_time += time.time() - t
            t = time.time()

            agent_infos = tensor_utils.split_tensor_dict_list(agent_infos)
            env_infos = tensor_utils.split_tensor_dict_list(env_infos)
            if env_infos is None:
                env_infos = [dict() for _ in range(self._vec_env.num_envs)]
            if agent_infos is None:
                agent_infos = [
                    dict() for _ in range(self._vec_env.num_envs)
                ]
            for idx, observation, action, reward, env_info, agent_info, done in zip(  # noqa: E501
                    itertools.count(), obses, actions, rewards, env_infos,
                    agent_infos, dones):
                if running_paths[idx] is None:
                    running_paths[idx] = dict(
                        observations=[],
                        actions=[],
                        rewards=[],
                        dones=[],
                        env_infos=[],
                        agent_infos=[],
                    )
                running_paths[idx]['observations'].append(observation)
                running_paths[idx]['actions'].append(action)
                running_paths[idx]['rewards'].append(reward)
                running_paths[idx]['dones'].append(done)
                running_paths[idx]['env_infos'].append(env_info)
                running_paths[idx]['agent_infos'].append(agent_info)
                if done:
                    obs = np.asarray(running_paths[idx]['observations'])
                    actions = np.asarray(running_paths[idx]['actions'])
                    paths.append(
                        dict(observations=obs,
                             actions=actions,
                             rewards=np.asarray(
                                 running_paths[idx]['rewards']),
                             dones=np.asarray(running_paths[idx]['dones']),
                             env_infos=tensor_utils.stack_tensor_dict_list(
                                 running_paths[idx]['env_infos']),
                             agent_infos=tensor_utils.
                             stack_tensor_dict_list(
                                 running_paths[idx]['agent_infos']),
                             batch_idx=idx))
                    n_samples += len(running_paths[idx]['rewards'])
                    running_paths[idx] = None
            process_time += time.time() - t
            pbar.inc(len(obses))
            obses = next_obses
        pbar.stop()

    tabular.record('PolicyExecTime', policy_time)
    tabular.record('EnvExecTime', env_time)
    tabular.record('ProcessExecTime', process_time)

    return paths if whole_paths else truncate_paths(paths, batch_size)