def policy(self) -> Policy:
    """
    Generates (and caches) the push policy from the loaded model based on
    the given environment and client configuration.
    """
    # check if the policy has been cached
    if self._policy is not None:
        return self._policy

    # create the environment that the agent acts in
    env = Environment(self.config)

    # keep querying the agent until the policy is complete
    obs, action, reward, completed = (env.observation,
                                      env.action_space.sample(), 0, False)

    # initialize LSTM state if applicable
    policy_map = self.agent.workers.local_worker().policy_map if hasattr(
        self.agent, "workers") else None
    state = policy_map[DEFAULT_POLICY_ID].get_initial_state(
    ) if policy_map else []
    use_lstm = len(state) > 0

    while not completed:
        # query the agent for an action
        if use_lstm:
            action, state, _ = self.agent.compute_action(
                obs, state=state, prev_action=action, prev_reward=reward)
        else:
            action = self.agent.compute_action(
                obs, prev_action=action, prev_reward=reward)
        action = _flatten_action(action)

        # deflate and apply the action
        obs, reward, completed, _ = env.step(action)

    self._policy = env.policy
    return self._policy
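# The method above caches the result of a potentially expensive rollout on
# first access. A minimal sketch of the same cache-then-compute pattern, with
# a stand-in `_make_policy` in place of the real environment rollout (the
# class and helper names here are hypothetical, for illustration only):

class PolicyHolder:
    def __init__(self, config):
        self.config = config
        self._policy = None  # filled lazily on first access

    def policy(self):
        # return the cached value if we already computed it
        if self._policy is not None:
            return self._policy
        self._policy = self._make_policy()
        return self._policy

    def _make_policy(self):
        # stand-in for the agent/environment loop above
        return {"rules": [], "config": self.config}


holder = PolicyHolder({"env": "demo"})
assert holder.policy() is holder.policy()  # second call hits the cache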
def __init__(self, observation_space, action_space, config):
    assert tf.executing_eagerly()
    Policy.__init__(self, observation_space, action_space, config)
    self._is_training = False
    self._loss_initialized = False
    self._sess = None

    if get_default_config:
        config = dict(get_default_config(), **config)

    if before_init:
        before_init(self, observation_space, action_space, config)

    self.config = config

    if action_sampler_fn:
        if not make_model:
            raise ValueError(
                "make_model is required if action_sampler_fn is given")
        self.dist_class = None
    else:
        self.dist_class, logit_dim = ModelCatalog.get_action_dist(
            action_space, self.config["model"])

    if make_model:
        self.model = make_model(self, observation_space, action_space,
                                config)
    else:
        self.model = ModelCatalog.get_model_v2(
            observation_space,
            action_space,
            logit_dim,
            config["model"],
            framework="tf",
        )

    # Run a dummy forward pass with a batch of one so the model variables
    # are created before the loss is initialized.
    self.model(
        {
            SampleBatch.CUR_OBS: tf.convert_to_tensor(
                np.array([observation_space.sample()])),
            SampleBatch.PREV_ACTIONS: tf.convert_to_tensor(
                [_flatten_action(action_space.sample())]),
            SampleBatch.PREV_REWARDS: tf.convert_to_tensor([0.]),
        }, [
            tf.convert_to_tensor([s])
            for s in self.model.get_initial_state()
        ], tf.convert_to_tensor([1]))

    if before_loss_init:
        before_loss_init(self, observation_space, action_space, config)

    self._initialize_loss_with_dummy_batch()
    self._loss_initialized = True

    if optimizer_fn:
        self._optimizer = optimizer_fn(self, config)
    else:
        self._optimizer = tf.train.AdamOptimizer(config["lr"])

    if after_init:
        after_init(self, observation_space, action_space, config)
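# The dummy forward pass above feeds `_flatten_action(action_space.sample())`
# as the PREV_ACTIONS input. A rough re-implementation of what such a helper
# does (this sketch only assumes that tuple actions are concatenated into one
# flat array; it is not the library's own code):

import numpy as np


def flatten_action_sketch(action):
    # tuple/list actions are flattened and concatenated; everything else
    # passes through unchanged
    if isinstance(action, (list, tuple)):
        return np.concatenate(
            [np.asarray(a, dtype=np.float32).ravel() for a in action])
    return action


assert flatten_action_sketch((np.array([1.0, 2.0]), 3.0)).shape == (3,)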
def rollout(agent,
            env_name,
            num_steps,
            num_episodes=0,
            saver=None,
            no_render=True,
            video_dir=None):
    # Adapted from https://github.com/AIcrowd/neurips2020-procgen-starter-kit/blob/master/rollout.py#L349
    policy_agent_mapping = default_policy_agent_mapping

    if saver is None:
        saver = RolloutSaver()

    if hasattr(agent, "workers") and isinstance(agent.workers, WorkerSet):
        # env = agent.workers.local_worker().env
        env = gym.make(env_name, render_mode="rgb_array")
        multiagent = isinstance(env, MultiAgentEnv)
        if agent.workers.local_worker().multiagent:
            policy_agent_mapping = agent.config["multiagent"][
                "policy_mapping_fn"]

        policy_map = agent.workers.local_worker().policy_map
        state_init = {p: m.get_initial_state() for p, m in policy_map.items()}
        use_lstm = {p: len(s) > 0 for p, s in state_init.items()}
    else:
        env = gym.make(env_name)
        multiagent = False
        try:
            policy_map = {DEFAULT_POLICY_ID: agent.policy}
        except AttributeError:
            raise AttributeError(
                "Agent ({}) does not have a `policy` property! This is needed "
                "for performing (trained) agent rollouts.".format(agent))
        use_lstm = {DEFAULT_POLICY_ID: False}

    action_init = {
        p: _flatten_action(m.action_space.sample())
        for p, m in policy_map.items()
    }

    steps = 0
    episodes = 0
    rgb_array = []
    while keep_going(steps, num_steps, episodes, num_episodes):
        mapping_cache = {}  # in case policy_agent_mapping is stochastic
        saver.begin_rollout()
        obs = env.reset()
        agent_states = DefaultMapping(
            lambda agent_id: state_init[mapping_cache[agent_id]])
        prev_actions = DefaultMapping(
            lambda agent_id: action_init[mapping_cache[agent_id]])
        prev_rewards = collections.defaultdict(lambda: 0.)
        done = False
        reward_total = 0.0
        episode_steps = 0
        while not done and keep_going(steps, num_steps, episodes,
                                      num_episodes):
            multi_obs = obs if multiagent else {_DUMMY_AGENT_ID: obs}
            action_dict = {}
            for agent_id, a_obs in multi_obs.items():
                if a_obs is not None:
                    policy_id = mapping_cache.setdefault(
                        agent_id, policy_agent_mapping(agent_id))
                    p_use_lstm = use_lstm[policy_id]
                    if p_use_lstm:
                        a_action, p_state, _ = agent.compute_action(
                            a_obs,
                            state=agent_states[agent_id],
                            prev_action=prev_actions[agent_id],
                            prev_reward=prev_rewards[agent_id],
                            policy_id=policy_id)
                        agent_states[agent_id] = p_state
                    else:
                        a_action = agent.compute_action(
                            a_obs,
                            prev_action=prev_actions[agent_id],
                            prev_reward=prev_rewards[agent_id],
                            policy_id=policy_id)
                    a_action = _flatten_action(a_action)  # tuple actions
                    action_dict[agent_id] = a_action
                    prev_actions[agent_id] = a_action
            action = action_dict
            action = action if multiagent else action[_DUMMY_AGENT_ID]
            next_obs, reward, done, info = env.step(action)
            episode_steps += 1
            if multiagent:
                for agent_id, r in reward.items():
                    prev_rewards[agent_id] = r
            else:
                prev_rewards[_DUMMY_AGENT_ID] = reward
            if multiagent:
                done = done["__all__"]
                reward_total += sum(reward.values())
            else:
                reward_total += reward
            if not no_render:
                rgb_array.append(env.render(mode='rgb_array'))
            saver.append_step(obs, action, next_obs, reward, done, info)
            steps += 1
            obs = next_obs
        saver.end_rollout()
        print("Episode #{}: reward: {} steps: {}".format(
            episodes, reward_total, episode_steps))
        if done:
            episodes += 1
    return rgb_array
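# Both loops above are guarded by `keep_going(...)`. A sketch of the guard as
# it is used here (stop on whichever limit is configured; treat a zero limit
# as "no limit"); this is an illustrative re-implementation, not the
# library's own code:

def keep_going_sketch(steps, num_steps, episodes, num_episodes):
    if num_episodes:
        return episodes < num_episodes
    if num_steps:
        return steps < num_steps
    return True  # no limit configured


assert keep_going_sketch(steps=10, num_steps=0, episodes=1, num_episodes=3)
assert not keep_going_sketch(steps=10, num_steps=10, episodes=0,
                             num_episodes=0)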
def visualize_adversaries(config_out_dir, checkpoint_num, grid_size,
                          num_rollouts, outdir, plot_base_case, extension):
    agent_list = []
    index = 0
    # max_index = 20000
    multiagent = True
    for (dirpath, dirnames, filenames) in os.walk(config_out_dir):
        if "params.pkl" in filenames:
            # if index > max_index:
            #     break
            rllib_config, checkpoint = get_config_from_path(
                dirpath, checkpoint_num)
            env, agent, multiagent, use_lstm, policy_agent_mapping, \
                state_init, action_init = \
                instantiate_rollout(rllib_config, checkpoint)
            agent_list.append(agent)
            # index += 1

    # figure out how many adversaries you have and initialize their grids
    num_adversaries = env.num_adv_strengths * env.advs_per_strength
    if plot_base_case:
        policy_correlation_grid = np.zeros(
            (len(agent_list), len(agent_list) + 1))
    else:
        policy_correlation_grid = np.zeros(
            (len(agent_list), len(agent_list)))

    if plot_base_case:
        adversary_loop_index = len(agent_list) + 1
    else:
        adversary_loop_index = len(agent_list)

    for agent_row_index in range(len(agent_list)):
        print('Outer index {}'.format(agent_row_index))
        for adversary_col_index in range(adversary_loop_index):
            print('Inner index {}'.format(adversary_col_index))
            reward_total = 0.0
            for adversary_index in range(num_adversaries):
                env.curr_adversary = adversary_index
                # turn the adversaries off for the last column
                if adversary_col_index == len(agent_list):
                    env.curr_adversary = -1

                # actually do the rollouts
                for r_itr in range(num_rollouts):
                    print('On iteration {}'.format(r_itr))
                    mapping_cache = {}  # in case policy_agent_mapping is stochastic
                    agent_states = DefaultMapping(
                        lambda agent_id: state_init[mapping_cache[agent_id]])
                    prev_actions = DefaultMapping(
                        lambda agent_id: action_init[mapping_cache[agent_id]])
                    obs = env.reset()
                    prev_rewards = collections.defaultdict(lambda: 0.)
                    done = False
                    step_num = 0
                    while not done:
                        multi_obs = obs if multiagent else {
                            _DUMMY_AGENT_ID: obs
                        }
                        new_obs = {'agent': multi_obs['agent']}
                        # turn the adversaries off for the last column
                        if adversary_col_index < len(agent_list):
                            new_obs.update({
                                'adversary{}'.format(adversary_index):
                                multi_obs['adversary{}'.format(
                                    adversary_index)]
                            })
                        action_dict = {}
                        for agent_id, a_obs in new_obs.items():
                            if 'agent' in agent_id:
                                if a_obs is not None:
                                    policy_id = mapping_cache.setdefault(
                                        agent_id,
                                        policy_agent_mapping(agent_id))
                                    p_use_lstm = use_lstm[policy_id]
                                    if p_use_lstm:
                                        prev_action = _flatten_action(
                                            prev_actions[agent_id])
                                        a_action, p_state, _ = agent_list[
                                            agent_row_index].compute_action(
                                                a_obs,
                                                state=agent_states[agent_id],
                                                prev_action=prev_action,
                                                prev_reward=prev_rewards[
                                                    agent_id],
                                                policy_id=policy_id)
                                        agent_states[agent_id] = p_state
                                    else:
                                        prev_action = _flatten_action(
                                            prev_actions[agent_id])
                                        flat_obs = _flatten_action(a_obs)
                                        a_action = agent_list[
                                            agent_row_index].compute_action(
                                                flat_obs,
                                                prev_action=prev_action,
                                                prev_reward=prev_rewards[
                                                    agent_id],
                                                policy_id=policy_id)
                            else:
                                if a_obs is not None:
                                    policy_id = mapping_cache.setdefault(
                                        agent_id,
                                        policy_agent_mapping(agent_id))
                                    p_use_lstm = use_lstm[policy_id]
                                    if p_use_lstm:
                                        prev_action = _flatten_action(
                                            prev_actions[agent_id])
                                        a_action, p_state, _ = agent_list[
                                            adversary_col_index].compute_action(
                                                a_obs,
                                                state=agent_states[agent_id],
                                                prev_action=prev_action,
                                                prev_reward=prev_rewards[
                                                    agent_id],
                                                policy_id=policy_id)
                                        agent_states[agent_id] = p_state
                                    else:
                                        prev_action = _flatten_action(
                                            prev_actions[agent_id])
                                        flat_obs = _flatten_action(a_obs)
                                        a_action = agent_list[
                                            adversary_col_index].compute_action(
                                                flat_obs,
                                                prev_action=prev_action,
                                                prev_reward=prev_rewards[
                                                    agent_id],
                                                policy_id=policy_id)
                            action_dict[agent_id] = a_action
                            prev_action = _flatten_action(
                                a_action)  # tuple actions
                            prev_actions[agent_id] = prev_action
                        action = action_dict
                        action = action if multiagent else action[
                            _DUMMY_AGENT_ID]
                        next_obs, reward, done, info = env.step(action)
                        if isinstance(done, dict):
                            done = done['__all__']
                        step_num += 1
                        if multiagent:
                            for agent_id, r in reward.items():
                                prev_rewards[agent_id] = r
                        else:
                            prev_rewards[_DUMMY_AGENT_ID] = reward
                        # we only want the robot reward, not the adversary
                        # reward
                        reward_total += info['agent']['agent_reward']
                        obs = next_obs
            policy_correlation_grid[agent_row_index, adversary_col_index] = \
                reward_total / (num_rollouts * num_adversaries)

    file_path = os.path.dirname(os.path.abspath(__file__))
    output_file_path = os.path.join(file_path, outdir)
    if not os.path.exists(output_file_path):
        try:
            os.makedirs(os.path.dirname(output_file_path))
        except OSError as exc:
            if exc.errno != errno.EEXIST:
                raise

    # increasing the row index implies moving down on the y axis
    plt.imshow(policy_correlation_grid,
               interpolation='nearest',
               cmap='seismic',
               aspect='equal',
               vmin=400,
               vmax=3600)
    plt.colorbar()
    fontsize = 14
    title_fontsize = 16
    plt.title('Policy Correlation Matrix', fontsize=title_fontsize)
    if plot_base_case:
        plt.yticks(ticks=np.arange(len(agent_list)))
        # label the extra, adversary-free column as the base case
        plt.xticks(ticks=np.arange(len(agent_list) + 1),
                   labels=[str(i) for i in range(len(agent_list))] + ['base'])
    else:
        plt.yticks(ticks=np.arange(len(agent_list)))
        plt.xticks(ticks=np.arange(len(agent_list)))
    plt.ylabel('agent index', fontsize=fontsize)
    plt.xlabel('adversary index', fontsize=fontsize)
    output_str = '{}/{}'.format(
        os.path.abspath(os.path.expanduser(outdir)),
        'policy_correlation_map_{}.png'.format(extension))
    with open(
            '{}/{}'.format(os.path.abspath(os.path.expanduser(outdir)),
                           'results_{}'.format(extension)), 'wb') as file:
        np.savetxt(file, policy_correlation_grid)
    plt.tight_layout()
    plt.grid(False)
    plt.savefig(output_str)
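# The plotting code above renders the grid with one extra "base" column when
# plot_base_case is set. A self-contained sketch of that labeling scheme on
# synthetic data (matplotlib only; the 400-3600 value range mirrors the
# vmin/vmax used above, and the file name is a placeholder):

import numpy as np
import matplotlib
matplotlib.use('Agg')  # headless backend so this runs without a display
import matplotlib.pyplot as plt

n_agents = 4
grid = np.random.uniform(400, 3600, size=(n_agents, n_agents + 1))

plt.imshow(grid, interpolation='nearest', cmap='seismic', aspect='equal',
           vmin=400, vmax=3600)
plt.colorbar()
plt.yticks(ticks=np.arange(n_agents))
# one tick per adversary plus the adversary-free base column
plt.xticks(ticks=np.arange(n_agents + 1),
           labels=[str(i) for i in range(n_agents)] + ['base'])
plt.savefig('demo_policy_correlation.png')
plt.close()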
def visualizer_rllib(args):
    """Visualizer for RLlib experiments.

    This function takes args (see function create_parser below for more
    detailed information on what information can be fed to this visualizer),
    and renders the experiment associated with it.
    """
    result_dir = args.result_dir if args.result_dir[-1] != '/' \
        else args.result_dir[:-1]

    config = get_rllib_pkl(result_dir)

    # Run on only one cpu for rendering purposes
    config['num_workers'] = 0

    flow_params = get_flow_params(config)

    # Determine agent and checkpoint
    config_run = config['env_config']['run'] if 'run' in config['env_config'] \
        else None
    if args.run and config_run:
        if args.run != config_run:
            print('visualizer_rllib.py: error: run argument '
                  + '\'{}\' passed in '.format(args.run)
                  + 'differs from the one stored in params.json '
                  + '\'{}\''.format(config_run))
            sys.exit(1)
    if args.run:
        agent_cls = get_agent_class(args.run)
    elif config_run:
        agent_cls = get_agent_class(config_run)
    else:
        print('visualizer_rllib.py: error: could not find flow parameter '
              '\'run\' in params.json, '
              'add argument --run to provide the algorithm or model used '
              'to train the results\n e.g. '
              'python ./visualizer_rllib.py /tmp/ray/result_dir 1 --run PPO')
        sys.exit(1)

    sim_params = flow_params['sim']
    sim_params.restart_instance = True
    dir_path = os.path.dirname(os.path.realpath(__file__))
    emission_path = '{0}/test_time_rollout/'.format(dir_path)
    sim_params.emission_path = emission_path if args.gen_emission else None

    # pick your rendering mode
    if args.render_mode == 'sumo_web3d':
        sim_params.num_clients = 2
        sim_params.render = False
    elif args.render_mode == 'drgb':
        sim_params.render = 'drgb'
        sim_params.pxpm = 4
    elif args.render_mode == 'sumo_gui':
        sim_params.render = True
        print('NOTE: With render mode {}, an extra instance of the SUMO GUI '
              'will display before the GUI for visualizing the result. Click '
              'the green Play arrow to continue.'.format(args.render_mode))
    elif args.render_mode == 'no_render':
        sim_params.render = False
    if args.save_render:
        sim_params.render = 'drgb'
        sim_params.pxpm = 4
        sim_params.save_render = True

    # Start the environment with the gui turned on and a path for the
    # emission file
    env_params = flow_params['env']
    sim_params.restart_instance = False
    if args.evaluate:
        env_params.evaluate = True

    # lower the horizon if testing
    if args.horizon:
        config['horizon'] = args.horizon
        env_params.horizon = args.horizon

    # Create and register a gym+rllib env
    create_env, env_name = make_create_env(params=flow_params, version=0)
    register_env(env_name, create_env)

    # create the agent that will be used to compute the actions
    agent = agent_cls(env=env_name, config=config)
    checkpoint = result_dir + '/checkpoint_' + args.checkpoint_num
    checkpoint = checkpoint + '/checkpoint-' + args.checkpoint_num
    agent.restore(checkpoint)

    # Simulate and collect metrics
    final_outflows = []
    final_inflows = []
    mean_speed = []
    std_speed = []

    policy_agent_mapping = default_policy_agent_mapping
    if hasattr(agent, "workers"):
        env = agent.workers.local_worker().env
        multiagent = isinstance(env, MultiAgentEnv)
        if agent.workers.local_worker().multiagent:
            policy_agent_mapping = agent.config["multiagent"][
                "policy_mapping_fn"]
        policy_map = agent.workers.local_worker().policy_map
        state_init = {p: m.get_initial_state() for p, m in policy_map.items()}
        use_lstm = {p: len(s) > 0 for p, s in state_init.items()}
        action_init = {
            p: m.action_space.sample()
            for p, m in policy_map.items()
        }
    else:
        env = gym.make(env_name)
        multiagent = False
        use_lstm = {DEFAULT_POLICY_ID: False}

    steps = 0
    for i in range(args.num_rollouts):
        vel = []
        mapping_cache = {}  # in case policy_agent_mapping is stochastic
        reward_dict = {}
        obs = env.reset()
        agent_states = DefaultMapping(
            lambda agent_id: state_init[mapping_cache[agent_id]])
        prev_actions = DefaultMapping(
            lambda agent_id: action_init[mapping_cache[agent_id]])
        prev_rewards = collections.defaultdict(lambda: 0.)
        done = False
        reward_total = 0.0
        while not done and steps < (env_params.horizon or steps + 1):
            vehicles = env.unwrapped.k.vehicle
            vel.append(np.mean(vehicles.get_speed(vehicles.get_ids())))
            multi_obs = obs if multiagent else {_DUMMY_AGENT_ID: obs}
            action_dict = {}
            for agent_id, a_obs in multi_obs.items():
                if a_obs is not None:
                    policy_id = mapping_cache.setdefault(
                        agent_id, policy_agent_mapping(agent_id))
                    p_use_lstm = use_lstm[policy_id]
                    if p_use_lstm:
                        a_action, p_state, _ = agent.compute_action(
                            a_obs,
                            state=agent_states[agent_id],
                            prev_action=prev_actions[agent_id],
                            prev_reward=prev_rewards[agent_id],
                            policy_id=policy_id)
                        agent_states[agent_id] = p_state
                    else:
                        a_action = agent.compute_action(
                            a_obs,
                            prev_action=prev_actions[agent_id],
                            prev_reward=prev_rewards[agent_id],
                            policy_id=policy_id)
                    a_action = _flatten_action(a_action)  # tuple actions
                    action_dict[agent_id] = a_action
                    prev_actions[agent_id] = a_action
            action = action_dict
            action = action if multiagent else action[_DUMMY_AGENT_ID]
            next_obs, reward, done, _ = env.step(action)
            if multiagent:
                for agent_id, r in reward.items():
                    prev_rewards[agent_id] = r
            else:
                prev_rewards[_DUMMY_AGENT_ID] = reward
            if multiagent:
                done = done["__all__"]
                reward_total += sum(reward.values())
            else:
                reward_total += reward
            steps += 1
            obs = next_obs
        outflow = vehicles.get_outflow_rate(500)
        final_outflows.append(outflow)
        inflow = vehicles.get_inflow_rate(500)
        final_inflows.append(inflow)
        if np.all(np.array(final_inflows) > 1e-5):
            throughput_efficiency = [
                x / y for x, y in zip(final_outflows, final_inflows)
            ]
        else:
            throughput_efficiency = [0] * len(final_inflows)
        mean_speed.append(np.mean(vel))
        std_speed.append(np.std(vel))
        print("Episode reward", reward_total)

    print('==== Summary of results ====')
    print(mean_speed)
    # if multiagent:
    #     for agent_id, rew in rets.items():
    #         print('For agent', agent_id)
    #         print(rew)
    #         print('Average, std return: {}, {} for agent {}'.format(
    #             np.mean(rew), np.std(rew), agent_id))
    # else:
    #     print(rets)
    #     print('Average, std: {}, {}'.format(
    #         np.mean(rets), np.std(rets)))
    print("\nSpeed, mean (m/s): {}".format(mean_speed))
    print('Average, std: {}, {}'.format(np.mean(mean_speed),
                                        np.std(mean_speed)))
    print("\nSpeed, std (m/s): {}".format(std_speed))
    print('Average, std: {}, {}'.format(np.mean(std_speed),
                                        np.std(std_speed)))

    # Compute arrival rate of vehicles in the last 500 sec of the run
    print("\nOutflows (veh/hr): {}".format(final_outflows))
    print('Average, std: {}, {}'.format(np.mean(final_outflows),
                                        np.std(final_outflows)))
    # Compute departure rate of vehicles in the last 500 sec of the run
    print("Inflows (veh/hr): {}".format(final_inflows))
    print('Average, std: {}, {}'.format(np.mean(final_inflows),
                                        np.std(final_inflows)))
    # Compute throughput efficiency in the last 500 sec of the run
    print("Throughput efficiency (veh/hr): {}".format(throughput_efficiency))
    print('Average, std: {}, {}'.format(np.mean(throughput_efficiency),
                                        np.std(throughput_efficiency)))

    # terminate the environment
    env.unwrapped.terminate()

    # if prompted, convert the emission file into a csv file
    if args.gen_emission:
        time.sleep(0.1)
        dir_path = os.path.dirname(os.path.realpath(__file__))
        emission_filename = '{0}-emission.xml'.format(env.scenario.name)
        emission_path = \
            '{0}/test_time_rollout/{1}'.format(dir_path, emission_filename)
        emission_to_csv(emission_path)

    # if we wanted to save the render, here we create the movie
    if args.save_render:
        dirs = os.listdir(os.path.expanduser('~') + '/flow_rendering')
        # Ignore hidden files
        dirs = [d for d in dirs if d[0] != '.']
        dirs.sort(key=lambda date: datetime.strptime(date, "%Y-%m-%d-%H%M%S"))
        recent_dir = dirs[-1]
        # create the movie
        movie_dir = os.path.expanduser('~') + '/flow_rendering/' + recent_dir
        save_dir = os.path.expanduser('~') + '/flow_movies'
        if not os.path.exists(save_dir):
            os.mkdir(save_dir)
        os_cmd = "cd " + movie_dir + " && ffmpeg -i frame_%06d.png"
        os_cmd += " -pix_fmt yuv420p " + dirs[-1] + ".mp4"
        os_cmd += " && cp " + dirs[-1] + ".mp4 " + save_dir + "/"
        os.system(os_cmd)
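# The movie-creation step above shells out through os.system with manual
# string concatenation, which is easy to get wrong (the original was missing
# a space before "&& cp"). A sketch of the same pipeline built as argument
# lists with subprocess, under the same frame-naming assumption
# (frame_%06d.png inside movie_dir); the helper name is hypothetical:

import shutil
import subprocess


def frames_to_movie(movie_dir, save_dir, name):
    out = '{}/{}.mp4'.format(movie_dir, name)
    # run ffmpeg inside the frame directory, then copy the result over
    subprocess.run(
        ['ffmpeg', '-i', 'frame_%06d.png', '-pix_fmt', 'yuv420p', out],
        cwd=movie_dir, check=True)
    shutil.copy(out, save_dir)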
def sample_actions(rllib_config, checkpoint, num_samples, outdir):
    env, agent, multiagent, use_lstm, policy_agent_mapping, state_init, \
        action_init = instantiate_rollout(rllib_config, checkpoint)
    reset_env(env)

    # figure out how many adversaries you have and initialize their grids
    num_adversaries = env.num_adv_strengths * env.advs_per_strength
    adversary_grid_dict = {}
    for i in range(num_adversaries):
        adversary_str = 'adversary' + str(i)
        # each adversary grid is a map of agent action versus observation
        # dimension
        adversary_grid_dict[adversary_str] = {
            'sampled_actions':
            np.zeros((num_samples, env.adv_action_space.shape[0]))
        }
    agent_dict = {
        'agent': {
            'sampled_actions':
            np.zeros((num_samples, env.action_space.shape[0]))
        }
    }

    mapping_cache = {}  # in case policy_agent_mapping is stochastic
    sample_idx = 0
    prev_actions = DefaultMapping(
        lambda agent_id: action_init[mapping_cache[agent_id]])
    prev_rewards = collections.defaultdict(lambda: 0.)
    while sample_idx < num_samples:
        obs = env.reset()['agent']
        done = False
        while not done:
            if env.kl_reward or (env.l2_reward and not env.l2_memory):
                multi_obs = {
                    'adversary{}'.format(i): {
                        "obs": obs,
                        "is_active": np.array([1])
                    }
                    for i in range(num_adversaries)
                }
            else:
                multi_obs = {
                    'adversary{}'.format(i): obs
                    for i in range(num_adversaries)
                }
            multi_obs['agent'] = obs
            action_dict = {}
            for agent_id, a_obs in multi_obs.items():
                if a_obs is not None:
                    policy_id = mapping_cache.setdefault(
                        agent_id, policy_agent_mapping(agent_id))
                    p_use_lstm = use_lstm[policy_id]
                    if not p_use_lstm:
                        flat_obs = _flatten_action(a_obs)
                        a_action = agent.compute_action(
                            flat_obs,
                            prev_action=prev_actions[agent_id],
                            prev_reward=prev_rewards[agent_id],
                            policy_id=policy_id)
                        if agent_id != 'agent':
                            adversary_grid_dict[agent_id][
                                'sampled_actions'][sample_idx] = a_action
                        else:
                            agent_dict['agent'][
                                'sampled_actions'][sample_idx] = a_action
                        action_dict[agent_id] = a_action

            new_dict = {}
            new_dict.update({'agent': action_dict['agent']})
            obs, reward, done, info = env.step(new_dict)
            sample_idx += 1
            if sample_idx >= num_samples:
                # guard against overrunning the preallocated buffers
                # mid-episode
                break

    file_path = os.path.dirname(os.path.abspath(__file__))
    output_file_path = os.path.join(file_path, outdir)
    if not os.path.exists(output_file_path):
        try:
            os.makedirs(os.path.dirname(output_file_path))
        except OSError as exc:
            if exc.errno != errno.EEXIST:
                raise

    # Plot the histogram of the actions
    for adversary, adv_dict in adversary_grid_dict.items():
        sampled_actions = adv_dict['sampled_actions']
        for action_idx in range(sampled_actions.shape[-1]):
            fig = plt.figure()
            plt.hist(sampled_actions[:, action_idx])
            output_str = '{}/{}'.format(
                outdir,
                adversary + 'action_{}_histogram.png'.format(action_idx))
            plt.xlabel('Action magnitude')
            plt.ylabel('Frequency')
            plt.title('Histograms of actions over {} sampled obs'.format(
                num_samples))
            plt.savefig(output_str)
            plt.close(fig)

        fig = plt.figure()
        plt.hist2d(sampled_actions[:, 0], sampled_actions[:, 1])
        output_str = '{}/{}'.format(outdir,
                                    adversary + 'action_2dhistogram.png')
        plt.xlabel('Action 1 magnitude')
        plt.ylabel('Action 2 magnitude')
        plt.title('Histograms of actions over {} sampled obs'.format(
            num_samples))
        plt.savefig(output_str)
        plt.close(fig)

    # avoid shadowing the trainer (`agent`) and the dict we iterate over
    for agent_name, a_dict in agent_dict.items():
        sampled_actions = a_dict['sampled_actions']
        for action_idx in range(sampled_actions.shape[-1]):
            fig = plt.figure()
            plt.hist(sampled_actions[:, action_idx])
            output_str = '{}/{}'.format(
                outdir,
                agent_name + 'action_{}_histogram.png'.format(action_idx))
            plt.xlabel('Action magnitude')
            plt.ylabel('Frequency')
            plt.title('Histograms of actions over {} sampled obs'.format(
                num_samples))
            plt.savefig(output_str)
            plt.close(fig)

        fig = plt.figure()
        plt.hist2d(sampled_actions[:, 0], sampled_actions[:, 1])
        output_str = '{}/{}'.format(outdir,
                                    agent_name + 'action_2dhistogram.png')
        plt.xlabel('Action 1 magnitude')
        plt.ylabel('Action 2 magnitude')
        plt.title('Histograms of actions over {} sampled obs'.format(
            num_samples))
        plt.savefig(output_str)
        plt.close(fig)
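# Several functions in this file create their output directory with a
# try/except dance around errno.EEXIST. On Python 3 the same effect is a
# one-liner, which also sidesteps the race between the exists() check and
# makedirs() (the helper name here is hypothetical):

import os


def ensure_outdir(path):
    # idempotent: succeeds whether or not the directory already exists
    os.makedirs(path, exist_ok=True)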
def _env_runner(async_vector_env, extra_batch_callback, policies,
                policy_mapping_fn, unroll_length, horizon, obs_filters,
                clip_rewards, pack, tf_sess=None):
    """This implements the common experience collection logic.

    Args:
        async_vector_env (AsyncVectorEnv): env implementing AsyncVectorEnv.
        extra_batch_callback (fn): function to send extra batch data to.
        policies (dict): Map of policy ids to PolicyGraph instances.
        policy_mapping_fn (func): Function that maps agent ids to policy ids.
            This is called when an agent first enters the environment. The
            agent is then "bound" to the returned policy for the episode.
        unroll_length (int): Number of episode steps before `SampleBatch` is
            yielded. Set to infinity to yield complete episodes.
        horizon (int): Horizon of the episode.
        obs_filters (dict): Map of policy id to filter used to process
            observations for the policy.
        clip_rewards (bool): Whether to clip rewards before postprocessing.
        pack (bool): Whether to pack multiple episodes into each batch. This
            guarantees batches will be exactly `unroll_length` in size.
        tf_sess (Session|None): Optional tensorflow session to use for
            batching TF policy evaluations.

    Yields:
        rollout (SampleBatch): Object containing state, action, reward,
            terminal condition, and other fields as dictated by `policy`.
    """
    try:
        if not horizon:
            horizon = (
                async_vector_env.get_unwrapped()[0].spec.max_episode_steps)
    except Exception:
        logger.warning("no episode horizon specified, assuming inf")
    if not horizon:
        horizon = float("inf")

    # Pool of batch builders, which can be shared across episodes to pack
    # trajectory data.
    batch_builder_pool = []

    def get_batch_builder():
        if batch_builder_pool:
            return batch_builder_pool.pop()
        else:
            return MultiAgentSampleBatchBuilder(policies, clip_rewards)

    def new_episode():
        return MultiAgentEpisode(policies, policy_mapping_fn,
                                 get_batch_builder, extra_batch_callback)

    active_episodes = defaultdict(new_episode)

    while True:
        # Get observations from all ready agents
        unfiltered_obs, rewards, dones, infos, off_policy_actions = \
            async_vector_env.poll()

        # Map of policy_id to list of PolicyEvalData
        to_eval = defaultdict(list)

        # Map of env_id -> agent_id -> action replies
        actions_to_send = defaultdict(dict)

        # For each environment
        for env_id, agent_obs in unfiltered_obs.items():
            new_episode = env_id not in active_episodes
            episode = active_episodes[env_id]
            if not new_episode:
                episode.length += 1
                episode.batch_builder.count += 1
                episode._add_agent_rewards(rewards[env_id])

            # Check episode termination conditions
            if dones[env_id]["__all__"] or episode.length >= horizon:
                all_done = True
                atari_metrics = _fetch_atari_metrics(async_vector_env)
                if atari_metrics is not None:
                    for m in atari_metrics:
                        yield m
                else:
                    yield RolloutMetrics(episode.length,
                                         episode.total_reward,
                                         dict(episode.agent_rewards))
            else:
                all_done = False
                # At least send an empty dict if not done
                actions_to_send[env_id] = {}

            # For each agent in the environment
            for agent_id, raw_obs in agent_obs.items():
                policy_id = episode.policy_for(agent_id)
                filtered_obs = _get_or_raise(obs_filters, policy_id)(raw_obs)
                agent_done = bool(all_done or dones[env_id].get(agent_id))
                if not agent_done:
                    to_eval[policy_id].append(
                        PolicyEvalData(env_id, agent_id, filtered_obs,
                                       episode.rnn_state_for(agent_id),
                                       episode.last_action_for(agent_id),
                                       rewards[env_id][agent_id] or 0.0))

                last_observation = episode.last_observation_for(agent_id)
                episode._set_last_observation(agent_id, filtered_obs)

                # Record transition info if applicable
                if last_observation is not None and \
                        infos[env_id][agent_id].get("training_enabled", True):
                    episode.batch_builder.add_values(
                        agent_id,
                        policy_id,
                        t=episode.length - 1,
                        eps_id=episode.episode_id,
                        obs=last_observation,
                        actions=episode.last_action_for(agent_id),
                        rewards=rewards[env_id][agent_id],
                        prev_actions=episode.prev_action_for(agent_id),
                        prev_rewards=episode.prev_reward_for(agent_id),
                        dones=agent_done,
                        infos=infos[env_id][agent_id],
                        new_obs=filtered_obs,
                        **episode.last_pi_info_for(agent_id))

            # Cut the batch if we're not packing multiple episodes into one,
            # or if we've exceeded the requested batch size.
            if episode.batch_builder.has_pending_data():
                if (all_done and not pack) or \
                        episode.batch_builder.count >= unroll_length:
                    yield episode.batch_builder.build_and_reset(episode)
                elif all_done:
                    # Make sure postprocessor stays within one episode
                    episode.batch_builder.postprocess_batch_so_far(episode)

            if all_done:
                # Handle episode termination
                batch_builder_pool.append(episode.batch_builder)
                del active_episodes[env_id]
                resetted_obs = async_vector_env.try_reset(env_id)
                if resetted_obs is None:
                    # Reset not supported, drop this env from the ready list
                    assert horizon == float("inf"), \
                        "Setting episode horizon requires reset() support."
                else:
                    # Creates a new episode
                    episode = active_episodes[env_id]
                    for agent_id, raw_obs in resetted_obs.items():
                        policy_id = episode.policy_for(agent_id)
                        policy = _get_or_raise(policies, policy_id)
                        filtered_obs = _get_or_raise(obs_filters,
                                                     policy_id)(raw_obs)
                        episode._set_last_observation(agent_id, filtered_obs)
                        to_eval[policy_id].append(
                            PolicyEvalData(
                                env_id, agent_id, filtered_obs,
                                episode.rnn_state_for(agent_id),
                                np.zeros_like(
                                    _flatten_action(
                                        policy.action_space.sample())), 0.0))

        # Batch eval policy actions if possible
        if tf_sess:
            builder = TFRunBuilder(tf_sess, "policy_eval")
            pending_fetches = {}
        else:
            builder = None
        eval_results = {}
        rnn_in_cols = {}
        for policy_id, eval_data in to_eval.items():
            rnn_in = _to_column_format([t.rnn_state for t in eval_data])
            rnn_in_cols[policy_id] = rnn_in
            policy = _get_or_raise(policies, policy_id)
            if builder and (policy.compute_actions.__code__ is
                            TFPolicyGraph.compute_actions.__code__):
                pending_fetches[policy_id] = policy.build_compute_actions(
                    builder, [t.obs for t in eval_data],
                    rnn_in,
                    prev_action_batch=[t.prev_action for t in eval_data],
                    prev_reward_batch=[t.prev_reward for t in eval_data],
                    is_training=True)
            else:
                eval_results[policy_id] = policy.compute_actions(
                    [t.obs for t in eval_data],
                    rnn_in,
                    prev_action_batch=[t.prev_action for t in eval_data],
                    prev_reward_batch=[t.prev_reward for t in eval_data],
                    is_training=True,
                    episodes=[active_episodes[t.env_id] for t in eval_data])
        if builder:
            for k, v in pending_fetches.items():
                eval_results[k] = builder.get(v)

        # Record the policy eval results
        for policy_id, eval_data in to_eval.items():
            actions, rnn_out_cols, pi_info_cols = eval_results[policy_id]
            if len(rnn_in_cols[policy_id]) != len(rnn_out_cols):
                raise ValueError(
                    "Length of RNN in did not match RNN out, got: "
                    "{} vs {}".format(rnn_in_cols[policy_id], rnn_out_cols))
            # Add RNN state info
            for f_i, column in enumerate(rnn_in_cols[policy_id]):
                pi_info_cols["state_in_{}".format(f_i)] = column
            for f_i, column in enumerate(rnn_out_cols):
                pi_info_cols["state_out_{}".format(f_i)] = column
            # Save output rows
            actions = _unbatch_tuple_actions(actions)
            for i, action in enumerate(actions):
                env_id = eval_data[i].env_id
                agent_id = eval_data[i].agent_id
                actions_to_send[env_id][agent_id] = action
                episode = active_episodes[env_id]
                episode._set_rnn_state(agent_id,
                                       [c[i] for c in rnn_out_cols])
                episode._set_last_pi_info(
                    agent_id, {k: v[i]
                               for k, v in pi_info_cols.items()})
                if env_id in off_policy_actions and \
                        agent_id in off_policy_actions[env_id]:
                    episode._set_last_action(
                        agent_id, off_policy_actions[env_id][agent_id])
                else:
                    episode._set_last_action(agent_id, action)

        # Return computed actions to ready envs. We also send to envs that
        # have taken off-policy actions; those envs are free to ignore the
        # action.
        async_vector_env.send_actions(dict(actions_to_send))
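# `_env_runner` converts the per-agent RNN states into column format before
# the batched policy call. A sketch of that transposition (rows are agents,
# columns are the individual state tensors); an illustrative
# re-implementation, not the library's own helper:

def to_column_format_sketch(rnn_state_rows):
    num_cols = len(rnn_state_rows[0]) if rnn_state_rows else 0
    return [[row[i] for row in rnn_state_rows] for i in range(num_cols)]


# two agents, each with (h, c) states -> one h-column and one c-column
assert to_column_format_sketch([['h0', 'c0'], ['h1', 'c1']]) == \
    [['h0', 'h1'], ['c0', 'c1']]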
def rollout(agent,
            env_name,
            num_steps,
            num_episodes=0,
            saver=None,
            no_render=True,
            video_dir=None):
    policy_agent_mapping = default_policy_agent_mapping

    if saver is None:
        saver = RolloutSaver()

    if hasattr(agent, "workers") and isinstance(agent.workers, WorkerSet):
        env = agent.workers.local_worker().env
        multiagent = isinstance(env, MultiAgentEnv)
        if agent.workers.local_worker().multiagent:
            policy_agent_mapping = agent.config["multiagent"][
                "policy_mapping_fn"]

        policy_map = agent.workers.local_worker().policy_map
        state_init = {p: m.get_initial_state() for p, m in policy_map.items()}
        use_lstm = {p: len(s) > 0 for p, s in state_init.items()}
    else:
        env = gym.make(env_name)
        multiagent = False
        try:
            policy_map = {DEFAULT_POLICY_ID: agent.policy}
        except AttributeError:
            raise AttributeError(
                "Agent ({}) does not have a `policy` property! This is needed "
                "for performing (trained) agent rollouts.".format(agent))
        use_lstm = {DEFAULT_POLICY_ID: False}

    action_init = {
        p: _flatten_action(m.action_space.sample())
        for p, m in policy_map.items()
    }

    # If monitoring has been requested, manually wrap our environment with a
    # gym monitor, which is set to record every episode.
    if video_dir:
        env = gym.wrappers.Monitor(env=env,
                                   directory=video_dir,
                                   video_callable=lambda x: True,
                                   force=True)

    steps = 0
    episodes = 0
    while keep_going(steps, num_steps, episodes, num_episodes):
        mapping_cache = {}  # in case policy_agent_mapping is stochastic
        saver.begin_rollout()
        obs = env.reset()
        agent_states = DefaultMapping(
            lambda agent_id: state_init[mapping_cache[agent_id]])
        prev_actions = DefaultMapping(
            lambda agent_id: action_init[mapping_cache[agent_id]])
        prev_rewards = collections.defaultdict(lambda: 0.)
        done = False
        reward_total = 0.0
        episode_steps = 0
        while not done and keep_going(steps, num_steps, episodes,
                                      num_episodes):
            multi_obs = obs if multiagent else {_DUMMY_AGENT_ID: obs}
            action_dict = {}
            for agent_id, a_obs in multi_obs.items():
                if a_obs is not None:
                    policy_id = mapping_cache.setdefault(
                        agent_id, policy_agent_mapping(agent_id))
                    p_use_lstm = use_lstm[policy_id]
                    if p_use_lstm:
                        a_action, p_state, _ = agent.compute_action(
                            a_obs,
                            state=agent_states[agent_id],
                            prev_action=prev_actions[agent_id],
                            prev_reward=prev_rewards[agent_id],
                            policy_id=policy_id)
                        agent_states[agent_id] = p_state
                    else:
                        a_action = agent.compute_action(
                            a_obs,
                            prev_action=prev_actions[agent_id],
                            prev_reward=prev_rewards[agent_id],
                            policy_id=policy_id)
                    a_action = _flatten_action(a_action)  # tuple actions
                    action_dict[agent_id] = a_action
                    prev_actions[agent_id] = a_action
            action = action_dict
            action = action if multiagent else action[_DUMMY_AGENT_ID]
            next_obs, reward, done, info = env.step(action)
            episode_steps += 1
            if multiagent:
                for agent_id, r in reward.items():
                    prev_rewards[agent_id] = r
            else:
                prev_rewards[_DUMMY_AGENT_ID] = reward
            if multiagent:
                done = done["__all__"]
                reward_total += sum(reward.values())
            else:
                reward_total += reward
            if not no_render:
                env.render()
            saver.append_step(obs, action, next_obs, reward, done, info)
            steps += 1
            obs = next_obs
        saver.end_rollout()
        print("Episode #{}: reward: {} steps: {}".format(
            episodes, reward_total, episode_steps))
        if done:
            episodes += 1
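# The rollout loops rely on `DefaultMapping` so that prev_actions and
# agent_states can be populated lazily per agent id. Unlike a plain
# collections.defaultdict, the factory here receives the missing key. A
# sketch of that helper (an illustrative re-implementation):

import collections


class DefaultMappingSketch(collections.defaultdict):
    """defaultdict whose default_factory takes the missing key."""

    def __missing__(self, key):
        self[key] = value = self.default_factory(key)
        return value


action_init = {'default_policy': 0}
prev_actions = DefaultMappingSketch(
    lambda agent_id: action_init['default_policy'])
assert prev_actions['agent_0'] == 0  # entry created on first access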
def _process_observations(worker, base_env, policies, batch_builder_pool,
                          active_episodes, unfiltered_obs, rewards, dones,
                          infos, off_policy_actions, horizon, preprocessors,
                          obs_filters, rollout_fragment_length, pack,
                          callbacks, soft_horizon, no_done_at_end):
    """Record new data from the environment and prepare for policy evaluation.

    Returns:
        active_envs: set of non-terminated env ids
        to_eval: map of policy_id to list of agent PolicyEvalData
        outputs: list of metrics and samples to return from the sampler
    """
    active_envs = set()
    to_eval = defaultdict(list)
    outputs = []
    large_batch_threshold = max(1000, rollout_fragment_length * 10) if \
        rollout_fragment_length != float("inf") else 5000

    # For each environment
    for env_id, agent_obs in unfiltered_obs.items():
        new_episode = env_id not in active_episodes
        episode = active_episodes[env_id]
        if not new_episode:
            episode.length += 1
            episode.batch_builder.count += 1
            episode._add_agent_rewards(rewards[env_id])

        if (episode.batch_builder.total() > large_batch_threshold
                and log_once("large_batch_warning")):
            logger.warning(
                "More than {} observations for {} env steps ".format(
                    episode.batch_builder.total(),
                    episode.batch_builder.count) + "are buffered in "
                "the sampler. If this is more than you expected, check that "
                "you set a horizon on your environment correctly and that it "
                "terminates at some point. "
                "Note: In multi-agent environments, `rollout_fragment_length` "
                "sets the batch size based on environment steps, not the "
                "steps of individual agents, which can result in unexpectedly "
                "large batches. Also, you may be in evaluation waiting for "
                "your Env to terminate (batch_mode=`complete_episodes`). "
                "Make sure it does at some point.")

        # Check episode termination conditions
        if dones[env_id]["__all__"] or episode.length >= horizon:
            hit_horizon = (episode.length >= horizon
                           and not dones[env_id]["__all__"])
            all_done = True
            atari_metrics = _fetch_atari_metrics(base_env)
            if atari_metrics is not None:
                for m in atari_metrics:
                    outputs.append(
                        m._replace(custom_metrics=episode.custom_metrics))
            else:
                outputs.append(
                    RolloutMetrics(episode.length, episode.total_reward,
                                   dict(episode.agent_rewards),
                                   episode.custom_metrics, {},
                                   episode.hist_data))
        else:
            hit_horizon = False
            all_done = False
            active_envs.add(env_id)

        # For each agent in the environment.
        for agent_id, raw_obs in agent_obs.items():
            policy_id = episode.policy_for(agent_id)
            prep_obs = _get_or_raise(preprocessors,
                                     policy_id).transform(raw_obs)
            if log_once("prep_obs"):
                logger.info("Preprocessed obs: {}".format(summarize(prep_obs)))

            filtered_obs = _get_or_raise(obs_filters, policy_id)(prep_obs)
            if log_once("filtered_obs"):
                logger.info("Filtered obs: {}".format(summarize(filtered_obs)))

            agent_done = bool(all_done or dones[env_id].get(agent_id))
            if not agent_done:
                to_eval[policy_id].append(
                    PolicyEvalData(env_id, agent_id, filtered_obs,
                                   infos[env_id].get(agent_id, {}),
                                   episode.rnn_state_for(agent_id),
                                   episode.last_action_for(agent_id),
                                   rewards[env_id][agent_id] or 0.0))

            last_observation = episode.last_observation_for(agent_id)
            episode._set_last_observation(agent_id, filtered_obs)
            episode._set_last_raw_obs(agent_id, raw_obs)
            episode._set_last_info(agent_id, infos[env_id].get(agent_id, {}))

            # Record transition info if applicable
            if (last_observation is not None and infos[env_id].get(
                    agent_id, {}).get("training_enabled", True)):
                episode.batch_builder.add_values(
                    agent_id,
                    policy_id,
                    t=episode.length - 1,
                    eps_id=episode.episode_id,
                    agent_index=episode._agent_index(agent_id),
                    obs=last_observation,
                    actions=episode.last_action_for(agent_id),
                    rewards=rewards[env_id][agent_id],
                    prev_actions=episode.prev_action_for(agent_id),
                    prev_rewards=episode.prev_reward_for(agent_id),
                    dones=(False if (no_done_at_end or
                                     (hit_horizon and soft_horizon)) else
                           agent_done),
                    infos=infos[env_id].get(agent_id, {}),
                    new_obs=filtered_obs,
                    **episode.last_pi_info_for(agent_id))

        # Invoke the step callback after the step is logged to the episode
        callbacks.on_episode_step(
            worker=worker, base_env=base_env, episode=episode)

        # Cut the batch if we're not packing multiple episodes into one,
        # or if we've exceeded the requested batch size.
        if episode.batch_builder.has_pending_agent_data():
            if dones[env_id]["__all__"] and not no_done_at_end:
                episode.batch_builder.check_missing_dones()
            if (all_done and not pack) or \
                    episode.batch_builder.count >= rollout_fragment_length:
                outputs.append(episode.batch_builder.build_and_reset(episode))
            elif all_done:
                # Make sure postprocessor stays within one episode
                episode.batch_builder.postprocess_batch_so_far(episode)

        if all_done:
            # Handle episode termination
            batch_builder_pool.append(episode.batch_builder)
            # Call each policy's Exploration.on_episode_end method.
            for p in policies.values():
                p.exploration.on_episode_end(
                    policy=p,
                    environment=base_env,
                    episode=episode,
                    tf_sess=getattr(p, "_sess", None))
            # Call custom on_episode_end callback.
            callbacks.on_episode_end(
                worker=worker,
                base_env=base_env,
                policies=policies,
                episode=episode)

            if hit_horizon and soft_horizon:
                episode.soft_reset()
                resetted_obs = agent_obs
            else:
                del active_episodes[env_id]
                resetted_obs = base_env.try_reset(env_id)
            if resetted_obs is None:
                # Reset not supported, drop this env from the ready list
                if horizon != float("inf"):
                    raise ValueError(
                        "Setting episode horizon requires reset() support "
                        "from the environment.")
            elif resetted_obs != ASYNC_RESET_RETURN:
                # Creates a new episode if this is not async return
                # If reset is async, we will get its result in some future
                # poll
                episode = active_episodes[env_id]
                for agent_id, raw_obs in resetted_obs.items():
                    policy_id = episode.policy_for(agent_id)
                    policy = _get_or_raise(policies, policy_id)
                    prep_obs = _get_or_raise(preprocessors,
                                             policy_id).transform(raw_obs)
                    filtered_obs = _get_or_raise(obs_filters,
                                                 policy_id)(prep_obs)
                    episode._set_last_observation(agent_id, filtered_obs)
                    to_eval[policy_id].append(
                        PolicyEvalData(
                            env_id, agent_id, filtered_obs,
                            episode.last_info_for(agent_id) or {},
                            episode.rnn_state_for(agent_id),
                            np.zeros_like(
                                _flatten_action(
                                    policy.action_space.sample())), 0.0))

    return active_envs, to_eval, outputs
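# Every preprocessor/filter/policy lookup in the sampler variants above goes
# through `_get_or_raise`, so a missing policy id fails loudly instead of
# surfacing as a bare KeyError deep in the rollout. A sketch of that guard
# (an illustrative re-implementation):

def get_or_raise_sketch(mapping, policy_id):
    if policy_id not in mapping:
        raise ValueError("Could not find policy for id: {}".format(policy_id))
    return mapping[policy_id]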
def _process_observations(base_env, policies, batch_builder_pool,
                          active_episodes, unfiltered_obs, rewards, dones,
                          infos, off_policy_actions, horizon, preprocessors,
                          obs_filters, unroll_length, pack, callbacks,
                          soft_horizon, no_done_at_end):
    """Record new data from the environment and prepare for policy evaluation.

    Returns:
        active_envs: set of non-terminated env ids
        to_eval: map of policy_id to list of agent PolicyEvalData
        outputs: list of metrics and samples to return from the sampler
    """
    global i
    global tmp_dic
    global traffic_light_node_dict
    i += 1

    def inter_num_2_id(num):
        return list(tmp_dic.keys())[list(tmp_dic.values()).index(num)]

    def read_traffic_light_node_dict():
        path_to_read = os.path.join(record_dir,
                                    'traffic_light_node_dict.conf')
        with open(path_to_read, 'r') as f:
            traffic_light_node_dict = eval(f.read())
            print("Read traffic_light_node_dict")
        return traffic_light_node_dict

    if i <= 1:
        # read the neighbor layout from the config file here
        record_dir = base_env.envs[0].record_dir
        traffic_light_node_dict = base_env.envs[0].traffic_light_node_dict
        tmp_dic = traffic_light_node_dict['intersection_1_1'][
            'inter_id_to_index']

    active_envs = set()
    to_eval = defaultdict(list)
    outputs = []

    # For each environment
    for env_id, agent_obs in unfiltered_obs.items():
        new_episode = env_id not in active_episodes
        episode = active_episodes[env_id]
        if not new_episode:
            episode.length += 1
            episode.batch_builder.count += 1
            episode._add_agent_rewards(rewards[env_id])

        if (episode.batch_builder.total() > max(1000, unroll_length * 10)
                and log_once("large_batch_warning")):
            logger.warning(
                "More than {} observations for {} env steps ".format(
                    episode.batch_builder.total(),
                    episode.batch_builder.count) + "are buffered in "
                "the sampler. If this is more than you expected, check that "
                "you set a horizon on your environment correctly. Note "
                "that in multi-agent environments, `sample_batch_size` sets "
                "the batch size based on environment steps, not the steps of "
                "individual agents, which can result in unexpectedly large "
                "batches.")

        # Check episode termination conditions
        if dones[env_id]["__all__"] or episode.length >= horizon:
            hit_horizon = (episode.length >= horizon
                           and not dones[env_id]["__all__"])
            all_done = True
            atari_metrics = _fetch_atari_metrics(base_env)
            if atari_metrics is not None:
                for m in atari_metrics:
                    outputs.append(
                        m._replace(custom_metrics=episode.custom_metrics))
            else:
                outputs.append(
                    RolloutMetrics(episode.length, episode.total_reward,
                                   dict(episode.agent_rewards),
                                   episode.custom_metrics, {}))
        else:
            hit_horizon = False
            all_done = False
            active_envs.add(env_id)

        # For each agent in the environment
        for agent_id, raw_obs in agent_obs.items():
            policy_id = episode.policy_for(agent_id)  # eg: "policy_0"
            # print(policy_id)
            prep_obs = _get_or_raise(preprocessors,
                                     policy_id).transform(raw_obs)
            if log_once("prep_obs"):
                logger.info("Preprocessed obs: {}".format(summarize(prep_obs)))

            filtered_obs = _get_or_raise(obs_filters, policy_id)(prep_obs)

            # For Attention: a live Q eval runs here, so the Q eval network
            # needs the neighbor_obs values.
            # Using the road-network relations in traffic_light_node_dict,
            # find the neighbors of the current policy_id and store them in
            # the "policy_0" form.
            neighbor_pid_list = [
                'policy_{}'.format(pid_)
                for pid_ in traffic_light_node_dict[inter_num_2_id(
                    int(policy_id.split('_')[1]))]['adjacency_row']
                if pid_ is not None
            ]
            # print(neighbor_pid_list)
            neighbor_obs = []
            # Size: (1, 5, 15); only this shape can be fed into the
            # neighbor_obs (batch, 5, 15) placeholder
            neighbor_obs.append([])
            for neighbor_id in neighbor_pid_list:
                neighbor_prep_obs = _get_or_raise(
                    preprocessors, neighbor_id).transform(raw_obs)
                neighbor_filtered_obs = _get_or_raise(
                    obs_filters, neighbor_id)(neighbor_prep_obs)
                neighbor_obs[0].append(neighbor_filtered_obs)
            neighbor_obs = np.array(neighbor_obs).reshape(
                (len(neighbor_pid_list), len(raw_obs)))  # (5, 29)
            # --------------------------------------------------------------

            if log_once("filtered_obs"):
                logger.info("Filtered obs: {}".format(summarize(filtered_obs)))

            agent_done = bool(all_done or dones[env_id].get(agent_id))
            if not agent_done:
                to_eval[policy_id].append(
                    PolicyEvalData(env_id, agent_id, filtered_obs,
                                   neighbor_obs,
                                   infos[env_id].get(agent_id, {}),
                                   episode.rnn_state_for(agent_id),
                                   episode.last_action_for(agent_id),
                                   rewards[env_id][agent_id] or 0.0))

            last_observation = episode.last_observation_for(agent_id)
            episode._set_last_observation(agent_id, filtered_obs)
            episode._set_last_raw_obs(agent_id, raw_obs)
            episode._set_last_info(agent_id, infos[env_id].get(agent_id, {}))

            # Record transition info if applicable
            if (last_observation is not None and infos[env_id].get(
                    agent_id, {}).get("training_enabled", True)):
                episode.batch_builder.add_values(
                    agent_id,
                    policy_id,
                    t=episode.length - 1,
                    eps_id=episode.episode_id,
                    agent_index=episode._agent_index(agent_id),
                    obs=last_observation,
                    actions=episode.last_action_for(agent_id),
                    rewards=rewards[env_id][agent_id],
                    prev_actions=episode.prev_action_for(agent_id),
                    prev_rewards=episode.prev_reward_for(agent_id),
                    dones=(False if (no_done_at_end or
                                     (hit_horizon and soft_horizon)) else
                           agent_done),
                    infos=infos[env_id].get(agent_id, {}),
                    new_obs=filtered_obs,
                    **episode.last_pi_info_for(agent_id))

        # Invoke the step callback after the step is logged to the episode
        if callbacks.get("on_episode_step"):
            callbacks["on_episode_step"]({"env": base_env, "episode": episode})

        # Cut the batch if we're not packing multiple episodes into one,
        # or if we've exceeded the requested batch size.
        if episode.batch_builder.has_pending_data():
            if dones[env_id]["__all__"] and not no_done_at_end:
                episode.batch_builder.check_missing_dones()
            if (all_done and not pack) or \
                    episode.batch_builder.count >= unroll_length:
                outputs.append(episode.batch_builder.build_and_reset(episode))
            elif all_done:
                # Make sure postprocessor stays within one episode
                episode.batch_builder.postprocess_batch_so_far(episode)

        if all_done:
            # Handle episode termination
            batch_builder_pool.append(episode.batch_builder)
            if callbacks.get("on_episode_end"):
                callbacks["on_episode_end"]({
                    "env": base_env,
                    "policy": policies,
                    "episode": episode
                })
            if hit_horizon and soft_horizon:
                episode.soft_reset()
                resetted_obs = agent_obs
            else:
                del active_episodes[env_id]
                resetted_obs = base_env.try_reset(env_id)
            if resetted_obs is None:
                # Reset not supported, drop this env from the ready list
                if horizon != float("inf"):
                    raise ValueError(
                        "Setting episode horizon requires reset() support "
                        "from the environment.")
            elif resetted_obs != ASYNC_RESET_RETURN:
                # Creates a new episode if this is not async return
                # If reset is async, we will get its result in some future
                # poll
                episode = active_episodes[env_id]
                for agent_id, raw_obs in resetted_obs.items():
                    policy_id = episode.policy_for(agent_id)  # eg: "policy_0"
                    policy = _get_or_raise(policies, policy_id)
                    prep_obs = _get_or_raise(preprocessors,
                                             policy_id).transform(raw_obs)
                    filtered_obs = _get_or_raise(obs_filters,
                                                 policy_id)(prep_obs)
                    # print('policy_id' + str(policy_id))
                    # print('filtered_obs' + str(filtered_obs))

                    # For Attention: the episode terminated here, so create a
                    # new episode. A live Q eval runs here as well, so the
                    # Q eval network needs the neighbor_obs values.
                    # Using the road-network relations in
                    # traffic_light_node_dict, find the neighbors of the
                    # current policy_id and store them in the "policy_0" form.
                    neighbor_pid_list = [
                        'policy_{}'.format(pid_)
                        for pid_ in traffic_light_node_dict[inter_num_2_id(
                            int(policy_id.split('_')[1]))]['adjacency_row']
                        if pid_ is not None
                    ]
                    # print(neighbor_pid_list)
                    neighbor_obs = []
                    # Size: (1, 5, 29); only this shape can be fed into the
                    # neighbor_obs (batch, 5, 17) placeholder
                    neighbor_obs.append([])
                    for neighbor_id in neighbor_pid_list:
                        neighbor_prep_obs = _get_or_raise(
                            preprocessors, neighbor_id).transform(raw_obs)
                        neighbor_filtered_obs = _get_or_raise(
                            obs_filters, neighbor_id)(neighbor_prep_obs)
                        neighbor_obs[0].append(neighbor_filtered_obs)
                    neighbor_obs = np.squeeze(np.array(neighbor_obs))
                    # ------------------------------------------------------

                    episode._set_last_observation(agent_id, filtered_obs)
                    to_eval[policy_id].append(
                        PolicyEvalData(
                            env_id, agent_id, filtered_obs, neighbor_obs,
                            episode.last_info_for(agent_id) or {},
                            episode.rnn_state_for(agent_id),
                            np.zeros_like(
                                _flatten_action(
                                    policy.action_space.sample())), 0.0))

    return active_envs, to_eval, outputs
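# The attention variant above assembles the neighbors' filtered observations
# into a single array before handing them to the Q eval network. A
# self-contained sketch of that batching step with synthetic observations
# (5 neighbors, 15-dim obs, matching one of the shape comments above):

import numpy as np

num_neighbors, obs_dim = 5, 15
neighbor_obs = [[]]  # leading batch axis of size 1
for _ in range(num_neighbors):
    neighbor_obs[0].append(np.random.rand(obs_dim))

batched = np.array(neighbor_obs)  # (1, 5, 15) for the placeholder
squeezed = np.squeeze(batched)    # (5, 15) once the batch axis is dropped
assert batched.shape == (1, num_neighbors, obs_dim)
assert squeezed.shape == (num_neighbors, obs_dim)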
def rollout(agent,
            env_name,
            num_steps,
            num_episodes=0,
            saver=None,
            no_render=True,
            monitor=False):
    policy_agent_mapping = default_policy_agent_mapping

    if saver is None:
        saver = RolloutSaver()

    if hasattr(agent, "workers"):
        env = agent.workers.local_worker().env
        multiagent = isinstance(env, MultiAgentEnv)
        if agent.workers.local_worker().multiagent:
            policy_agent_mapping = agent.config["multiagent"][
                "policy_mapping_fn"]

        policy_map = agent.workers.local_worker().policy_map
        state_init = {p: m.get_initial_state() for p, m in policy_map.items()}
        use_lstm = {p: len(s) > 0 for p, s in state_init.items()}
        action_init = {
            p: _flatten_action(m.action_space.sample())
            for p, m in policy_map.items()
        }
    else:
        env = gym.make(env_name)
        multiagent = False
        use_lstm = {DEFAULT_POLICY_ID: False}

    if monitor and not no_render and saver and saver.outfile is not None:
        # If monitoring has been requested,
        # manually wrap our environment with a gym monitor
        # which is set to record every episode.
        env = gym.wrappers.Monitor(
            env, os.path.join(os.path.dirname(saver.outfile), "monitor"),
            lambda x: True)

    steps = 0
    episodes = 0
    if not no_render:
        shape = env.base_map.shape
        full_obs = [
            np.zeros((shape[0], shape[1], 3), dtype=np.uint8)
            for i in range(num_steps * num_episodes)
        ]

    while episodes < num_episodes:
        mapping_cache = {}  # in case policy_agent_mapping is stochastic
        saver.begin_rollout()
        obs = env.reset()
        agent_states = DefaultMapping(
            lambda agent_id: state_init[mapping_cache[agent_id]])
        prev_actions = DefaultMapping(
            lambda agent_id: action_init[mapping_cache[agent_id]])
        prev_rewards = collections.defaultdict(lambda: 0.)
        done = False
        reward_total = 0.0
        intrinsic_total = 0.0
        env_total = 0.0
        out_file = open('videos/communication_log.txt', 'w')
        out_file.write(f'\n\n episode-{episodes} \n\n')
        while not done and steps < num_steps:
            multi_obs = obs if multiagent else {_DUMMY_AGENT_ID: obs}
            action_dict = {}
            for agent_id, a_obs in multi_obs.items():
                if a_obs is not None:
                    policy_id = mapping_cache.setdefault(
                        agent_id, policy_agent_mapping(agent_id))
                    p_use_lstm = use_lstm[policy_id]
                    if p_use_lstm:
                        a_action, p_state, _ = agent.compute_action(
                            a_obs,
                            state=agent_states[agent_id],
                            prev_action=prev_actions[agent_id],
                            prev_reward=prev_rewards[agent_id],
                            policy_id=policy_id)
                        agent_states[agent_id] = p_state
                    else:
                        a_action = agent.compute_action(
                            a_obs,
                            prev_action=prev_actions[agent_id],
                            prev_reward=prev_rewards[agent_id],
                            policy_id=policy_id)
                    a_action = _flatten_action(a_action)  # tuple actions
                    action_dict[agent_id] = a_action
                    prev_actions[agent_id] = a_action
            action = action_dict
            action = action if multiagent else action[_DUMMY_AGENT_ID]
            next_obs, reward, done, info = env.step(action)
            if multiagent:
                for agent_id, r in reward.items():
                    prev_rewards[agent_id] = r
            else:
                prev_rewards[_DUMMY_AGENT_ID] = reward
            if multiagent:
                done = done["__all__"]
                reward_total += sum(reward.values())
                intrinsic_total += sum(
                    [f['intrinsic'] for f in info.values()])
                env_total += sum(
                    [f['environmental'] for f in info.values()])
            else:
                reward_total += reward
            if not no_render:
                # env.render()
                rgb_arr = env.map_to_colors()
                full_obs[steps + (num_steps * episodes)] = rgb_arr.astype(
                    np.uint8)
                out_file.write(f'step-{steps}: {action}\n')
            saver.append_step(obs, action, next_obs, reward, done, info)
            steps += 1
            obs = next_obs
        out_file.close()
        saver.end_rollout()
        print("Episode #{}: reward: {}, intrinsic: {}, env: {}".format(
            episodes, reward_total, intrinsic_total, env_total))
        episodes += 1
        steps = 0

    if not no_render:
        path = os.path.abspath(os.path.dirname(__file__)) + '/videos'
        print('saving video to ', path)
        if not os.path.exists(path):
            os.makedirs(path)
        images_path = path + '/images/'
        if not os.path.exists(images_path):
            os.makedirs(images_path)
        utility_funcs.make_video_from_rgb_imgs(full_obs, path)

        # Clean up images
        shutil.rmtree(images_path)
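# The renderer above hands the collected rgb frames to
# utility_funcs.make_video_from_rgb_imgs. If that project-specific helper is
# unavailable, one common alternative is imageio (this sketch assumes the
# imageio and imageio-ffmpeg packages, which are not part of the original
# codebase; the filename is a placeholder):

import numpy as np
import imageio

frames = [np.zeros((32, 32, 3), dtype=np.uint8) for _ in range(10)]
imageio.mimsave('demo_rollout.mp4', frames, fps=5)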
def rollout(agent,
            env_name,
            num_steps,
            num_episodes=0,
            saver=None,
            no_render=True,
            monitor=False):
    policy_agent_mapping = default_policy_agent_mapping

    # avoid a shared mutable default argument; create the saver per call
    if saver is None:
        saver = RolloutSaver()

    if hasattr(agent, "workers"):
        env = agent.workers.local_worker().env
        multiagent = isinstance(env, MultiAgentEnv)
        if agent.workers.local_worker().multiagent:
            policy_agent_mapping = agent.config["multiagent"][
                "policy_mapping_fn"]

        policy_map = agent.workers.local_worker().policy_map
        state_init = {p: m.get_initial_state() for p, m in policy_map.items()}
        use_lstm = {p: len(s) > 0 for p, s in state_init.items()}
        action_init = {
            p: _flatten_action(m.action_space.sample())
            for p, m in policy_map.items()
        }
    else:
        env = gym.make(env_name)
        multiagent = False
        use_lstm = {DEFAULT_POLICY_ID: False}

    if monitor and not no_render and saver and saver.outfile is not None:
        # If monitoring has been requested,
        # manually wrap our environment with a gym monitor
        # which is set to record every episode.
        env = gym.wrappers.Monitor(
            env, os.path.join(os.path.dirname(saver.outfile), "monitor"),
            lambda x: True)

    steps = 0
    episodes = 0
    ############################################################# start Roman
    MeshStates = None  # initialize Mesh
    import cpvpy as cp
    # import function to calculate distance from state_distance_calculator.py
    from .state_distance_calculator import is_distance_threshold_exceeded
    ############################################################# end Roman
    while keep_going(steps, num_steps, episodes, num_episodes):
        mapping_cache = {}  # in case policy_agent_mapping is stochastic
        saver.begin_rollout()
        obs = env.reset()
        agent_states = DefaultMapping(
            lambda agent_id: state_init[mapping_cache[agent_id]])
        prev_actions = DefaultMapping(
            lambda agent_id: action_init[mapping_cache[agent_id]])
        prev_rewards = collections.defaultdict(lambda: 0.)
        done = False
        reward_total = 0.0
        ######################################################### start Roman
        # number of simulation steps we want to run before we log into our
        # state transition matrix
        # [(1 dt) = (1 step in simulation)
        #  = (env_instance.env.scene.dt = 0.0165)
        #  = (env_instance.env.scene.frameskip = 4)
        #    * (env_instance.env.scene.timestep = 0.004125)]
        number_of_remaining_simulation_steps = 20
        ######################################################### end Roman
        while not done and keep_going(steps, num_steps, episodes,
                                      num_episodes):
            multi_obs = obs if multiagent else {_DUMMY_AGENT_ID: obs}
            action_dict = {}
            for agent_id, a_obs in multi_obs.items():
                if a_obs is not None:
                    policy_id = mapping_cache.setdefault(
                        agent_id, policy_agent_mapping(agent_id))
                    p_use_lstm = use_lstm[policy_id]
                    if p_use_lstm:
                        a_action, p_state, _ = agent.compute_action(
                            a_obs,
                            state=agent_states[agent_id],
                            prev_action=prev_actions[agent_id],
                            prev_reward=prev_rewards[agent_id],
                            policy_id=policy_id)
                        agent_states[agent_id] = p_state
                    else:
                        a_action = agent.compute_action(
                            a_obs,
                            prev_action=prev_actions[agent_id],
                            prev_reward=prev_rewards[agent_id],
                            policy_id=policy_id)
                    a_action = _flatten_action(a_action)  # tuple actions
                    action_dict[agent_id] = a_action
                    prev_actions[agent_id] = a_action
            action = action_dict
            action = action if multiagent else action[_DUMMY_AGENT_ID]
            next_obs, reward, done, info = env.step(action)
            number_of_remaining_simulation_steps -= 1
            print("RLLibStates: ")
            print("Observation: ", next_obs)
            print("Observation type: ", type(next_obs))
            print("Observation Shape: ", next_obs.shape)
            if done:
                # it is time to log into our mesh, my dudes
                if number_of_remaining_simulation_steps > 0:
                    # if we failed: increment the failure state (state 0, or
                    # whatever we call it)
                    pass
                else:
                    # if the episode continued longer than our threshold of
                    # simulation steps: increment the current state
                    pass
            if multiagent:
                for agent_id, r in reward.items():
                    prev_rewards[agent_id] = r
            else:
                prev_rewards[_DUMMY_AGENT_ID] = reward
            if multiagent:
                done = done["__all__"]
                reward_total += sum(reward.values())
            else:
                reward_total += reward
            if not no_render:
                env.render()
            saver.append_step(obs, action, next_obs, reward, done, info)
            steps += 1
            obs = next_obs
        saver.end_rollout()
        print("Episode #{}: reward: {}".format(episodes, reward_total))
        if done:
            episodes += 1
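# The TODO branches above only mark where the mesh/state-transition counters
# should be incremented. A minimal sketch of the bookkeeping they describe:
# episodes that end before the step budget is spent count as failures, and
# everything else credits the current state (the state ids and helper name
# are hypothetical):

FAILURE_STATE = 0

transition_counts = {FAILURE_STATE: 0}


def log_episode(remaining_steps, current_state, counts):
    # an episode that ended early goes to the failure state; otherwise the
    # current state gets the increment
    state = FAILURE_STATE if remaining_steps > 0 else current_state
    counts[state] = counts.get(state, 0) + 1


log_episode(remaining_steps=5, current_state=3, counts=transition_counts)
assert transition_counts[FAILURE_STATE] == 1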
def rollout(agent,
            env_name,
            num_steps,
            out=None,
            no_render=True,
            render_q=False,
            save_q=False):
    policy_agent_mapping = default_policy_agent_mapping

    if hasattr(agent, "workers"):
        env = agent.workers.local_worker().env
        multiagent = isinstance(env, MultiAgentEnv)
        if agent.workers.local_worker().multiagent:
            policy_agent_mapping = agent.config["multiagent"][
                "policy_mapping_fn"]

        policy_map = agent.workers.local_worker().policy_map
        state_init = {p: m.get_initial_state() for p, m in policy_map.items()}
        use_lstm = {p: len(s) > 0 for p, s in state_init.items()}
        action_init = {
            p: _flatten_action(m.action_space.sample())
            for p, m in policy_map.items()
        }
    else:
        env = gym.make(env_name)
        multiagent = False
        use_lstm = {DEFAULT_POLICY_ID: False}

    if out is not None:
        rollouts = []
    steps = 0
    while steps < (num_steps or steps + 1):
        mapping_cache = {}  # in case policy_agent_mapping is stochastic
        if out is not None:
            rollout = []
        obs = env.reset()
        agent_states = DefaultMapping(
            lambda agent_id: state_init[mapping_cache[agent_id]])
        prev_actions = DefaultMapping(
            lambda agent_id: action_init[mapping_cache[agent_id]])
        prev_rewards = collections.defaultdict(lambda: 0.)
        done = False
        reward_total = 0.0
        while not done and steps < (num_steps or steps + 1):
            multi_obs = obs if multiagent else {_DUMMY_AGENT_ID: obs}
            action_dict = {}
            for agent_id, a_obs in multi_obs.items():
                if a_obs is not None:
                    policy_id = mapping_cache.setdefault(
                        agent_id, policy_agent_mapping(agent_id))
                    p_use_lstm = use_lstm[policy_id]
                    if p_use_lstm:
                        a_action, p_state, _ = agent.compute_action(
                            a_obs,
                            state=agent_states[agent_id],
                            prev_action=prev_actions[agent_id],
                            prev_reward=prev_rewards[agent_id],
                            policy_id=policy_id)
                        agent_states[agent_id] = p_state
                    else:
                        a_action = agent.compute_action(
                            a_obs,
                            prev_action=prev_actions[agent_id],
                            prev_reward=prev_rewards[agent_id],
                            policy_id=policy_id)
                    a_action = _flatten_action(a_action)  # tuple actions
                    action_dict[agent_id] = a_action
                    prev_actions[agent_id] = a_action
                    # Custom code for getting Q values
                    # q = get_q_value(env, agent, policy_id, a_obs)
                    # print("Q", q)
            action = action_dict
            action = action if multiagent else action[_DUMMY_AGENT_ID]
            print(action)
            next_obs, reward, done, _ = env.step(action)
            if multiagent:
                for agent_id, r in reward.items():
                    prev_rewards[agent_id] = r
            else:
                prev_rewards[_DUMMY_AGENT_ID] = reward
            if multiagent:
                done = done["__all__"]
                reward_total += sum(reward.values())
            else:
                reward_total += reward
            # if not no_render:
            # frames are written unconditionally here; `video`, RENDER_WIDTH
            # and RENDER_HEIGHT are expected to be defined at module level
            frame = env.render(width=RENDER_WIDTH, height=RENDER_HEIGHT)
            bgr = cv2.cvtColor(frame[:, :, :3], cv2.COLOR_RGB2BGR)
            video.write(bgr)
            if out is not None:
                rollout.append([obs, action, next_obs, reward, done])
            steps += 1
            obs = next_obs
        if out is not None:
            rollouts.append(rollout)
        print("Episode reward", reward_total)
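# The loop above writes BGR frames into a `video` object created outside this
# function. A sketch of how such a writer is typically opened with OpenCV
# (the filename, fps, and frame size here are placeholders, not values from
# the original code):

import cv2
import numpy as np

RENDER_WIDTH, RENDER_HEIGHT = 320, 240
video = cv2.VideoWriter('rollout.mp4', cv2.VideoWriter_fourcc(*'mp4v'),
                        30, (RENDER_WIDTH, RENDER_HEIGHT))
# OpenCV expects BGR channel order, hence the conversion before write()
frame = np.zeros((RENDER_HEIGHT, RENDER_WIDTH, 3), dtype=np.uint8)
video.write(cv2.cvtColor(frame, cv2.COLOR_RGB2BGR))
video.release()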
def _process_observations(base_env, policies, policies_to_train, dead_policies, policy_config, observation_filter, tf_sess, batch_builder_pool, active_episodes, unfiltered_obs, rewards, dones, infos, off_policy_actions, horizon, preprocessors, obs_filters, unroll_length, pack, callbacks, soft_horizon, no_done_at_end): #===MOD=== """Record new data from the environment and prepare for policy evaluation. Returns: active_envs: set of non-terminated env ids to_eval: map of policy_id to list of agent PolicyEvalData outputs: list of metrics and samples to return from the sampler """ active_envs = set() to_eval = defaultdict(list) outputs = [] # For each environment for env_id, agent_obs in unfiltered_obs.items(): new_episode = env_id not in active_episodes episode = active_episodes[env_id] if not new_episode: episode.length += 1 episode.batch_builder.count += 1 episode._add_agent_rewards(rewards[env_id]) if (episode.batch_builder.total() > max(1000, unroll_length * 10) and log_once("large_batch_warning")): logger.warning( "More than {} observations for {} env steps ".format( episode.batch_builder.total(), episode.batch_builder.count) + "are buffered in " "the sampler. If this is more than you expected, check that " "that you set a horizon on your environment correctly. Note " "that in multi-agent environments, `sample_batch_size` sets " "the batch size based on environment steps, not the steps of " "individual agents, which can result in unexpectedly large " "batches.") # Check episode termination conditions if dones[env_id]["__all__"] or episode.length >= horizon: # DEBUG # print("Trying to terminate.") # print("Dones of __all__ is set:", dones[env_id]["__all__"]) # print("Horizon hit:", episode.length >= horizon) hit_horizon = (episode.length >= horizon and not dones[env_id]["__all__"]) all_done = True atari_metrics = _fetch_atari_metrics(base_env) if atari_metrics is not None: for m in atari_metrics: outputs.append( m._replace(custom_metrics=episode.custom_metrics)) else: outputs.append( RolloutMetrics(episode.length, episode.total_reward, dict(episode.agent_rewards), episode.custom_metrics, {})) else: hit_horizon = False all_done = False active_envs.add(env_id) #===MOD=== additional_builders_ids = set() #===MOD=== # For each agent in the environment for agent_id, raw_obs in agent_obs.items(): #===MOD=== policy_id, policy_constructor_tuple = episode.policy_for(agent_id) pols_tuple = generate_policies( policy_id, policy_constructor_tuple, policies, policies_to_train, dead_policies, policy_config, preprocessors, obs_filters, observation_filter, tf_sess, ) policies, preprocessors, obs_filters, policies_to_train, dead_policies = pols_tuple #===MOD=== prep_obs = _get_or_raise(preprocessors, policy_id).transform(raw_obs) if log_once("prep_obs"): logger.info("Preprocessed obs: {}".format(summarize(prep_obs))) filtered_obs = _get_or_raise(obs_filters, policy_id)(prep_obs) if log_once("filtered_obs"): logger.info("Filtered obs: {}".format(summarize(filtered_obs))) agent_done = bool(all_done or dones[env_id].get(agent_id)) if not agent_done: to_eval[policy_id].append( PolicyEvalData(env_id, agent_id, filtered_obs, infos[env_id].get(agent_id, {}), episode.rnn_state_for(agent_id), episode.last_action_for(agent_id), rewards[env_id][agent_id] or 0.0)) last_observation = episode.last_observation_for(agent_id) episode._set_last_observation(agent_id, filtered_obs) episode._set_last_raw_obs(agent_id, raw_obs) episode._set_last_info(agent_id, infos[env_id].get(agent_id, {})) # Record transition info if 
applicable if (last_observation is not None and infos[env_id].get( agent_id, {}).get("training_enabled", True)): #===MOD=== additional_builders_ids.add(agent_id) #===MOD=== episode.batch_builder.add_values( agent_id, policy_id, t=episode.length - 1, eps_id=episode.episode_id, agent_index=episode._agent_index(agent_id), obs=last_observation, actions=episode.last_action_for(agent_id), rewards=rewards[env_id][agent_id], prev_actions=episode.prev_action_for(agent_id), prev_rewards=episode.prev_reward_for(agent_id), dones=(False if (no_done_at_end or (hit_horizon and soft_horizon)) else agent_done), infos=infos[env_id].get(agent_id, {}), new_obs=filtered_obs, **episode.last_pi_info_for(agent_id)) #===MOD=== if agent_done: # Does it make sense to remove agent id from `agent_builders`? dead_policies.add(policy_id) print("Removing agent id from agent builders: %s" % str(agent_id)) episode.batch_builder.agent_builders.pop(agent_id) if policy_id in to_eval: to_eval.pop(policy_id) # print("Popping policy id from toeval.") #===MOD=== start = time.time() #===MOD=== print("sampler.py: ids added to agent builders:\t", additional_builders_ids) # Update ``self.policy_map`` in ``MultiAgentSampleBatchBuilder``. # TODO: policies is not being pruned in this file. episode.batch_builder.policy_map = policies print("sampler.py: policies: \t", policies.keys()) #===MOD=== # Invoke the step callback after the step is logged to the episode if callbacks.get("on_episode_step"): callbacks["on_episode_step"]({"env": base_env, "episode": episode}) # Cut the batch if we're not packing multiple episodes into one, # or if we've exceeded the requested batch size. if episode.batch_builder.has_pending_data(): if dones[env_id]["__all__"] and not no_done_at_end: episode.batch_builder.check_missing_dones() if (all_done and not pack) or \ episode.batch_builder.count >= unroll_length: outputs.append(episode.batch_builder.build_and_reset(episode)) elif all_done: # Make sure postprocessor stays within one episode # KEYERROR episode.batch_builder.postprocess_batch_so_far(episode) if all_done: # Handle episode termination batch_builder_pool.append(episode.batch_builder) if callbacks.get("on_episode_end"): callbacks["on_episode_end"]({ "env": base_env, "policy": policies, "episode": episode }) if hit_horizon and soft_horizon: episode.soft_reset() resetted_obs = agent_obs else: del active_episodes[env_id] resetted_obs = base_env.try_reset(env_id) if resetted_obs is None: # Reset not supported, drop this env from the ready list if horizon != float("inf"): raise ValueError( "Setting episode horizon requires reset() support " "from the environment.") elif resetted_obs != ASYNC_RESET_RETURN: # print("Executing new epsiode non-async return.") time.sleep(1) raise NotImplementedError( "Multiple episodes not supported by design.") # Creates a new episode if this is not async return # If reset is async, we will get its result in some future poll episode = active_episodes[env_id] for agent_id, raw_obs in resetted_obs.items(): #===MOD=== policy_id, policy_constructor_tuple = episode.policy_for( agent_id) # with tf_sess.as_default(): pols_tuple = generate_policies( policy_id, policy_constructor_tuple, policies, policies_to_train, dead_policies, policy_config, preprocessors, obs_filters, observation_filter, tf_sess, ) policies, preprocessors, obs_filters, policies_to_train, dead_policies = pols_tuple #===MOD=== policy = _get_or_raise(policies, policy_id) prep_obs = _get_or_raise(preprocessors, policy_id).transform(raw_obs) filtered_obs = 
_get_or_raise(obs_filters, policy_id)(prep_obs) episode._set_last_observation(agent_id, filtered_obs) to_eval[policy_id].append( PolicyEvalData( env_id, agent_id, filtered_obs, episode.last_info_for(agent_id) or {}, episode.rnn_state_for(agent_id), np.zeros_like( _flatten_action(policy.action_space.sample())), 0.0)) #===MOD=== pols_tuple = (policies, preprocessors, obs_filters, policies_to_train, dead_policies) #===MOD=== #===MOD=== return active_envs, to_eval, outputs, pols_tuple
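The samplers here lean on `_get_or_raise` for every policy-keyed lookup; presumably it is along these lines (fail fast with a readable error instead of a bare `KeyError`):

def _get_or_raise(mapping, policy_id):
    # Look up a policy-keyed resource (policy, preprocessor, filter) and
    # raise a descriptive error if the id is unknown.
    if policy_id not in mapping:
        raise ValueError("Could not find policy for id: {}".format(policy_id))
    return mapping[policy_id]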
def rollout(agent, env, env_name, num_steps=None, require_frame=False, require_trajectory=False, require_extra_info=False, require_full_frame=False, require_env_state=False, render_mode="rgb_array", num_rollouts=1, multiagent_environment=False): assert require_frame or require_trajectory or require_extra_info or \ require_env_state, "You must ask for some output!" # if num_steps is None: # num_steps = 3000 policy_agent_mapping = default_policy_agent_mapping if isinstance(agent, SymbolicAgentBase): agent = agent.get()['agent'] print("Successfully restore agent at remote worker " "from symbolic agent!") if hasattr(agent, "workers"): # env = agent.workers.local_worker().env multiagent = isinstance(env, MultiAgentEnv) if agent.workers.local_worker().multiagent: policy_agent_mapping = agent.config["multiagent"][ "policy_mapping_fn"] policy_map = agent.workers.local_worker().policy_map state_init = {p: m.get_initial_state() for p, m in policy_map.items()} use_lstm = {p: len(s) > 0 for p, s in state_init.items()} action_init = { p: m.action_space.sample() for p, m in policy_map.items() } else: # env = gym.make(env_name) multiagent = multiagent_environment policy = agent.policy state_init = {DEFAULT_POLICY_ID: None} use_lstm = {p: None for p, s in state_init.items()} action_init = {DEFAULT_POLICY_ID: policy.action_space.sample()} steps = 0 now = time.time() start = now result_list = [] # while steps < (num_steps or steps + 1): for i in range(num_rollouts): if require_trajectory: trajectory = [] if require_frame: frames = [] # assert env_name in ["BipedalWalker-v2"] frame_extra_info = { "value_function": [], "reward": [], "done": [], "step": [], "period_info": [] } if require_extra_info: extra_infos = [] if require_env_state: env_states = [] mapping_cache = {} # in case policy_agent_mapping is stochastic obs = env.reset() if require_env_state: env_states.append(copy.deepcopy(env.get_state_wrap())) agent_states = DefaultMapping( lambda agent_id: state_init[mapping_cache[agent_id]]) prev_actions = DefaultMapping( lambda agent_id: action_init[mapping_cache[agent_id]]) prev_rewards = collections.defaultdict(lambda: 0.) 
done = False reward_total = 0.0 if not multiagent else {} while not done and steps < (num_steps or steps + 1): if steps % LOG_INTERVAL_STEPS == (LOG_INTERVAL_STEPS - 1): logging.info("Current Steps: {}, Time Elapsed: {:.2f}s, " "Last {} Steps Time: {:.2f}s".format( steps, time.time() - start, LOG_INTERVAL_STEPS, time.time() - now)) now = time.time() multi_obs = obs if multiagent else {_DUMMY_AGENT_ID: obs} action_dict = {} value_functions = {} for agent_id, a_obs in multi_obs.items(): if a_obs is not None: policy_id = mapping_cache.setdefault( agent_id, policy_agent_mapping(agent_id)) p_use_lstm = use_lstm[policy_id] if p_use_lstm: a_action, p_state, a_info = agent.compute_action( a_obs, state=agent_states[agent_id], prev_action=prev_actions[agent_id], prev_reward=prev_rewards[agent_id], policy_id=policy_id) agent_states[agent_id] = p_state else: # This is a workaround if "ES" in agent._name: a_action = agent.compute_action(a_obs) a_info = {} else: a_action, _, a_info = agent.compute_action( a_obs, prev_action=prev_actions[agent_id], prev_reward=prev_rewards[agent_id], policy_id=policy_id, full_fetch=True) a_action = _flatten_action(a_action) # tuple actions action_dict[agent_id] = a_action prev_actions[agent_id] = a_action if require_extra_info: extra_infos.append(a_info) # This is a work around if "ES" not in agent._name: value_functions[agent_id] = a_info["vf_preds"] # This is a work around if require_frame and ("ES" not in agent._name): frame_extra_info['value_function'].append( value_functions[_DUMMY_AGENT_ID]) if multiagent_environment: action = action_dict else: action = action_dict[_DUMMY_AGENT_ID] next_obs, reward, raw_done, _ = env.step(action) if multiagent: done = raw_done["__all__"] for rewk, rewv in reward.items(): if rewk not in reward_total: reward_total[rewk] = 0.0 reward_total[rewk] += rewv # # # reward_total += sum(reward.values()) else: done = raw_done reward_total += reward if require_frame: frame_extra_info["done"].append(raw_done) frame_extra_info["reward"].append(copy.deepcopy(reward_total)) frame_extra_info["step"].append(steps) # data required for calculating period. # we observe the channel 7 and 12 which represent the speed # of the knee joints. # This only hold for BipedalWalker-v2. if env_name in ENV_NAME_PERIOD_FEATURE_LOOKUP.keys(): assert obs.ndim == 1 period_feature = ENV_NAME_PERIOD_FEATURE_LOOKUP[env_name] frame_extra_info["period_info"].append(obs[period_feature]) if multiagent: for agent_id, r in reward.items(): prev_rewards[agent_id] = r else: prev_rewards[_DUMMY_AGENT_ID] = reward kwargs = {"mode": render_mode if require_full_frame else "cropped"} # This copy() is really important! # Otherwise see error: pyarrow.lib.ArrowInvalid if require_frame: frame = env.render(**kwargs).copy() frames.append(frame) if require_trajectory: trajectory.append([obs, action, next_obs, reward, done]) if require_env_state: env_states.append(copy.deepcopy(env.get_state_wrap())) steps += 1 obs = next_obs logging.info("Episode reward", reward_total) result = {} if require_frame: result['frames'] = np.stack(frames) result['frame_extra_info'] = frame_extra_info if require_trajectory: result['trajectory'] = trajectory if require_extra_info: extra_info_dict = {k: [] for k in extra_infos[0].keys()} for item in extra_infos: for k, v in item.items(): extra_info_dict[k].append(v) result["extra_info"] = extra_info_dict if require_env_state: result['env_states'] = env_states result_list.append(result) if num_rollouts == 1: return result_list[0] return result_list
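A hypothetical call into the `rollout()` above, collecting frames and a trajectory for one episode and writing the frames to disk (assumes imageio with ffmpeg support; environment name and output path are placeholders):

import imageio

result = rollout(agent, env, "BipedalWalker-v2", num_steps=1000,
                 require_frame=True, require_trajectory=True)
imageio.mimsave("rollout.mp4", result["frames"], fps=50)  # (T, H, W, C) frames
print("collected", len(result["trajectory"]), "transitions")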
def run_rollout(env, agent, multiagent, use_lstm, policy_agent_mapping, state_init, action_init, num_rollouts, render, adv_num=None): rewards = [] step_nums = [] # actually do the rollout for r_itr in range(num_rollouts): mapping_cache = {} # in case policy_agent_mapping is stochastic agent_states = DefaultMapping( lambda agent_id: state_init[mapping_cache[agent_id]]) prev_actions = DefaultMapping( lambda agent_id: action_init[mapping_cache[agent_id]]) if adv_num is not None: env.curr_adversary = adv_num # `is not None`, so adversary index 0 is applied too obs = env.reset() prev_rewards = collections.defaultdict(lambda: 0.) done = False reward_total = 0.0 step_num = 0 while not done: step_num += 1 if adv_num is not None: multi_obs = { 'agent': obs['agent'], 'adversary{}'.format(adv_num): obs['agent'] } else: multi_obs = { 'agent': obs['agent'] } if multiagent else { _DUMMY_AGENT_ID: obs } action_dict = {} for agent_id, a_obs in multi_obs.items(): if a_obs is not None: policy_id = mapping_cache.setdefault( agent_id, policy_agent_mapping(agent_id)) p_use_lstm = use_lstm[policy_id] if p_use_lstm: prev_action = _flatten_action(prev_actions[agent_id]) a_action, p_state, _ = agent.compute_action( a_obs, state=agent_states[agent_id], prev_action=prev_action, prev_reward=prev_rewards[agent_id], policy_id=policy_id) agent_states[agent_id] = p_state else: prev_action = _flatten_action(prev_actions[agent_id]) a_action = agent.compute_action( a_obs, prev_action=prev_action, prev_reward=prev_rewards[agent_id], policy_id=policy_id) # handle the tuple case if len(a_action) > 1: if isinstance(a_action[0], np.ndarray): a_action[0] = a_action[0].flatten() action_dict[agent_id] = a_action prev_action = _flatten_action(a_action) # tuple actions prev_actions[agent_id] = prev_action action = action_dict action = action if multiagent else action[_DUMMY_AGENT_ID] if adv_num is not None: action = { 'agent': action['agent'], 'adversary0': action['adversary{}'.format(adv_num)] } # we turn the adversaries off so you only send in the pendulum keys next_obs, reward, done, info = env.step(action) if render: env.render() if isinstance(done, dict): done = done['__all__'] if multiagent: for agent_id, r in reward.items(): prev_rewards[agent_id] = r else: prev_rewards[_DUMMY_AGENT_ID] = reward # we only want the robot reward, not the adversary reward reward_total += info['agent']['agent_reward'] obs = next_obs print("Episode reward", reward_total) rewards.append(reward_total) step_nums.append(step_num) env.close() print('the average reward is ', np.mean(rewards)) return rewards, step_nums
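All of these rollout loops build `DefaultMapping` objects keyed by agent id; a sketch of that helper as used here (a `defaultdict` whose factory receives the missing key rather than being called with no arguments):

import collections

class DefaultMapping(collections.defaultdict):
    """defaultdict that calls default_factory with the missing key."""

    def __missing__(self, key):
        self[key] = value = self.default_factory(key)
        return value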
def _process_observations(base_env, policies, batch_builder_pool, active_episodes, unfiltered_obs, rewards, dones, infos, off_policy_actions, horizon, preprocessors, obs_filters, unroll_length, pack, callbacks): """Record new data from the environment and prepare for policy evaluation. Returns: active_envs: set of non-terminated env ids to_eval: map of policy_id to list of agent PolicyEvalData outputs: list of metrics and samples to return from the sampler """ active_envs = set() to_eval = defaultdict(list) outputs = [] # For each environment for env_id, agent_obs in unfiltered_obs.items(): new_episode = env_id not in active_episodes episode = active_episodes[env_id] if not new_episode: episode.length += 1 episode.batch_builder.count += 1 episode._add_agent_rewards(rewards[env_id]) global _large_batch_warned if (not _large_batch_warned and episode.batch_builder.total() > max(1000, unroll_length * 10)): _large_batch_warned = True logger.warning( "More than {} observations for {} env steps ".format( episode.batch_builder.total(), episode.batch_builder.count) + "are buffered in " "the sampler. If this is more than you expected, check that " "that you set a horizon on your environment correctly. Note " "that in multi-agent environments, `sample_batch_size` sets " "the batch size based on environment steps, not the steps of " "individual agents, which can result in unexpectedly large " "batches.") # Check episode termination conditions if dones[env_id]["__all__"] or episode.length >= horizon: all_done = True atari_metrics = _fetch_atari_metrics(base_env) if atari_metrics is not None: for m in atari_metrics: outputs.append( m._replace(custom_metrics=episode.custom_metrics)) else: outputs.append( RolloutMetrics(episode.length, episode.total_reward, dict(episode.agent_rewards), episode.custom_metrics)) else: all_done = False active_envs.add(env_id) # For each agent in the environment for agent_id, raw_obs in agent_obs.items(): policy_id = episode.policy_for(agent_id) prep_obs = _get_or_raise(preprocessors, policy_id).transform(raw_obs) filtered_obs = _get_or_raise(obs_filters, policy_id)(prep_obs) agent_done = bool(all_done or dones[env_id].get(agent_id)) if not agent_done: to_eval[policy_id].append( PolicyEvalData(env_id, agent_id, filtered_obs, infos[env_id].get(agent_id, {}), episode.rnn_state_for(agent_id), episode.last_action_for(agent_id), rewards[env_id][agent_id] or 0.0)) last_observation = episode.last_observation_for(agent_id) episode._set_last_observation(agent_id, filtered_obs) episode._set_last_raw_obs(agent_id, raw_obs) episode._set_last_info(agent_id, infos[env_id].get(agent_id, {})) # Record transition info if applicable if (last_observation is not None and infos[env_id].get( agent_id, {}).get("training_enabled", True)): episode.batch_builder.add_values( agent_id, policy_id, t=episode.length - 1, eps_id=episode.episode_id, agent_index=episode._agent_index(agent_id), obs=last_observation, actions=episode.last_action_for(agent_id), rewards=rewards[env_id][agent_id], prev_actions=episode.prev_action_for(agent_id), prev_rewards=episode.prev_reward_for(agent_id), dones=agent_done, infos=infos[env_id].get(agent_id, {}), new_obs=filtered_obs, **episode.last_pi_info_for(agent_id)) # Invoke the step callback after the step is logged to the episode if callbacks.get("on_episode_step"): callbacks["on_episode_step"]({"env": base_env, "episode": episode}) # Cut the batch if we're not packing multiple episodes into one, # or if we've exceeded the requested batch size. 
if episode.batch_builder.has_pending_data(): if dones[env_id]["__all__"]: episode.batch_builder.check_missing_dones() if (all_done and not pack) or \ episode.batch_builder.count >= unroll_length: outputs.append(episode.batch_builder.build_and_reset(episode)) elif all_done: # Make sure postprocessor stays within one episode episode.batch_builder.postprocess_batch_so_far(episode) if all_done: # Handle episode termination batch_builder_pool.append(episode.batch_builder) if callbacks.get("on_episode_end"): callbacks["on_episode_end"]({ "env": base_env, "policy": policies, "episode": episode }) del active_episodes[env_id] resetted_obs = base_env.try_reset(env_id) if resetted_obs is None: # Reset not supported, drop this env from the ready list if horizon != float("inf"): raise ValueError( "Setting episode horizon requires reset() support " "from the environment.") else: # Creates a new episode episode = active_episodes[env_id] for agent_id, raw_obs in resetted_obs.items(): policy_id = episode.policy_for(agent_id) policy = _get_or_raise(policies, policy_id) prep_obs = _get_or_raise(preprocessors, policy_id).transform(raw_obs) filtered_obs = _get_or_raise(obs_filters, policy_id)(prep_obs) episode._set_last_observation(agent_id, filtered_obs) to_eval[policy_id].append( PolicyEvalData( env_id, agent_id, filtered_obs, episode.last_info_for(agent_id) or {}, episode.rnn_state_for(agent_id), np.zeros_like( _flatten_action(policy.action_space.sample())), 0.0)) return active_envs, to_eval, outputs
def rollout(agent, env_name, num_steps, num_episodes=0, saver=None, no_render=True, monitor=False): policy_agent_mapping = default_policy_agent_mapping if saver is None: saver = RolloutSaver() if hasattr(agent, "workers"): env = agent.workers.local_worker().env multiagent = isinstance(env, MultiAgentEnv) if agent.workers.local_worker().multiagent: policy_agent_mapping = agent.config["multiagent"][ "policy_mapping_fn"] policy_map = agent.workers.local_worker().policy_map state_init = {p: m.get_initial_state() for p, m in policy_map.items()} use_lstm = {p: len(s) > 0 for p, s in state_init.items()} action_init = { #p: flatten_to_single_ndarray(m.action_space.sample()) # ray 0.8.5 p: _flatten_action(m.action_space.sample()) # ray 0.8.4 for p, m in policy_map.items() } else: env = gym.make(env_name) multiagent = False use_lstm = {DEFAULT_POLICY_ID: False} if monitor and not no_render and saver and saver.outfile is not None: # If monitoring has been requested, # manually wrap our environment with a gym monitor # which is set to record every episode. env = gym.wrappers.Monitor( env, os.path.join(os.path.dirname(saver.outfile), "monitor"), lambda x: True) steps = 0 episodes = 0 simulation_rewards = [] simulation_rewards_normalized = [] simulation_percentage_complete = [] simulation_steps = [] while keep_going(steps, num_steps, episodes, num_episodes): mapping_cache = {} # in case policy_agent_mapping is stochastic saver.begin_rollout() obs = env.reset() agent_states = DefaultMapping( lambda agent_id: state_init[mapping_cache[agent_id]]) prev_actions = DefaultMapping( lambda agent_id: action_init[mapping_cache[agent_id]]) prev_rewards = collections.defaultdict(lambda: 0.) done = False reward_total = 0.0 episode_steps = 0 episode_max_steps = 0 episode_num_agents = 0 agents_score = collections.defaultdict(lambda: 0.) 
agents_done = set() while not done and keep_going(steps, num_steps, episodes, num_episodes): multi_obs = obs if multiagent else {_DUMMY_AGENT_ID: obs} action_dict = {} for agent_id, a_obs in multi_obs.items(): if a_obs is not None: policy_id = mapping_cache.setdefault( agent_id, policy_agent_mapping(agent_id)) p_use_lstm = use_lstm[policy_id] if p_use_lstm: a_action, p_state, _ = agent.compute_action( a_obs, state=agent_states[agent_id], prev_action=prev_actions[agent_id], prev_reward=prev_rewards[agent_id], policy_id=policy_id) agent_states[agent_id] = p_state else: a_action = agent.compute_action( a_obs, prev_action=prev_actions[agent_id], prev_reward=prev_rewards[agent_id], policy_id=policy_id) #a_action = flatten_to_single_ndarray(a_action) # ray 0.8.5 a_action = _flatten_action(a_action) # tuple actions # ray 0.8.4 action_dict[agent_id] = a_action prev_actions[agent_id] = a_action action = action_dict action = action if multiagent else action[_DUMMY_AGENT_ID] next_obs, reward, done, info = env.step(action) if multiagent: for agent_id, r in reward.items(): prev_rewards[agent_id] = r else: prev_rewards[_DUMMY_AGENT_ID] = reward if multiagent: done = done["__all__"] reward_total += sum(reward.values()) else: reward_total += reward if not no_render: env.render() saver.append_step(obs, action, next_obs, reward, done, info) steps += 1 obs = next_obs for agent_id, agent_info in info.items(): if episode_max_steps == 0: episode_max_steps = agent_info["max_episode_steps"] episode_num_agents = agent_info["num_agents"] episode_steps = max(episode_steps, agent_info["agent_step"]) agents_score[agent_id] = agent_info["agent_score"] if agent_info["agent_done"]: agents_done.add(agent_id) episode_score = sum(agents_score.values()) simulation_rewards.append(episode_score) simulation_rewards_normalized.append(episode_score / (episode_max_steps + episode_num_agents)) simulation_percentage_complete.append(float(len(agents_done)) / episode_num_agents) simulation_steps.append(episode_steps) saver.end_rollout() print(f"Episode #{episodes}: " f"score: {episode_score:.2f} " f"({np.mean(simulation_rewards):.2f}), " f"normalized score: {simulation_rewards_normalized[-1]:.2f} " f"({np.mean(simulation_rewards_normalized):.2f}), " f"percentage_complete: {simulation_percentage_complete[-1]:.2f} " f"({np.mean(simulation_percentage_complete):.2f})") if done: episodes += 1 print("Evaluation completed:\n" f"Episodes: {episodes}\n" f"Mean Reward: {np.round(np.mean(simulation_rewards))}\n" f"Mean Normalized Reward: {np.round(np.mean(simulation_rewards_normalized))}\n" f"Mean Percentage Complete: {np.round(np.mean(simulation_percentage_complete), 3)}\n" f"Mean Steps: {np.round(np.mean(simulation_steps), 2)}") return { 'reward': [float(r) for r in simulation_rewards], 'reward_mean': np.mean(simulation_rewards), 'reward_std': np.std(simulation_rewards), 'normalized_reward': [float(r) for r in simulation_rewards_normalized], 'normalized_reward_mean': np.mean(simulation_rewards_normalized), 'normalized_reward_std': np.std(simulation_rewards_normalized), 'percentage_complete': [float(c) for c in simulation_percentage_complete], 'percentage_complete_mean': np.mean(simulation_percentage_complete), 'percentage_complete_std': np.std(simulation_percentage_complete), 'steps': [float(c) for c in simulation_steps], 'steps_mean': np.mean(simulation_steps), 'steps_std': np.std(simulation_steps), }
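The evaluation dict returned above mixes Python lists and numpy scalars; a hypothetical way to persist it for cross-checkpoint comparison:

import json

results = rollout(agent, env_name, num_steps=0, num_episodes=10)
with open("evaluation.json", "w") as f:
    # default=float converts the numpy scalars json cannot serialize
    json.dump(results, f, indent=2, default=float)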
def rollout(agent, env_name, num_steps, no_render=True): policy_agent_mapping = default_policy_agent_mapping if hasattr(agent, "workers"): env = agent.workers.local_worker().env multiagent = isinstance(env, MultiAgentEnv) if agent.workers.local_worker().multiagent: policy_agent_mapping = agent.config["multiagent"][ "policy_mapping_fn"] policy_map = agent.workers.local_worker().policy_map state_init = {p: m.get_initial_state() for p, m in policy_map.items()} use_lstm = {p: len(s) > 0 for p, s in state_init.items()} action_init = { p: _flatten_action(m.action_space.sample()) for p, m in policy_map.items() } else: env = gym.make(env_name) multiagent = False use_lstm = {DEFAULT_POLICY_ID: False} steps = 0 while steps < (num_steps or steps + 1): mapping_cache = {} # in case policy_agent_mapping is stochastic obs = env.reset() agent_states = DefaultMapping( lambda agent_id: state_init[mapping_cache[agent_id]]) prev_actions = DefaultMapping( lambda agent_id: action_init[mapping_cache[agent_id]]) prev_rewards = collections.defaultdict(lambda: 0.) done = False reward_total = 0.0 while not done and steps < (num_steps or steps + 1): multi_obs = obs if multiagent else {_DUMMY_AGENT_ID: obs} action_dict = {} for agent_id, a_obs in multi_obs.items(): if a_obs is not None: policy_id = mapping_cache.setdefault( agent_id, policy_agent_mapping(agent_id)) p_use_lstm = use_lstm[policy_id] if p_use_lstm: a_action, p_state, _ = agent.compute_action( a_obs, state=agent_states[agent_id], prev_action=prev_actions[agent_id], prev_reward=prev_rewards[agent_id], policy_id=policy_id) agent_states[agent_id] = p_state else: a_action = agent.compute_action( a_obs, prev_action=prev_actions[agent_id], prev_reward=prev_rewards[agent_id], policy_id=policy_id) a_action = _flatten_action(a_action) # tuple actions action_dict[agent_id] = a_action prev_actions[agent_id] = a_action action = action_dict action = action if multiagent else action[_DUMMY_AGENT_ID] next_obs, reward, done, _ = env.step(action) if multiagent: for agent_id, r in reward.items(): prev_rewards[agent_id] = r else: prev_rewards[_DUMMY_AGENT_ID] = reward if multiagent: done = done["__all__"] reward_total += sum(reward.values()) else: reward_total += reward if not no_render: env.render() steps += 1 obs = next_obs print("Episode reward", reward_total)
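Nearly every variant calls `_flatten_action` to normalize Tuple actions; a sketch of the behavior those call sites appear to assume (concatenate the components of a tuple action into one flat array, pass anything else through):

import numpy as np

def _flatten_action(action):
    # Tuple/list actions (e.g. from gym.spaces.Tuple) are concatenated
    # into a single 1-D ndarray; scalar/array actions pass through.
    if isinstance(action, (list, tuple)):
        return np.concatenate([np.asarray(a).ravel() for a in action])
    return action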
def _process_observations(async_vector_env, policies, batch_builder_pool, active_episodes, unfiltered_obs, rewards, dones, infos, off_policy_actions, horizon, obs_filters, unroll_length, pack, callbacks): """Record new data from the environment and prepare for policy evaluation. Returns: active_envs: set of non-terminated env ids to_eval: map of policy_id to list of agent PolicyEvalData outputs: list of metrics and samples to return from the sampler """ active_envs = set() to_eval = defaultdict(list) outputs = [] # For each environment for env_id, agent_obs in unfiltered_obs.items(): new_episode = env_id not in active_episodes episode = active_episodes[env_id] if not new_episode: episode.length += 1 episode.batch_builder.count += 1 episode._add_agent_rewards(rewards[env_id]) # Check episode termination conditions if dones[env_id]["__all__"] or episode.length >= horizon: all_done = True atari_metrics = _fetch_atari_metrics(async_vector_env) if atari_metrics is not None: for m in atari_metrics: outputs.append( m._replace(custom_metrics=episode.custom_metrics)) else: outputs.append( RolloutMetrics(episode.length, episode.total_reward, dict(episode.agent_rewards), episode.custom_metrics)) else: all_done = False active_envs.add(env_id) # For each agent in the environment for agent_id, raw_obs in agent_obs.items(): policy_id = episode.policy_for(agent_id) filtered_obs = _get_or_raise(obs_filters, policy_id)(raw_obs) agent_done = bool(all_done or dones[env_id].get(agent_id)) if not agent_done: to_eval[policy_id].append( PolicyEvalData(env_id, agent_id, filtered_obs, episode.rnn_state_for(agent_id), episode.last_action_for(agent_id), rewards[env_id][agent_id] or 0.0)) last_observation = episode.last_observation_for(agent_id) episode._set_last_observation(agent_id, filtered_obs) # Record transition info if applicable if last_observation is not None and \ infos[env_id][agent_id].get("training_enabled", True): episode.batch_builder.add_values( agent_id, policy_id, t=episode.length - 1, eps_id=episode.episode_id, agent_index=episode._agent_index(agent_id), obs=last_observation, actions=episode.last_action_for(agent_id), rewards=rewards[env_id][agent_id], prev_actions=episode.prev_action_for(agent_id), prev_rewards=episode.prev_reward_for(agent_id), dones=agent_done, infos=infos[env_id][agent_id], new_obs=filtered_obs, **episode.last_pi_info_for(agent_id)) # Invoke the step callback after the step is logged to the episode if callbacks.get("on_episode_step"): callbacks["on_episode_step"]({ "env": async_vector_env, "episode": episode }) # Cut the batch if we're not packing multiple episodes into one, # or if we've exceeded the requested batch size. 
if episode.batch_builder.has_pending_data(): if (all_done and not pack) or \ episode.batch_builder.count >= unroll_length: outputs.append(episode.batch_builder.build_and_reset(episode)) elif all_done: # Make sure postprocessor stays within one episode episode.batch_builder.postprocess_batch_so_far(episode) if all_done: # Handle episode termination batch_builder_pool.append(episode.batch_builder) if callbacks.get("on_episode_end"): callbacks["on_episode_end"]({ "env": async_vector_env, "episode": episode }) del active_episodes[env_id] resetted_obs = async_vector_env.try_reset(env_id) if resetted_obs is None: # Reset not supported, drop this env from the ready list if horizon != float("inf"): raise ValueError( "Setting episode horizon requires reset() support " "from the environment.") else: # Creates a new episode episode = active_episodes[env_id] for agent_id, raw_obs in resetted_obs.items(): policy_id = episode.policy_for(agent_id) policy = _get_or_raise(policies, policy_id) filtered_obs = _get_or_raise(obs_filters, policy_id)(raw_obs) episode._set_last_observation(agent_id, filtered_obs) to_eval[policy_id].append( PolicyEvalData( env_id, agent_id, filtered_obs, episode.rnn_state_for(agent_id), np.zeros_like( _flatten_action(policy.action_space.sample())), 0.0)) return active_envs, to_eval, outputs
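This older sampler applies `obs_filters[policy_id]` directly to the raw observation, so any callable of that shape will do; a minimal no-op filter satisfying the interface (hypothetical, e.g. for wiring up tests):

class NoopFilter:
    # Matches the call sites above: filtered_obs = obs_filters[policy_id](obs)
    def __call__(self, obs, update=True):
        return obs

obs_filters = {"default_policy": NoopFilter()}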
def rollout(agent, env_name, num_steps, out=None, no_render=False, irfs=True, noplot=False, num_episodes=1): Deltas = [] Del_irf = [] Obs_irf = [] policy_agent_mapping = default_policy_agent_mapping if hasattr(agent, "workers"): env = agent.workers.local_worker().env multiagent = isinstance(env, MultiAgentEnv) if agent.workers.local_worker().multiagent: policy_agent_mapping = agent.config["multiagent"][ "policy_mapping_fn"] policy_map = agent.workers.local_worker().policy_map state_init = {p: m.get_initial_state() for p, m in policy_map.items()} use_lstm = {p: len(s) > 0 for p, s in state_init.items()} action_init = { p: m.action_space.sample() for p, m in policy_map.items() } else: env = gym.make(env_name) multiagent = False use_lstm = {DEFAULT_POLICY_ID: False} if out is not None: rollouts = [] episode = 0 while episode < num_episodes: print(f'Episode {episode} of {num_episodes}') mapping_cache = {} # in case policy_agent_mapping is stochastic if out is not None: rollout = [] obs = env.reset() agent_states = DefaultMapping( lambda agent_id: state_init[mapping_cache[agent_id]]) prev_actions = DefaultMapping( lambda agent_id: action_init[mapping_cache[agent_id]]) prev_rewards = collections.defaultdict(lambda: 0.) done = False reward_total = 0.0 steps = 0 deltas = [] while not done and steps < (num_steps or steps + 1): multi_obs = obs if multiagent else {_DUMMY_AGENT_ID: obs} action_dict = {} for agent_id, a_obs in multi_obs.items(): if a_obs is not None: policy_id = mapping_cache.setdefault( agent_id, policy_agent_mapping(agent_id)) p_use_lstm = use_lstm[policy_id] if p_use_lstm: a_action, p_state, _ = agent.compute_action( a_obs, state=agent_states[agent_id], prev_action=prev_actions[agent_id], prev_reward=prev_rewards[agent_id], policy_id=policy_id) agent_states[agent_id] = p_state else: a_action = agent.compute_action( a_obs, prev_action=prev_actions[agent_id], prev_reward=prev_rewards[agent_id], policy_id=policy_id) a_action = _flatten_action(a_action) # tuple actions action_dict[agent_id] = a_action prev_actions[agent_id] = a_action action = action_dict action = action if multiagent else action[_DUMMY_AGENT_ID] next_obs, reward, done, info = env.step(action) #print(f'Step: {env.local_steps}, action: {action} info: {info}') deltas.append([info['agent_0']['delta'],info['agent_1']['delta']]) if multiagent: for agent_id, r in reward.items(): prev_rewards[agent_id] = r else: prev_rewards[_DUMMY_AGENT_ID] = reward if multiagent: done = done["__all__"] reward_total += sum(reward.values()) else: reward_total += reward if not no_render: env.render() if out is not None: rollout.append([obs, action, next_obs, reward, done]) steps += 1 obs = next_obs if out is not None: rollouts.append(rollout) print("Episode reward", reward_total) if noplot == False: plt.plot(deltas) plt.show() sns.kdeplot(deltas, shade=True, cbar=True, cmap='Blues') plt.show() # === Code for impulse response functions === if irfs == True: del_irf = [] obs_irf = [] del_irf.append([info['agent_0']['delta'],info['agent_1']['delta']]) obs_irf.append(obs['agent_0']) for count in range(100): multi_obs = obs if multiagent else {_DUMMY_AGENT_ID: obs} action_dict = {} for agent_id, a_obs in multi_obs.items(): if a_obs is not None: policy_id = mapping_cache.setdefault( agent_id, policy_agent_mapping(agent_id)) p_use_lstm = use_lstm[policy_id] if p_use_lstm: a_action, p_state, _ = agent.compute_action( a_obs, state=agent_states[agent_id], prev_action=prev_actions[agent_id], prev_reward=prev_rewards[agent_id], policy_id=policy_id) 
agent_states[agent_id] = p_state else: a_action = agent.compute_action( a_obs, prev_action=prev_actions[agent_id], prev_reward=prev_rewards[agent_id], policy_id=policy_id) a_action = _flatten_action(a_action) # tuple actions if count < 2 and agent_id == 'agent_0': action_dict[agent_id] = 0 prev_actions[agent_id] = 0 else: action_dict[agent_id] = a_action prev_actions[agent_id] = a_action action = action_dict action = action if multiagent else action[_DUMMY_AGENT_ID] next_obs, reward, done, info = env.step(action) del_irf.append([info['agent_0']['delta'], info['agent_1']['delta']]) obs_irf.append(obs['agent_0']) obs = next_obs # advance the observation so later IRF steps react to the impulse rather than replaying the initial obs if not noplot: plt.plot(del_irf) plt.show() plt.plot(obs_irf) plt.show() episode += 1 Deltas.append(deltas) Del_irf.append(del_irf) Obs_irf.append(obs_irf) if out is not None: pickle.dump(rollouts, open(out, "wb")) return Deltas, Del_irf, Obs_irf
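A hypothetical way to aggregate the impulse-response traces returned above, averaging across episodes before plotting (the environment name is a placeholder):

import numpy as np
import matplotlib.pyplot as plt

Deltas, Del_irf, Obs_irf = rollout(agent, "duopoly-v0", num_steps=500,
                                   noplot=True, num_episodes=20)
mean_irf = np.mean(np.asarray(Del_irf), axis=0)  # (T+1, 2): one column per agent
plt.plot(mean_irf)
plt.title("Mean impulse response across episodes")
plt.show()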
def visualize_adversaries(rllib_config, checkpoint, grid_size, num_rollouts, outdir): env, agent, multiagent, use_lstm, policy_agent_mapping, state_init, action_init = \ instantiate_rollout(rllib_config, checkpoint) # figure out how many adversaries you have and initialize their grids num_adversaries = env.num_adversaries adversary_grid_dict = {} kl_grid = np.zeros((num_adversaries, num_adversaries)) for i in range(num_adversaries): adversary_str = 'adversary' + str(i) # each adversary grid is a map of agent action versus observation dimension adversary_grid = np.zeros((grid_size - 1, grid_size - 1, env.observation_space.low.shape[0], env.adv_action_space.low.shape[0])).astype(int) strength_grid = np.linspace(env.adv_action_space.low, env.adv_action_space.high, grid_size).T obs_grid = np.linspace(env.observation_space.low, env.observation_space.high, grid_size).T adversary_grid_dict[adversary_str] = {'grid': adversary_grid, 'action_bins': strength_grid, 'obs_bins': obs_grid, 'action_list': []} total_steps = 0 # env.should_render = True # actually do the rollout for r_itr in range(num_rollouts): print('On iteration {}'.format(r_itr)) mapping_cache = {} # in case policy_agent_mapping is stochastic agent_states = DefaultMapping( lambda agent_id: state_init[mapping_cache[agent_id]]) prev_actions = DefaultMapping( lambda agent_id: action_init[mapping_cache[agent_id]]) obs = env.reset() prev_rewards = collections.defaultdict(lambda: 0.) done = False reward_total = 0.0 step_num = 0 while not done: multi_obs = obs if multiagent else {_DUMMY_AGENT_ID: obs} obs = multi_obs['agent'] * env.obs_norm if isinstance(env.adv_observation_space, dict): multi_obs = {'adversary{}'.format(i): {'obs': obs, 'is_active': np.array([1])} for i in range(env.num_adversaries)} else: multi_obs = {'adversary{}'.format(i): obs for i in range(env.num_adversaries)} multi_obs.update({'agent': obs}) action_dict = {} action_dist_dict = {} for agent_id, a_obs in multi_obs.items(): if a_obs is not None: policy_id = mapping_cache.setdefault( agent_id, policy_agent_mapping(agent_id)) policy = agent.get_policy(policy_id) p_use_lstm = use_lstm[policy_id] if p_use_lstm: prev_action = _flatten_action(prev_actions[agent_id]) a_action, p_state, _ = agent.compute_action( a_obs, state=agent_states[agent_id], prev_action=prev_action, prev_reward=prev_rewards[agent_id], policy_id=policy_id) agent_states[agent_id] = p_state if isinstance(a_obs, dict): flat_obs = np.concatenate([val for val in a_obs.values()])[np.newaxis, :] else: flat_obs = _flatten_action(a_obs)[np.newaxis, :] logits, _ = policy.model.from_batch({"obs": flat_obs, "prev_action": prev_action}) else: if isinstance(a_obs, dict): flat_obs = np.concatenate([val for val in a_obs.values()])[np.newaxis, :] else: flat_obs = _flatten_action(a_obs)[np.newaxis, :] logits, _ = policy.model.from_batch({"obs": flat_obs}) prev_action = _flatten_action(prev_actions[agent_id]) flat_action = _flatten_action(a_obs) a_action = agent.compute_action( flat_action, prev_action=prev_action, prev_reward=prev_rewards[agent_id], policy_id=policy_id) # handle the tuple case if len(a_action) > 1: if isinstance(a_action[0], np.ndarray): a_action[0] = a_action[0].flatten() action_dict[agent_id] = a_action action_dist_dict[agent_id] = DiagGaussian(logits, None) prev_action = _flatten_action(a_action) # tuple actions prev_actions[agent_id] = prev_action # Now store the agent action in the corresponding grid if agent_id != 'agent': action_bins = adversary_grid_dict[agent_id]['action_bins'] obs_bins = 
adversary_grid_dict[agent_id]['obs_bins'] heat_map = adversary_grid_dict[agent_id]['grid'] for action_loop_index, action in enumerate(a_action): adversary_grid_dict[agent_id]['action_list'].append(a_action[0]) action_index = np.digitize(action, action_bins[action_loop_index, :]) - 1 # digitize will set the right edge of the box to the wrong value if action_index == heat_map.shape[0]: action_index -= 1 for obs_loop_index, obs_elem in enumerate(obs): obs_index = np.digitize(obs_elem, obs_bins[obs_loop_index, :]) - 1 if obs_index == heat_map.shape[1]: obs_index -= 1 heat_map[action_index, obs_index, obs_loop_index, action_loop_index] += 1 for agent_id in multi_obs.keys(): if agent_id != 'agent': # Now iterate through the agents and compute the kl_diff curr_id = int(agent_id.split('adversary')[1]) your_action_dist = action_dist_dict[agent_id] # mean, log_std = np.split(your_logits[0], 2) for i in range(num_adversaries): # KL diff of something with itself is zero if i == curr_id: pass # otherwise just compute the kl difference between the agents else: other_action_dist = action_dist_dict['adversary{}'.format(i)] # other_mean, other_log_std = np.split(other_logits.numpy()[0], 2) kl_diff = your_action_dist.kl(other_action_dist) kl_grid[curr_id, i] += kl_diff action = action_dict action = action if multiagent else action[_DUMMY_AGENT_ID] # we turn the adversaries off so you only send in the pendulum keys new_dict = {} new_dict.update({'agent': action['agent']}) next_obs, reward, done, info = env.step(new_dict) if isinstance(done, dict): done = done['__all__'] step_num += 1 if multiagent: for agent_id, r in reward.items(): prev_rewards[agent_id] = r else: prev_rewards[_DUMMY_AGENT_ID] = reward # we only want the robot reward, not the adversary reward reward_total += info['agent']['agent_reward'] obs = next_obs total_steps += step_num file_path = os.path.dirname(os.path.abspath(__file__)) output_file_path = os.path.join(file_path, outdir) if not os.path.exists(output_file_path): try: os.makedirs(os.path.dirname(output_file_path)) except OSError as exc: if exc.errno != errno.EEXIST: raise # Plot the heatmap of the actions for adversary, adv_dict in adversary_grid_dict.items(): heat_map = adv_dict['grid'] action_bins = adv_dict['action_bins'] obs_bins = adv_dict['obs_bins'] action_list = adv_dict['action_list'] plt.figure() sns.distplot(action_list) output_str = '{}/{}'.format(outdir, adversary + 'action_histogram.png') plt.savefig(output_str) # x_label, y_label = env.transform_adversary_actions(bins) # ax = sns.heatmap(heat_map, annot=True, fmt="d") xtitles = ['x', 'xdot', 'theta', 'thetadot'] ytitles = ['ax', 'ay'] for obs_idx in range(heat_map.shape[-2]): for a_idx in range(heat_map.shape[-1]): plt.figure() # increasing the row index implies moving down on the y axis # label each panel with its own action/observation bins sns.heatmap(heat_map[:, :, obs_idx, a_idx], yticklabels=np.round(action_bins[a_idx], 1), xticklabels=np.round(obs_bins[obs_idx], 1)) plt.ylabel(ytitles[a_idx]) plt.xlabel(xtitles[obs_idx]) output_str = '{}/{}'.format(outdir, adversary + 'action_heatmap_{}_{}.png'.format(xtitles[obs_idx], ytitles[a_idx])) plt.savefig(output_str) # Plot the kl difference between agents plt.figure() sns.heatmap(kl_grid / total_steps) output_str = '{}/{}'.format(outdir, 'kl_heatmap.png') plt.savefig(output_str)
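The binning code above works around an `np.digitize` edge case; a standalone illustration of why the clamp is needed (bin counts here are arbitrary):

import numpy as np

bins = np.linspace(-1.0, 1.0, 5)      # 5 edges -> 4 boxes, indexed 0..3
idx = np.digitize(1.0, bins) - 1      # a sample on the last edge gives 4
if idx == len(bins) - 1:              # i.e. one past the valid box range
    idx -= 1                          # fold it back into the final box
assert idx == 3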
def rollout(agent, env_name, num_steps, num_episodes=0, saver=RolloutSaver(), no_render=True, monitor=False): policy_agent_mapping = default_policy_agent_mapping if hasattr(agent, "workers"): env = agent.workers.local_worker().env multiagent = isinstance(env, MultiAgentEnv) if agent.workers.local_worker().multiagent: policy_agent_mapping = agent.config["multiagent"][ "policy_mapping_fn"] policy_map = agent.workers.local_worker().policy_map state_init = {p: m.get_initial_state() for p, m in policy_map.items()} use_lstm = {p: len(s) > 0 for p, s in state_init.items()} action_init = { p: _flatten_action(m.action_space.sample()) for p, m in policy_map.items() } else: env = gym.make(env_name) multiagent = False use_lstm = {DEFAULT_POLICY_ID: False} if monitor and not no_render and saver and saver.outfile is not None: # If monitoring has been requested, # manually wrap our environment with a gym monitor # which is set to record every episode. env = gym.wrappers.Monitor( env, os.path.join(os.path.dirname(saver.outfile), "monitor"), lambda x: True) steps = 0 episodes = 0 while keep_going(steps, num_steps, episodes, num_episodes): mapping_cache = {} # in case policy_agent_mapping is stochastic saver.begin_rollout() obs = env.reset() agent_states = DefaultMapping( lambda agent_id: state_init[mapping_cache[agent_id]]) prev_actions = DefaultMapping( lambda agent_id: action_init[mapping_cache[agent_id]]) prev_rewards = collections.defaultdict(lambda: 0.) done = False reward_total = 0.0 while not done and keep_going(steps, num_steps, episodes, num_episodes): multi_obs = obs if multiagent else {_DUMMY_AGENT_ID: obs} action_dict = {} for agent_id, a_obs in multi_obs.items(): if a_obs is not None: policy_id = mapping_cache.setdefault( agent_id, policy_agent_mapping(agent_id)) p_use_lstm = use_lstm[policy_id] if p_use_lstm: a_action, p_state, _ = agent.compute_action( a_obs, state=agent_states[agent_id], prev_action=prev_actions[agent_id], prev_reward=prev_rewards[agent_id], policy_id=policy_id) agent_states[agent_id] = p_state else: a_action = agent.compute_action( a_obs, prev_action=prev_actions[agent_id], prev_reward=prev_rewards[agent_id], policy_id=policy_id) a_action = _flatten_action(a_action) # tuple actions action_dict[agent_id] = a_action prev_actions[agent_id] = a_action action = action_dict action = action if multiagent else action[_DUMMY_AGENT_ID] next_obs, reward, done, info = env.step(action) if multiagent: for agent_id, r in reward.items(): prev_rewards[agent_id] = r else: prev_rewards[_DUMMY_AGENT_ID] = reward if multiagent: done = done["__all__"] reward_total += sum(reward.values()) else: reward_total += reward if not no_render: env.render() saver.append_step(obs, action, next_obs, reward, done, info) steps += 1 obs = next_obs saver.end_rollout() print("Episode #{}: reward: {}".format(episodes, reward_total)) if done: episodes += 1
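Several variants gate their loops on `keep_going`; a sketch of the termination rule those loops appear to assume (a zero/None budget means unlimited):

def keep_going(steps, num_steps, episodes, num_episodes):
    # Stop once the episode budget is exhausted; otherwise fall back to
    # the step budget; with neither set, keep running.
    if num_episodes:
        return episodes < num_episodes
    if num_steps:
        return steps < num_steps
    return True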
def run_bottleneck(args, inflow_rate, num_trials): result_dir = args.result_dir if args.result_dir[-1] != '/' \ else args.result_dir[:-1] config = get_rllib_pkl(result_dir) # Run on only one cpu for rendering purposes config['num_workers'] = 0 flow_params = get_flow_params(config) # Determine agent and checkpoint config_run = config['env_config']['run'] if 'run' in config['env_config'] \ else None if args.run and config_run: if args.run != config_run: print('visualizer_rllib.py: error: run argument ' + '\'{}\' passed in '.format(args.run) + 'differs from the one stored in params.json ' + '\'{}\''.format(config_run)) sys.exit(1) if args.run: agent_cls = get_agent_class(args.run) elif config_run: agent_cls = get_agent_class(config_run) else: print('visualizer_rllib.py: error: could not find flow parameter ' '\'run\' in params.json, ' 'add argument --run to provide the algorithm or model used ' 'to train the results\n e.g. ' 'python ./visualizer_rllib.py /tmp/ray/result_dir 1 --run PPO') sys.exit(1) # if using a custom model if config['model']['custom_model'] == "cc_model": if config['model']['use_lstm']: ModelCatalog.register_custom_model("cc_model", CentralizedCriticModelRNN) else: ModelCatalog.register_custom_model("cc_model", CentralizedCriticModel) from flow.agents.centralized_PPO import CCTrainer agent_cls = CCTrainer elif config['model']['custom_model'] == "GRU": ModelCatalog.register_custom_model("GRU", GRU) elif config['model']['custom_model'] == "FeedForward": ModelCatalog.register_custom_model("FeedForward", FeedForward) # If we trained by imitating if "imitation_weight" in config['model']['custom_options'].keys(): from flow.agents.ImitationPPO import ImitationTrainer agent_cls = ImitationTrainer sim_params = flow_params['sim'] sim_params.restart_instance = False dir_path = os.path.dirname(os.path.realpath(__file__)) emission_path = '{0}/test_time_rollout/'.format(dir_path) sim_params.emission_path = emission_path if args.gen_emission else None # pick your rendering mode if args.render_mode == 'sumo_web3d': sim_params.num_clients = 2 sim_params.render = False elif args.render_mode == 'drgb': sim_params.render = 'drgb' sim_params.pxpm = 4 elif args.render_mode == 'sumo_gui': sim_params.render = False elif args.render_mode == 'no_render': sim_params.render = False if args.save_render: sim_params.render = 'drgb' sim_params.pxpm = 4 sim_params.save_render = True # Start the environment with the gui turned on and a path for the # emission file env_params = flow_params['env'] # TODO(@evinitsky) remove this this is a backwards compatibility hack if 'life_penalty' not in env_params.additional_params.keys(): env_params.additional_params['life_penalty'] = - 3 if args.evaluate: env_params.evaluate = True # lower the horizon if testing if args.horizon: config['horizon'] = args.horizon env_params.horizon = args.horizon # Create and register a gym+rllib env create_env, env_name = make_create_env(params=flow_params, version=0) register_env(env_name, create_env) # create the agent that will be used to compute the actions agent = agent_cls(env=env_name, config=config) checkpoint = result_dir + '/checkpoint_' + args.checkpoint_num checkpoint = checkpoint + '/checkpoint-' + args.checkpoint_num agent.restore(checkpoint) policy_agent_mapping = default_policy_agent_mapping if hasattr(agent, "workers"): env = agent.workers.local_worker().env multiagent = isinstance(env, MultiAgentEnv) if agent.workers.local_worker().multiagent: policy_agent_mapping = agent.config["multiagent"][ "policy_mapping_fn"] 
policy_map = agent.workers.local_worker().policy_map state_init = {p: m.get_initial_state() for p, m in policy_map.items()} use_lstm = {p: len(s) > 0 for p, s in state_init.items()} action_init = { p: m.action_space.sample() for p, m in policy_map.items() } else: env = gym.make(env_name) multiagent = False use_lstm = {DEFAULT_POLICY_ID: False} if args.render_mode == 'sumo_gui': env.sim_params.render = True # set to True after initializing agent and env # Simulate and collect metrics outflow_arr = [] final_outflows = [] final_inflows = [] mean_speed = [] std_speed = [] mean_rewards = [] per_agent_rew = collections.defaultdict(lambda: 0.0) # keep track of the last 500 points of velocity data for lane 0 # and 1 in edge 4 velocity_arr = [] vel = [] mapping_cache = {} # in case policy_agent_mapping is stochastic for j in range(num_trials): agent_states = DefaultMapping( lambda agent_id: state_init[mapping_cache[agent_id]]) prev_actions = DefaultMapping( lambda agent_id: action_init[mapping_cache[agent_id]]) prev_rewards = collections.defaultdict(lambda: 0.0) done = False reward_total = 0.0 obs = env.reset(inflow_rate) k = 0 while k < env_params.horizon and not done: vehicles = env.unwrapped.k.vehicle vel.append(np.mean(vehicles.get_speed(vehicles.get_ids()))) # don't start recording till we have hit the warmup time if k >= env_params.horizon - args.end_len: vehs_on_four = vehicles.get_ids_by_edge('4') lanes = vehicles.get_lane(vehs_on_four) lane_dict = {veh_id: lane for veh_id, lane in zip(vehs_on_four, lanes)} sort_by_lane = sorted(vehs_on_four, key=lambda x: lane_dict[x]) num_zeros = lanes.count(0) if num_zeros > 0: speed_on_zero = np.mean(vehicles.get_speed( sort_by_lane[0:num_zeros])) else: speed_on_zero = 0.0 if num_zeros < len(vehs_on_four): speed_on_one = np.mean(vehicles.get_speed( sort_by_lane[num_zeros:])) else: speed_on_one = 0.0 velocity_arr.append( [inflow_rate, speed_on_zero, speed_on_one]) multi_obs = obs if multiagent else {_DUMMY_AGENT_ID: obs} action_dict = {} for agent_id, a_obs in multi_obs.items(): if a_obs is not None: policy_id = mapping_cache.setdefault( agent_id, policy_agent_mapping(agent_id)) p_use_lstm = use_lstm[policy_id] if p_use_lstm: a_action, p_state, _ = agent.compute_action( a_obs, state=agent_states[agent_id], prev_action=prev_actions[agent_id], prev_reward=prev_rewards[agent_id], policy_id=policy_id) agent_states[agent_id] = p_state print(agent_id, a_action) else: a_action = agent.compute_action( a_obs, prev_action=prev_actions[agent_id], prev_reward=prev_rewards[agent_id], policy_id=policy_id) a_action = _flatten_action(a_action) # tuple actions action_dict[agent_id] = a_action prev_actions[agent_id] = a_action print(agent_id, a_action) action = action_dict action = action if multiagent else action[_DUMMY_AGENT_ID] # print(action) next_obs, reward, done, _ = env.step(action) if multiagent: for agent_id, r in reward.items(): prev_rewards[agent_id] = r per_agent_rew[agent_id] += r else: prev_rewards[_DUMMY_AGENT_ID] = reward if multiagent: done = done["__all__"] reward_total += sum(reward.values()) else: reward_total += reward k += 1 obs = next_obs vehicles = env.unwrapped.k.vehicle outflow = vehicles.get_outflow_rate(500) final_outflows.append(outflow) inflow = vehicles.get_inflow_rate(500) final_inflows.append(inflow) outflow_arr.append([inflow_rate, outflow, outflow/inflow_rate]) mean_speed.append(np.mean(vel)) std_speed.append(np.std(vel)) mean_rewards.append([inflow, np.mean(list(per_agent_rew.values()))]) return [outflow_arr, velocity_arr, mean_speed, 
std_speed, mean_rewards]
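A hypothetical driver for `run_bottleneck()` above, sweeping inflow rates and recording the mean outflow at each (the range bounds are placeholders):

import numpy as np

sweep = {}
for inflow_rate in range(400, 2500, 100):
    outflow_arr, velocity_arr, mean_speed, std_speed, mean_rewards = \
        run_bottleneck(args, inflow_rate, num_trials=5)
    # outflow_arr rows are [inflow_rate, outflow, outflow / inflow_rate]
    sweep[inflow_rate] = np.mean([row[1] for row in outflow_arr])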
def _process_observations(base_env, policies, batch_builder_pool,
                          active_episodes, unfiltered_obs, rewards, dones,
                          infos, off_policy_actions, horizon, preprocessors,
                          obs_filters, unroll_length, pack, callbacks):
    """Record new data from the environment and prepare for policy evaluation.

    Returns:
        active_envs: set of non-terminated env ids
        to_eval: map of policy_id to list of agent PolicyEvalData
        outputs: list of metrics and samples to return from the sampler
    """

    active_envs = set()
    to_eval = defaultdict(list)
    outputs = []

    # For each environment
    for env_id, agent_obs in unfiltered_obs.items():
        new_episode = env_id not in active_episodes
        episode = active_episodes[env_id]
        if not new_episode:
            episode.length += 1
            episode.batch_builder.count += 1
            episode._add_agent_rewards(rewards[env_id])

        if (episode.batch_builder.total() > max(1000, unroll_length * 10)
                and log_once("large_batch_warning")):
            logger.warning(
                "More than {} observations for {} env steps ".format(
                    episode.batch_builder.total(),
                    episode.batch_builder.count) + "are buffered in "
                "the sampler. If this is more than you expected, check that "
                "you set a horizon on your environment correctly. Note "
                "that in multi-agent environments, `sample_batch_size` sets "
                "the batch size based on environment steps, not the steps of "
                "individual agents, which can result in unexpectedly large "
                "batches.")

        # Check episode termination conditions
        if dones[env_id]["__all__"] or episode.length >= horizon:
            all_done = True
            atari_metrics = _fetch_atari_metrics(base_env)
            if atari_metrics is not None:
                for m in atari_metrics:
                    outputs.append(
                        m._replace(custom_metrics=episode.custom_metrics))
            else:
                outputs.append(
                    RolloutMetrics(episode.length, episode.total_reward,
                                   dict(episode.agent_rewards),
                                   episode.custom_metrics))
        else:
            all_done = False
            active_envs.add(env_id)

        # For each agent in the environment
        for agent_id, raw_obs in agent_obs.items():
            policy_id = episode.policy_for(agent_id)
            prep_obs = _get_or_raise(preprocessors,
                                     policy_id).transform(raw_obs)
            if log_once("prep_obs"):
                logger.info("Preprocessed obs: {}".format(
                    summarize(prep_obs)))

            filtered_obs = _get_or_raise(obs_filters, policy_id)(prep_obs)
            if log_once("filtered_obs"):
                logger.info("Filtered obs: {}".format(
                    summarize(filtered_obs)))

            agent_done = bool(all_done or dones[env_id].get(agent_id))
            if not agent_done:
                to_eval[policy_id].append(
                    PolicyEvalData(env_id, agent_id, filtered_obs,
                                   infos[env_id].get(agent_id, {}),
                                   episode.rnn_state_for(agent_id),
                                   episode.last_action_for(agent_id),
                                   rewards[env_id][agent_id] or 0.0))

            last_observation = episode.last_observation_for(agent_id)
            episode._set_last_observation(agent_id, filtered_obs)
            episode._set_last_raw_obs(agent_id, raw_obs)
            episode._set_last_info(agent_id, infos[env_id].get(agent_id, {}))

            # Record transition info if applicable
            if (last_observation is not None and infos[env_id].get(
                    agent_id, {}).get("training_enabled", True)):
                episode.batch_builder.add_values(
                    agent_id,
                    policy_id,
                    t=episode.length - 1,
                    eps_id=episode.episode_id,
                    agent_index=episode._agent_index(agent_id),
                    obs=last_observation,
                    actions=episode.last_action_for(agent_id),
                    rewards=rewards[env_id][agent_id],
                    prev_actions=episode.prev_action_for(agent_id),
                    prev_rewards=episode.prev_reward_for(agent_id),
                    dones=agent_done,
                    infos=infos[env_id].get(agent_id, {}),
                    new_obs=filtered_obs,
                    **episode.last_pi_info_for(agent_id))

        # Invoke the step callback after the step is logged to the episode
        if callbacks.get("on_episode_step"):
            callbacks["on_episode_step"]({
                "env": base_env,
                "episode": episode
            })

        # Cut the batch if we're not packing multiple episodes into one,
        # or if we've exceeded the requested batch size.
        if episode.batch_builder.has_pending_data():
            if dones[env_id]["__all__"]:
                episode.batch_builder.check_missing_dones()
            if (all_done and not pack) or \
                    episode.batch_builder.count >= unroll_length:
                outputs.append(episode.batch_builder.build_and_reset(episode))
            elif all_done:
                # Make sure postprocessor stays within one episode
                episode.batch_builder.postprocess_batch_so_far(episode)

        if all_done:
            # Handle episode termination
            batch_builder_pool.append(episode.batch_builder)
            if callbacks.get("on_episode_end"):
                callbacks["on_episode_end"]({
                    "env": base_env,
                    "policy": policies,
                    "episode": episode
                })
            del active_episodes[env_id]
            resetted_obs = base_env.try_reset(env_id)
            if resetted_obs is None:
                # Reset not supported, drop this env from the ready list
                if horizon != float("inf"):
                    raise ValueError(
                        "Setting episode horizon requires reset() support "
                        "from the environment.")
            else:
                # Creates a new episode
                episode = active_episodes[env_id]
                for agent_id, raw_obs in resetted_obs.items():
                    policy_id = episode.policy_for(agent_id)
                    policy = _get_or_raise(policies, policy_id)
                    prep_obs = _get_or_raise(preprocessors,
                                             policy_id).transform(raw_obs)
                    filtered_obs = _get_or_raise(obs_filters,
                                                 policy_id)(prep_obs)
                    episode._set_last_observation(agent_id, filtered_obs)
                    to_eval[policy_id].append(
                        PolicyEvalData(
                            env_id, agent_id, filtered_obs,
                            episode.last_info_for(agent_id) or {},
                            episode.rnn_state_for(agent_id),
                            np.zeros_like(
                                _flatten_action(
                                    policy.action_space.sample())), 0.0))

    return active_envs, to_eval, outputs
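# For reference, the PolicyEvalData records queued into `to_eval` above carry
# seven positional fields. The namedtuple below is a reconstruction inferred
# from the call sites in _process_observations (the real definition lives
# elsewhere in the sampler module), and the batching snippet is a minimal
# sketch of how a caller might group the records for a single forward pass.
import numpy as np
from collections import namedtuple

PolicyEvalData = namedtuple("PolicyEvalData", [
    "env_id", "agent_id", "obs", "info", "rnn_state", "prev_action",
    "prev_reward"
])

# Group pending records per policy and stack observations for batched
# inference (dummy shapes for illustration):
eval_data = [
    PolicyEvalData(0, "agent_0", np.zeros(4), {}, [], np.zeros(2), 0.0),
    PolicyEvalData(1, "agent_0", np.ones(4), {}, [], np.zeros(2), 0.0),
]
obs_batch = np.stack([d.obs for d in eval_data])  # shape (2, 4)
state_batches = [d.rnn_state for d in eval_data]  # empty unless LSTM policies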
def rollout(agent,
            env_name,
            num_steps,
            num_episodes=0,
            video_dir=None,
            config=None,
            level_seed=None):
    policy_agent_mapping = default_policy_agent_mapping

    env, multiagent, policy_map, use_lstm, state_init = get_env(
        agent,
        env_name,
        config,
        level_seed=0 if level_seed is None else level_seed,
    )
    action_init = {
        p: _flatten_action(m.action_space.sample())
        for p, m in policy_map.items()
    }

    vis_info = collections.defaultdict(list)
    steps = 0
    episodes = 0
    all_ep_total_reward = 0
    seeds = []
    while keep_going(steps, num_steps, episodes, num_episodes):
        mapping_cache = {}  # in case policy_agent_mapping is stochastic
        # Rebuild the env so each episode gets a fresh level seed unless a
        # fixed seed was requested
        env, multiagent, policy_map, use_lstm, _ = get_env(
            agent,
            env_name,
            config,
            level_seed=episodes if level_seed is None else level_seed,
        )
        obs = env.reset()
        agent_states = DefaultMapping(
            lambda agent_id: state_init[mapping_cache[agent_id]])
        prev_actions = DefaultMapping(
            lambda agent_id: action_init[mapping_cache[agent_id]])
        prev_rewards = collections.defaultdict(lambda: 0.)
        done = False
        reward_total = 0.0
        episode_steps = 0
        while not done and keep_going(steps, num_steps, episodes,
                                      num_episodes):
            multi_obs = obs if multiagent else {_DUMMY_AGENT_ID: obs}
            action_dict = {}
            for agent_id, a_obs in multi_obs.items():
                if a_obs is not None:
                    policy_id = mapping_cache.setdefault(
                        agent_id, policy_agent_mapping(agent_id))
                    p_use_lstm = use_lstm[policy_id]
                    if p_use_lstm:
                        a_action, p_state, _ = agent.compute_action(
                            a_obs,
                            state=agent_states[agent_id],
                            prev_action=prev_actions[agent_id],
                            prev_reward=prev_rewards[agent_id],
                            policy_id=policy_id)
                        agent_states[agent_id] = p_state
                    else:
                        a_action = agent.compute_action(
                            a_obs,
                            prev_action=prev_actions[agent_id],
                            prev_reward=prev_rewards[agent_id],
                            policy_id=policy_id)
                    a_action = _flatten_action(a_action)  # tuple actions
                    action_dict[agent_id] = a_action
                    prev_actions[agent_id] = a_action
            action = action_dict
            action = action if multiagent else action[_DUMMY_AGENT_ID]
            next_obs, reward, done, info = env.step(action)
            if done:
                seeds.append(info["level_seed"])
                print(seeds)
            # Collect visualization info when the model exposes object masks
            if hasattr(agent.workers.local_worker().get_policy().model,
                       "object_masks"):
                obj_masks = agent.workers.local_worker().get_policy(
                ).model.object_masks().cpu().numpy()
                vis_info["obj_masks"].append(obj_masks)
                vis_info["obs"].append(obs)
                vis_info["reward"].append(reward)
            episode_steps += 1
            if multiagent:
                for agent_id, r in reward.items():
                    prev_rewards[agent_id] = r
            else:
                prev_rewards[_DUMMY_AGENT_ID] = reward

            if multiagent:
                done = done["__all__"]
                reward_total += sum(reward.values())
            else:
                reward_total += reward
            env.render()
            steps += 1
            obs = next_obs
        print("Episode #{}: reward: {} steps: {}".format(
            episodes, reward_total, episode_steps))
        all_ep_total_reward += reward_total
        if done:
            episodes += 1
    # Guard against division by zero when no episode ran to completion
    if episodes > 0:
        print(f"Average episode reward: "
              f"{all_ep_total_reward / episodes:.4f}")
    return vis_info
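# A hedged usage sketch for the rollout function above: restore a trained
# RLlib agent and roll it out on a Procgen level. The env name, config, and
# checkpoint path below are placeholders, not values taken from this
# codebase; the config should mirror the one used during training.
import ray
from ray.rllib.agents.ppo import PPOTrainer

ray.init()

env_name = "procgen:procgen-coinrun-v0"  # placeholder env id
config = {"num_workers": 0}              # assumed to mirror training config

agent = PPOTrainer(config=config, env=env_name)
agent.restore("/path/to/checkpoint/checkpoint-100")  # hypothetical path

# Collect up to 5 episodes (capped at 10000 env steps) of visualization
# info: observations, rewards, and object masks when available.
vis_info = rollout(agent, env_name, num_steps=10000, num_episodes=5,
                   config=config)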