Example no. 1
    def policy(self) -> Policy:
        """
        Generates (and caches) the push policy from the loaded model, based on
        the given environment and client configuration.
        """
        # check if the policy has been cached
        if self._policy is not None:
            return self._policy
        # create the environment that the agent acts in
        env = Environment(self.config)
        # keep querying the agent until the policy is complete
        obs, action, reward, completed = (
            env.observation, env.action_space.sample(), 0, False)
        # initialize LSTM state if applicable
        policy_map = (self.agent.workers.local_worker().policy_map
                      if hasattr(self.agent, "workers") else None)
        state = (policy_map[DEFAULT_POLICY_ID].get_initial_state()
                 if policy_map else [])
        use_lstm = len(state) > 0

        while not completed:
            # query the agent for an action
            if use_lstm:
                action, state, _ = self.agent.compute_action(
                    obs, state=state, prev_action=action, prev_reward=reward)
            else:
                action = self.agent.compute_action(obs,
                                                   prev_action=action,
                                                   prev_reward=reward)
            action = _flatten_action(action)
            # apply the action and step the environment
            obs, reward, completed, _ = env.step(action)
        self._policy = env.policy
        return self._policy
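Every example in this collection calls a `_flatten_action` helper that is imported from RLlib's rollout utilities rather than defined here. A minimal sketch of what such a helper typically does, assuming tuple/list actions are simply concatenated into one flat array (the name `_flatten_action_sketch` and the exact behavior are assumptions, not the original implementation):

import numpy as np

def _flatten_action_sketch(action):
    # Assumed stand-in for RLlib's _flatten_action: tuple/list actions are
    # concatenated into a single flat numpy array; scalars and arrays pass
    # through unchanged.
    if isinstance(action, (list, tuple)):
        return np.concatenate([np.asarray(a).ravel() for a in action])
    return action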
Example no. 2
        def __init__(self, observation_space, action_space, config):
            assert tf.executing_eagerly()
            Policy.__init__(self, observation_space, action_space, config)
            self._is_training = False
            self._loss_initialized = False
            self._sess = None

            if get_default_config:
                config = dict(get_default_config(), **config)

            if before_init:
                before_init(self, observation_space, action_space, config)

            self.config = config

            if action_sampler_fn:
                if not make_model:
                    raise ValueError(
                        "make_model is required if action_sampler_fn is given")
                self.dist_class = None
            else:
                self.dist_class, logit_dim = ModelCatalog.get_action_dist(
                    action_space, self.config["model"])

            if make_model:
                self.model = make_model(self, observation_space, action_space,
                                        config)
            else:
                self.model = ModelCatalog.get_model_v2(
                    observation_space,
                    action_space,
                    logit_dim,
                    config["model"],
                    framework="tf",
                )

            # run a dummy forward pass (batch size 1) so the model variables
            # are created before the loss is initialized below
            self.model({
                SampleBatch.CUR_OBS: tf.convert_to_tensor(
                    np.array([observation_space.sample()])),
                SampleBatch.PREV_ACTIONS: tf.convert_to_tensor(
                    [_flatten_action(action_space.sample())]),
                SampleBatch.PREV_REWARDS: tf.convert_to_tensor([0.]),
            }, [
                tf.convert_to_tensor([s])
                for s in self.model.get_initial_state()
            ], tf.convert_to_tensor([1]))

            if before_loss_init:
                before_loss_init(self, observation_space, action_space, config)

            self._initialize_loss_with_dummy_batch()
            self._loss_initialized = True

            if optimizer_fn:
                self._optimizer = optimizer_fn(self, config)
            else:
                self._optimizer = tf.train.AdamOptimizer(config["lr"])

            if after_init:
                after_init(self, observation_space, action_space, config)
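The constructor above relies on several optional hooks (get_default_config, before_init, make_model, action_sampler_fn, optimizer_fn, before_loss_init, after_init) that are captured from an enclosing policy-builder scope and are not shown in this excerpt. A hedged sketch of the signatures implied by the call sites, with illustrative bodies only (function names here are hypothetical):

import tensorflow as tf
from ray.rllib.models import ModelCatalog

def example_make_model(policy, obs_space, action_space, config):
    # must return a ModelV2-compatible model; defer to the catalog here,
    # mirroring the default branch of the constructor above
    _, logit_dim = ModelCatalog.get_action_dist(action_space, config["model"])
    return ModelCatalog.get_model_v2(
        obs_space, action_space, logit_dim, config["model"], framework="tf")

def example_optimizer_fn(policy, config):
    # any TF optimizer works; the default branch above uses AdamOptimizer
    return tf.train.AdamOptimizer(config["lr"])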
Example no. 3
def rollout(agent,
            env_name,
            num_steps,
            num_episodes=0,
            saver=None,
            no_render=True,
            video_dir=None):
    # Adapted from https://github.com/AIcrowd/neurips2020-procgen-starter-kit/blob/master/rollout.py#L349
    policy_agent_mapping = default_policy_agent_mapping

    if saver is None:
        saver = RolloutSaver()

    if hasattr(agent, "workers") and isinstance(agent.workers, WorkerSet):
        #env = agent.workers.local_worker().env
        env = gym.make(env_name, render_mode="rgb_array")
        multiagent = isinstance(env, MultiAgentEnv)
        if agent.workers.local_worker().multiagent:
            policy_agent_mapping = agent.config["multiagent"][
                "policy_mapping_fn"]

        policy_map = agent.workers.local_worker().policy_map
        state_init = {p: m.get_initial_state() for p, m in policy_map.items()}
        use_lstm = {p: len(s) > 0 for p, s in state_init.items()}
    else:
        env = gym.make(env_name)
        multiagent = False
        try:
            policy_map = {DEFAULT_POLICY_ID: agent.policy}
        except AttributeError:
            raise AttributeError(
                "Agent ({}) does not have a `policy` property! This is needed "
                "for performing (trained) agent rollouts.".format(agent))
        use_lstm = {DEFAULT_POLICY_ID: False}

    action_init = {
        p: _flatten_action(m.action_space.sample())
        for p, m in policy_map.items()
    }

    steps = 0
    episodes = 0
    rgb_array = []

    while keep_going(steps, num_steps, episodes, num_episodes):
        mapping_cache = {}  # in case policy_agent_mapping is stochastic
        saver.begin_rollout()
        obs = env.reset()
        agent_states = DefaultMapping(
            lambda agent_id: state_init[mapping_cache[agent_id]])
        prev_actions = DefaultMapping(
            lambda agent_id: action_init[mapping_cache[agent_id]])
        prev_rewards = collections.defaultdict(lambda: 0.)
        done = False
        reward_total = 0.0
        episode_steps = 0
        while not done and keep_going(steps, num_steps, episodes,
                                      num_episodes):
            multi_obs = obs if multiagent else {_DUMMY_AGENT_ID: obs}
            action_dict = {}
            for agent_id, a_obs in multi_obs.items():
                if a_obs is not None:
                    policy_id = mapping_cache.setdefault(
                        agent_id, policy_agent_mapping(agent_id))
                    p_use_lstm = use_lstm[policy_id]
                    if p_use_lstm:
                        a_action, p_state, _ = agent.compute_action(
                            a_obs,
                            state=agent_states[agent_id],
                            prev_action=prev_actions[agent_id],
                            prev_reward=prev_rewards[agent_id],
                            policy_id=policy_id)
                        agent_states[agent_id] = p_state
                    else:
                        a_action = agent.compute_action(
                            a_obs,
                            prev_action=prev_actions[agent_id],
                            prev_reward=prev_rewards[agent_id],
                            policy_id=policy_id)
                    a_action = _flatten_action(a_action)  # tuple actions
                    action_dict[agent_id] = a_action
                    prev_actions[agent_id] = a_action
            action = action_dict

            action = action if multiagent else action[_DUMMY_AGENT_ID]
            next_obs, reward, done, info = env.step(action)
            episode_steps += 1
            if multiagent:
                for agent_id, r in reward.items():
                    prev_rewards[agent_id] = r
            else:
                prev_rewards[_DUMMY_AGENT_ID] = reward

            if multiagent:
                done = done["__all__"]
                reward_total += sum(reward.values())
            else:
                reward_total += reward
            if not no_render:
                rgb_array.append(env.render(mode='rgb_array'))
            saver.append_step(obs, action, next_obs, reward, done, info)
            steps += 1
            obs = next_obs
        saver.end_rollout()
        print("Episode #{}: reward: {} steps: {}".format(
            episodes, reward_total, episode_steps))
        if done:
            episodes += 1
    return rgb_array
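The rollout above also uses two helpers, DefaultMapping and keep_going, that come from RLlib's rollout script and are not reproduced here. A rough reconstruction, under the assumption that num_steps and num_episodes equal to 0 mean "no limit":

import collections

class DefaultMapping(collections.defaultdict):
    """defaultdict variant whose default_factory receives the missing key."""

    def __missing__(self, key):
        self[key] = value = self.default_factory(key)
        return value

def keep_going(steps, num_steps, episodes, num_episodes):
    # Assumed behavior: keep rolling out until either limit is hit;
    # a limit of 0 disables that check.
    if num_episodes and episodes >= num_episodes:
        return False
    if num_steps and steps >= num_steps:
        return False
    return True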
Example no. 4
def visualize_adversaries(config_out_dir, checkpoint_num, grid_size,
                          num_rollouts, outdir, plot_base_case, extension):

    agent_list = []
    index = 0
    # max_index = 20000
    multiagent = True
    for (dirpath, dirnames, filenames) in os.walk(config_out_dir):
        if "params.pkl" in filenames:
            # if index > max_index:
            #     break
            rllib_config, checkpoint = get_config_from_path(
                dirpath, checkpoint_num)
            env, agent, multiagent, use_lstm, policy_agent_mapping, state_init, action_init = \
                instantiate_rollout(rllib_config, checkpoint)
            agent_list.append(agent)
            # index += 1

    # figure out how many adversaries you have and initialize their grids
    num_adversaries = env.num_adv_strengths * env.advs_per_strength
    if plot_base_case:
        policy_correlation_grid = np.zeros(
            (len(agent_list), len(agent_list) + 1))
    else:
        policy_correlation_grid = np.zeros((len(agent_list), len(agent_list)))

    if plot_base_case:
        adversary_loop_index = len(agent_list) + 1
    else:
        adversary_loop_index = len(agent_list)

    for agent_row_index in range(len(agent_list)):
        print('Outer index {}'.format(agent_row_index))
        for adversary_col_index in range(adversary_loop_index):
            print('Inner index {}'.format(adversary_col_index))
            reward_total = 0.0
            for adversary_index in range(num_adversaries):
                env.curr_adversary = adversary_index
                # turn the adversaries off for the last column
                if adversary_col_index == len(agent_list):
                    env.curr_adversary = -1
                # actually do the rollouts
                for r_itr in range(num_rollouts):
                    print('On iteration {}'.format(r_itr))
                    mapping_cache = {
                    }  # in case policy_agent_mapping is stochastic
                    agent_states = DefaultMapping(
                        lambda agent_id: state_init[mapping_cache[agent_id]])
                    prev_actions = DefaultMapping(
                        lambda agent_id: action_init[mapping_cache[agent_id]])
                    obs = env.reset()
                    prev_rewards = collections.defaultdict(lambda: 0.)
                    done = False
                    step_num = 0
                    while not done:
                        multi_obs = obs if multiagent else {
                            _DUMMY_AGENT_ID: obs
                        }
                        new_obs = {'agent': multi_obs['agent']}
                        # turn the adversaries off for the last column
                        if adversary_col_index < len(agent_list):
                            new_obs.update({
                                'adversary{}'.format(adversary_index):
                                multi_obs['adversary{}'.format(
                                    adversary_index)]
                            })
                        action_dict = {}
                        for agent_id, a_obs in new_obs.items():
                            if 'agent' in agent_id:
                                if a_obs is not None:
                                    policy_id = mapping_cache.setdefault(
                                        agent_id,
                                        policy_agent_mapping(agent_id))
                                    p_use_lstm = use_lstm[policy_id]
                                    if p_use_lstm:
                                        prev_action = _flatten_action(
                                            prev_actions[agent_id])
                                        a_action, p_state, _ = agent_list[
                                            agent_row_index].compute_action(
                                                a_obs,
                                                state=agent_states[agent_id],
                                                prev_action=prev_action,
                                                prev_reward=prev_rewards[
                                                    agent_id],
                                                policy_id=policy_id)
                                        agent_states[agent_id] = p_state
                                    else:
                                        prev_action = _flatten_action(
                                            prev_actions[agent_id])
                                        flat_action = _flatten_action(a_obs)
                                        a_action = agent_list[
                                            agent_row_index].compute_action(
                                                flat_action,
                                                prev_action=prev_action,
                                                prev_reward=prev_rewards[
                                                    agent_id],
                                                policy_id=policy_id)
                            else:
                                if a_obs is not None:
                                    policy_id = mapping_cache.setdefault(
                                        agent_id,
                                        policy_agent_mapping(agent_id))
                                    p_use_lstm = use_lstm[policy_id]
                                    if p_use_lstm:
                                        prev_action = _flatten_action(
                                            prev_actions[agent_id])
                                        a_action, p_state, _ = agent_list[
                                            adversary_col_index].compute_action(
                                                a_obs,
                                                state=agent_states[agent_id],
                                                prev_action=prev_action,
                                                prev_reward=prev_rewards[
                                                    agent_id],
                                                policy_id=policy_id)
                                        agent_states[agent_id] = p_state
                                    else:
                                        prev_action = _flatten_action(
                                            prev_actions[agent_id])
                                        flat_action = _flatten_action(a_obs)
                                        a_action = agent_list[
                                            adversary_col_index].compute_action(
                                                flat_action,
                                                prev_action=prev_action,
                                                prev_reward=prev_rewards[
                                                    agent_id],
                                                policy_id=policy_id)

                            action_dict[agent_id] = a_action
                            prev_action = _flatten_action(
                                a_action)  # tuple actions
                            prev_actions[agent_id] = prev_action

                        action = action_dict

                        action = action if multiagent else action[
                            _DUMMY_AGENT_ID]

                        next_obs, reward, done, info = env.step(action)
                        if isinstance(done, dict):
                            done = done['__all__']
                        step_num += 1
                        if multiagent:
                            for agent_id, r in reward.items():
                                prev_rewards[agent_id] = r
                        else:
                            prev_rewards[_DUMMY_AGENT_ID] = reward

                        # we only want the robot reward, not the adversary reward
                        reward_total += info['agent']['agent_reward']
                        obs = next_obs

            policy_correlation_grid[agent_row_index,
                                    adversary_col_index] = reward_total / (
                                        num_rollouts * num_adversaries)

    file_path = os.path.dirname(os.path.abspath(__file__))
    output_file_path = os.path.join(file_path, outdir)
    if not os.path.exists(output_file_path):
        try:
            os.makedirs(output_file_path)
        except OSError as exc:
            if exc.errno != errno.EEXIST:
                raise

    # increasing the row index implies moving down on the y axis
    plt.imshow(policy_correlation_grid,
               interpolation='nearest',
               cmap='seismic',
               aspect='equal',
               vmin=400,
               vmax=3600)
    plt.colorbar()
    fontsize = 14
    title_fontsize = 16
    plt.title('Policy Correlation Matrix', fontsize=title_fontsize)
    if plot_base_case:
        plt.yticks(ticks=np.arange(len(agent_list)))
        plt.xticks(ticks=np.arange(len(agent_list) + 1),
                   labels=[str(i) for i in range(len(agent_list))] + ['base'])
    else:
        plt.yticks(ticks=np.arange(len(agent_list)))
        plt.xticks(ticks=np.arange(len(agent_list)))
    plt.ylabel('agent index', fontsize=fontsize)
    plt.xlabel('adversary index', fontsize=fontsize)
    output_str = '{}/{}'.format(
        os.path.abspath(os.path.expanduser(outdir)),
        'policy_correlation_map_{}.png'.format(extension))
    with open(
            '{}/{}'.format(os.path.abspath(os.path.expanduser(outdir)),
                           'results_{}'.format(extension)), 'wb') as file:
        np.savetxt(file, policy_correlation_grid)
    plt.tight_layout()
    plt.grid(False)
    plt.savefig(output_str)
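Since the correlation grid is written out with np.savetxt, it can be reloaded later, e.g. to re-plot without re-running the rollouts. A small sketch, assuming the same results_{extension} naming used above (the function name is hypothetical):

import os
import numpy as np

def load_policy_correlation_grid(outdir, extension):
    # Re-read the text matrix written by visualize_adversaries above.
    path = os.path.join(os.path.abspath(os.path.expanduser(outdir)),
                        'results_{}'.format(extension))
    return np.loadtxt(path)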
Example no. 5
def visualizer_rllib(args):
    """Visualizer for RLlib experiments.

    This function takes args (see function create_parser below for
    more detailed information on what information can be fed to this
    visualizer), and renders the experiment associated with it.
    """
    result_dir = args.result_dir if args.result_dir[-1] != '/' \
        else args.result_dir[:-1]

    config = get_rllib_pkl(result_dir)

    # Run on only one cpu for rendering purposes
    config['num_workers'] = 0

    flow_params = get_flow_params(config)

    # Determine agent and checkpoint
    config_run = config['env_config']['run'] if 'run' in config['env_config'] \
        else None
    if args.run and config_run:
        if args.run != config_run:
            print('visualizer_rllib.py: error: run argument ' +
                  '\'{}\' passed in '.format(args.run) +
                  'differs from the one stored in params.json ' +
                  '\'{}\''.format(config_run))
            sys.exit(1)
    if args.run:
        agent_cls = get_agent_class(args.run)
    elif config_run:
        agent_cls = get_agent_class(config_run)
    else:
        print('visualizer_rllib.py: error: could not find flow parameter '
              '\'run\' in params.json, '
              'add argument --run to provide the algorithm or model used '
              'to train the results\n e.g. '
              'python ./visualizer_rllib.py /tmp/ray/result_dir 1 --run PPO')
        sys.exit(1)

    sim_params = flow_params['sim']
    sim_params.restart_instance = True
    dir_path = os.path.dirname(os.path.realpath(__file__))
    emission_path = '{0}/test_time_rollout/'.format(dir_path)
    sim_params.emission_path = emission_path if args.gen_emission else None

    # pick your rendering mode
    if args.render_mode == 'sumo_web3d':
        sim_params.num_clients = 2
        sim_params.render = False
    elif args.render_mode == 'drgb':
        sim_params.render = 'drgb'
        sim_params.pxpm = 4
    elif args.render_mode == 'sumo_gui':
        sim_params.render = True
        print('NOTE: With render mode {}, an extra instance of the SUMO GUI '
              'will display before the GUI for visualizing the result. Click '
              'the green Play arrow to continue.'.format(args.render_mode))
    elif args.render_mode == 'no_render':
        sim_params.render = False
    if args.save_render:
        sim_params.render = 'drgb'
        sim_params.pxpm = 4
        sim_params.save_render = True

    # Start the environment with the gui turned on and a path for the
    # emission file
    env_params = flow_params['env']
    sim_params.restart_instance = False
    if args.evaluate:
        env_params.evaluate = True

    # lower the horizon if testing
    if args.horizon:
        config['horizon'] = args.horizon
        env_params.horizon = args.horizon

    # Create and register a gym+rllib env
    create_env, env_name = make_create_env(params=flow_params, version=0)
    register_env(env_name, create_env)

    # create the agent that will be used to compute the actions
    agent = agent_cls(env=env_name, config=config)
    checkpoint = result_dir + '/checkpoint_' + args.checkpoint_num
    checkpoint = checkpoint + '/checkpoint-' + args.checkpoint_num
    agent.restore(checkpoint)

    # Simulate and collect metrics
    final_outflows = []
    final_inflows = []
    mean_speed = []
    std_speed = []

    policy_agent_mapping = default_policy_agent_mapping
    if hasattr(agent, "workers"):
        env = agent.workers.local_worker().env
        multiagent = isinstance(env, MultiAgentEnv)
        if agent.workers.local_worker().multiagent:
            policy_agent_mapping = agent.config["multiagent"][
                "policy_mapping_fn"]

        policy_map = agent.workers.local_worker().policy_map
        state_init = {p: m.get_initial_state() for p, m in policy_map.items()}
        use_lstm = {p: len(s) > 0 for p, s in state_init.items()}
        action_init = {
            p: m.action_space.sample()
            for p, m in policy_map.items()
        }
    else:
        env = gym.make(env_name)
        multiagent = False
        use_lstm = {DEFAULT_POLICY_ID: False}

    steps = 0
    for i in range(args.num_rollouts):
        vel = []
        mapping_cache = {}  # in case policy_agent_mapping is stochastic
        reward_dict = {}
        obs = env.reset()
        agent_states = DefaultMapping(
            lambda agent_id: state_init[mapping_cache[agent_id]])
        prev_actions = DefaultMapping(
            lambda agent_id: action_init[mapping_cache[agent_id]])
        prev_rewards = collections.defaultdict(lambda: 0.)
        done = False
        reward_total = 0.0
        while not done and steps < (env_params.horizon or steps + 1):
            vehicles = env.unwrapped.k.vehicle
            vel.append(np.mean(vehicles.get_speed(vehicles.get_ids())))
            multi_obs = obs if multiagent else {_DUMMY_AGENT_ID: obs}
            action_dict = {}
            for agent_id, a_obs in multi_obs.items():
                if a_obs is not None:
                    policy_id = mapping_cache.setdefault(
                        agent_id, policy_agent_mapping(agent_id))
                    p_use_lstm = use_lstm[policy_id]
                    if p_use_lstm:
                        a_action, p_state, _ = agent.compute_action(
                            a_obs,
                            state=agent_states[agent_id],
                            prev_action=prev_actions[agent_id],
                            prev_reward=prev_rewards[agent_id],
                            policy_id=policy_id)
                        agent_states[agent_id] = p_state
                    else:
                        a_action = agent.compute_action(
                            a_obs,
                            prev_action=prev_actions[agent_id],
                            prev_reward=prev_rewards[agent_id],
                            policy_id=policy_id)
                    a_action = _flatten_action(a_action)  # tuple actions
                    action_dict[agent_id] = a_action
                    prev_actions[agent_id] = a_action
            action = action_dict

            action = action if multiagent else action[_DUMMY_AGENT_ID]
            next_obs, reward, done, _ = env.step(action)

            if multiagent:
                for agent_id, r in reward.items():
                    prev_rewards[agent_id] = r
            else:
                prev_rewards[_DUMMY_AGENT_ID] = reward

            if multiagent:
                done = done["__all__"]
                reward_total += sum(reward.values())
            else:
                reward_total += reward
            steps += 1
            obs = next_obs
        outflow = vehicles.get_outflow_rate(500)
        final_outflows.append(outflow)
        inflow = vehicles.get_inflow_rate(500)
        final_inflows.append(inflow)
        if np.all(np.array(final_inflows) > 1e-5):
            throughput_efficiency = [
                x / y for x, y in zip(final_outflows, final_inflows)
            ]
        else:
            throughput_efficiency = [0] * len(final_inflows)
        mean_speed.append(np.mean(vel))
        std_speed.append(np.std(vel))
        print("Episode reward", reward_total)

    print('==== Summary of results ====')
    print(mean_speed)
    # if multiagent:
    #     for agent_id, rew in rets.items():
    #         print('For agent', agent_id)
    #         print(rew)
    #         print('Average, std return: {}, {} for agent {}'.format(
    #             np.mean(rew), np.std(rew), agent_id))
    # else:
    #     print(rets)
    #     print('Average, std: {}, {}'.format(
    #         np.mean(rets), np.std(rets)))

    print("\nSpeed, mean (m/s): {}".format(mean_speed))
    print('Average, std: {}, {}'.format(np.mean(mean_speed),
                                        np.std(mean_speed)))
    print("\nSpeed, std (m/s): {}".format(std_speed))
    print('Average, std: {}, {}'.format(np.mean(std_speed), np.std(std_speed)))

    # Compute arrival rate of vehicles in the last 500 sec of the run
    print("\nOutflows (veh/hr): {}".format(final_outflows))
    print('Average, std: {}, {}'.format(np.mean(final_outflows),
                                        np.std(final_outflows)))

    # Compute departure rate of vehicles in the last 500 sec of the run
    print("Inflows (veh/hr): {}".format(final_inflows))
    print('Average, std: {}, {}'.format(np.mean(final_inflows),
                                        np.std(final_inflows)))

    # Compute throughput efficiency in the last 500 sec of the run
    print("Throughput efficiency (veh/hr): {}".format(throughput_efficiency))
    print('Average, std: {}, {}'.format(np.mean(throughput_efficiency),
                                        np.std(throughput_efficiency)))

    # terminate the environment
    env.unwrapped.terminate()

    # if prompted, convert the emission file into a csv file
    if args.gen_emission:
        time.sleep(0.1)

        dir_path = os.path.dirname(os.path.realpath(__file__))
        emission_filename = '{0}-emission.xml'.format(env.scenario.name)

        emission_path = \
            '{0}/test_time_rollout/{1}'.format(dir_path, emission_filename)

        emission_to_csv(emission_path)

    # if we wanted to save the render, here we create the movie
    if args.save_render:
        dirs = os.listdir(os.path.expanduser('~') + '/flow_rendering')
        # Ignore hidden files
        dirs = [d for d in dirs if d[0] != '.']
        dirs.sort(key=lambda date: datetime.strptime(date, "%Y-%m-%d-%H%M%S"))
        recent_dir = dirs[-1]
        # create the movie
        movie_dir = os.path.expanduser('~') + '/flow_rendering/' + recent_dir
        save_dir = os.path.expanduser('~') + '/flow_movies'
        if not os.path.exists(save_dir):
            os.mkdir(save_dir)
        os_cmd = "cd " + movie_dir + " && ffmpeg -i frame_%06d.png"
        os_cmd += " -pix_fmt yuv420p " + dirs[-1] + ".mp4"
        os_cmd += "&& cp " + dirs[-1] + ".mp4 " + save_dir + "/"
        os.system(os_cmd)
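The docstring above points at a create_parser helper that is not part of this excerpt. A minimal sketch of an argument parser covering the args fields the function actually reads (flag names mirror those accesses; all defaults are assumptions):

import argparse

def create_parser_sketch():
    # Hypothetical reconstruction from the args.* accesses in visualizer_rllib.
    parser = argparse.ArgumentParser(description='Visualize an RLlib result.')
    parser.add_argument('result_dir', type=str)
    parser.add_argument('checkpoint_num', type=str)
    parser.add_argument('--run', type=str, default=None)
    parser.add_argument('--num_rollouts', type=int, default=1)
    parser.add_argument('--gen_emission', action='store_true')
    parser.add_argument('--evaluate', action='store_true')
    parser.add_argument('--horizon', type=int, default=None)
    parser.add_argument('--render_mode', type=str, default='sumo_gui')
    parser.add_argument('--save_render', action='store_true')
    return parser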
Example no. 6
def sample_actions(rllib_config, checkpoint, num_samples, outdir):
    env, agent, multiagent, use_lstm, policy_agent_mapping, state_init, action_init = \
        instantiate_rollout(rllib_config, checkpoint)
    reset_env(env)

    # figure out how many adversaries you have and initialize their grids
    num_adversaries = env.num_adv_strengths * env.advs_per_strength
    adversary_grid_dict = {}
    for i in range(num_adversaries):
        adversary_str = 'adversary' + str(i)
        # record the actions sampled from each adversary policy
        adversary_grid_dict[adversary_str] = {
            'sampled_actions': np.zeros(
                (num_samples, env.adv_action_space.shape[0]))
        }
    agent_dict = {}
    agent_dict['agent'] = {
        'sampled_actions': np.zeros((num_samples, env.action_space.shape[0]))
    }

    mapping_cache = {}  # in case policy_agent_mapping is stochastic

    sample_idx = 0
    prev_actions = DefaultMapping(
        lambda agent_id: action_init[mapping_cache[agent_id]])
    prev_rewards = collections.defaultdict(lambda: 0.)
    while sample_idx < num_samples:
        obs = env.reset()['agent']
        done = False
        # stop mid-episode once enough samples have been collected
        while not done and sample_idx < num_samples:
            if env.kl_reward or (env.l2_reward and not env.l2_memory):
                multi_obs = {'adversary{}'.format(i): {"obs": obs, "is_active": np.array([1])} for i in range(num_adversaries)}
            else:
                multi_obs = {'adversary{}'.format(i): obs for i in range(num_adversaries)}
            multi_obs['agent'] = obs
            action_dict = {}
            for agent_id, a_obs in multi_obs.items():
                if a_obs is not None:
                    policy_id = mapping_cache.setdefault(
                        agent_id, policy_agent_mapping(agent_id))
                    p_use_lstm = use_lstm[policy_id]
                    if not p_use_lstm:
                        flat_obs = _flatten_action(a_obs)
                        a_action = agent.compute_action(
                            flat_obs,
                            prev_action=prev_actions[agent_id],
                            prev_reward=prev_rewards[agent_id],
                            policy_id=policy_id)
                        if agent_id != 'agent':
                            adversary_grid_dict[agent_id]['sampled_actions'][sample_idx] = a_action
                        else:
                            agent_dict['agent']['sampled_actions'][sample_idx] = a_action
                        action_dict[agent_id] = a_action
            new_dict = {}
            new_dict.update({'agent': action_dict['agent']})
            obs, reward, done, info = env.step(new_dict)
            sample_idx += 1

    file_path = os.path.dirname(os.path.abspath(__file__))
    output_file_path = os.path.join(file_path, outdir)
    if not os.path.exists(output_file_path):
        try:
            os.makedirs(output_file_path)
        except OSError as exc:
            if exc.errno != errno.EEXIST:
                raise

    # Plot the histogram of the actions
    for adversary, adv_dict in adversary_grid_dict.items():
        sampled_actions = adv_dict['sampled_actions']
        for action_idx in range(sampled_actions.shape[-1]):
            fig = plt.figure()
            plt.hist(sampled_actions[:, action_idx])
            output_str = '{}/{}'.format(outdir, adversary + 'action_{}_histogram.png'.format(action_idx))
            plt.xlabel('Action magnitude')
            plt.ylabel('Frequency')
            plt.title('Histograms of actions over {} sampled obs'.format(num_samples))
            plt.savefig(output_str)
            plt.close(fig)


        fig = plt.figure()
        plt.hist2d(sampled_actions[:, 0], sampled_actions[:, 1])
        output_str = '{}/{}'.format(outdir, adversary + 'action_2dhistogram.png')
        plt.xlabel('Action 1 magnitude')
        plt.ylabel('Action 2 magnitude')
        plt.title('Histograms of actions over {} sampled obs'.format(num_samples))
        plt.savefig(output_str)
        plt.close(fig)

    # avoid shadowing the trainer `agent` and the dict being iterated
    for agent_name, agent_data in agent_dict.items():
        sampled_actions = agent_data['sampled_actions']
        for action_idx in range(sampled_actions.shape[-1]):
            fig = plt.figure()
            plt.hist(sampled_actions[:, action_idx])
            output_str = '{}/{}'.format(outdir, agent_name + 'action_{}_histogram.png'.format(action_idx))
            plt.xlabel('Action magnitude')
            plt.ylabel('Frequency')
            plt.title('Histograms of actions over {} sampled obs'.format(num_samples))
            plt.savefig(output_str)
            plt.close(fig)

        fig = plt.figure()
        plt.hist2d(sampled_actions[:, 0], sampled_actions[:, 1])
        output_str = '{}/{}'.format(outdir, agent_name + 'action_2dhistogram.png')
        plt.xlabel('Action 1 magnitude')
        plt.ylabel('Action 2 magnitude')
        plt.title('Histograms of actions over {} sampled obs'.format(num_samples))
        plt.savefig(output_str)
        plt.close(fig)
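A hedged sketch of how sample_actions might be invoked from a results directory, reusing the get_config_from_path helper that Example no. 4 relies on (the directory and checkpoint number below are placeholders, not values from the original code):

if __name__ == '__main__':
    # placeholder experiment path and checkpoint number
    rllib_config, checkpoint = get_config_from_path(
        '/tmp/ray_results/my_experiment', '50')
    sample_actions(rllib_config, checkpoint, num_samples=1000,
                   outdir='action_histograms')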
Example no. 7
def _env_runner(async_vector_env,
                extra_batch_callback,
                policies,
                policy_mapping_fn,
                unroll_length,
                horizon,
                obs_filters,
                clip_rewards,
                pack,
                tf_sess=None):
    """This implements the common experience collection logic.

    Args:
        async_vector_env (AsyncVectorEnv): env implementing AsyncVectorEnv.
        extra_batch_callback (fn): function to send extra batch data to.
        policies (dict): Map of policy ids to PolicyGraph instances.
        policy_mapping_fn (func): Function that maps agent ids to policy ids.
            This is called when an agent first enters the environment. The
            agent is then "bound" to the returned policy for the episode.
        unroll_length (int): Number of episode steps before `SampleBatch` is
            yielded. Set to infinity to yield complete episodes.
        horizon (int): Horizon of the episode.
        obs_filters (dict): Map of policy id to filter used to process
            observations for the policy.
        clip_rewards (bool): Whether to clip rewards before postprocessing.
        pack (bool): Whether to pack multiple episodes into each batch. This
            guarantees batches will be exactly `unroll_length` in size.
        tf_sess (Session|None): Optional tensorflow session to use for batching
            TF policy evaluations.

    Yields:
        rollout (SampleBatch): Object containing state, action, reward,
            terminal condition, and other fields as dictated by `policy`.
    """

    try:
        if not horizon:
            horizon = (
                async_vector_env.get_unwrapped()[0].spec.max_episode_steps)
    except Exception:
        logger.warning("no episode horizon specified, assuming inf")
    if not horizon:
        horizon = float("inf")

    # Pool of batch builders, which can be shared across episodes to pack
    # trajectory data.
    batch_builder_pool = []

    def get_batch_builder():
        if batch_builder_pool:
            return batch_builder_pool.pop()
        else:
            return MultiAgentSampleBatchBuilder(policies, clip_rewards)

    def new_episode():
        return MultiAgentEpisode(policies, policy_mapping_fn,
                                 get_batch_builder, extra_batch_callback)

    active_episodes = defaultdict(new_episode)

    while True:
        # Get observations from all ready agents
        unfiltered_obs, rewards, dones, infos, off_policy_actions = \
            async_vector_env.poll()

        # Map of policy_id to list of PolicyEvalData
        to_eval = defaultdict(list)

        # Map of env_id -> agent_id -> action replies
        actions_to_send = defaultdict(dict)

        # For each environment
        for env_id, agent_obs in unfiltered_obs.items():
            new_episode = env_id not in active_episodes
            episode = active_episodes[env_id]
            if not new_episode:
                episode.length += 1
                episode.batch_builder.count += 1
                episode._add_agent_rewards(rewards[env_id])

            # Check episode termination conditions
            if dones[env_id]["__all__"] or episode.length >= horizon:
                all_done = True
                atari_metrics = _fetch_atari_metrics(async_vector_env)
                if atari_metrics is not None:
                    for m in atari_metrics:
                        yield m
                else:
                    yield RolloutMetrics(episode.length, episode.total_reward,
                                         dict(episode.agent_rewards))
            else:
                all_done = False
                # At least send an empty dict if not done
                actions_to_send[env_id] = {}

            # For each agent in the environment
            for agent_id, raw_obs in agent_obs.items():
                policy_id = episode.policy_for(agent_id)
                filtered_obs = _get_or_raise(obs_filters, policy_id)(raw_obs)
                agent_done = bool(all_done or dones[env_id].get(agent_id))
                if not agent_done:
                    to_eval[policy_id].append(
                        PolicyEvalData(env_id, agent_id, filtered_obs,
                                       episode.rnn_state_for(agent_id),
                                       episode.last_action_for(agent_id),
                                       rewards[env_id][agent_id] or 0.0))

                last_observation = episode.last_observation_for(agent_id)
                episode._set_last_observation(agent_id, filtered_obs)

                # Record transition info if applicable
                if last_observation is not None and \
                        infos[env_id][agent_id].get("training_enabled", True):
                    episode.batch_builder.add_values(
                        agent_id,
                        policy_id,
                        t=episode.length - 1,
                        eps_id=episode.episode_id,
                        obs=last_observation,
                        actions=episode.last_action_for(agent_id),
                        rewards=rewards[env_id][agent_id],
                        prev_actions=episode.prev_action_for(agent_id),
                        prev_rewards=episode.prev_reward_for(agent_id),
                        dones=agent_done,
                        infos=infos[env_id][agent_id],
                        new_obs=filtered_obs,
                        **episode.last_pi_info_for(agent_id))

            # Cut the batch if we're not packing multiple episodes into one,
            # or if we've exceeded the requested batch size.
            if episode.batch_builder.has_pending_data():
                if (all_done and not pack) or \
                        episode.batch_builder.count >= unroll_length:
                    yield episode.batch_builder.build_and_reset(episode)
                elif all_done:
                    # Make sure postprocessor stays within one episode
                    episode.batch_builder.postprocess_batch_so_far(episode)

            if all_done:
                # Handle episode termination
                batch_builder_pool.append(episode.batch_builder)
                del active_episodes[env_id]
                resetted_obs = async_vector_env.try_reset(env_id)
                if resetted_obs is None:
                    # Reset not supported, drop this env from the ready list
                    assert horizon == float("inf"), \
                        "Setting episode horizon requires reset() support."
                else:
                    # Creates a new episode
                    episode = active_episodes[env_id]
                    for agent_id, raw_obs in resetted_obs.items():
                        policy_id = episode.policy_for(agent_id)
                        policy = _get_or_raise(policies, policy_id)
                        filtered_obs = _get_or_raise(obs_filters,
                                                     policy_id)(raw_obs)
                        episode._set_last_observation(agent_id, filtered_obs)
                        to_eval[policy_id].append(
                            PolicyEvalData(
                                env_id, agent_id, filtered_obs,
                                episode.rnn_state_for(agent_id),
                                np.zeros_like(
                                    _flatten_action(
                                        policy.action_space.sample())), 0.0))

        # Batch eval policy actions if possible
        if tf_sess:
            builder = TFRunBuilder(tf_sess, "policy_eval")
            pending_fetches = {}
        else:
            builder = None
        eval_results = {}
        rnn_in_cols = {}
        for policy_id, eval_data in to_eval.items():
            rnn_in = _to_column_format([t.rnn_state for t in eval_data])
            rnn_in_cols[policy_id] = rnn_in
            policy = _get_or_raise(policies, policy_id)
            if builder and (policy.compute_actions.__code__ is
                            TFPolicyGraph.compute_actions.__code__):
                pending_fetches[policy_id] = policy.build_compute_actions(
                    builder, [t.obs for t in eval_data],
                    rnn_in,
                    prev_action_batch=[t.prev_action for t in eval_data],
                    prev_reward_batch=[t.prev_reward for t in eval_data],
                    is_training=True)
            else:
                eval_results[policy_id] = policy.compute_actions(
                    [t.obs for t in eval_data],
                    rnn_in,
                    prev_action_batch=[t.prev_action for t in eval_data],
                    prev_reward_batch=[t.prev_reward for t in eval_data],
                    is_training=True,
                    episodes=[active_episodes[t.env_id] for t in eval_data])
        if builder:
            for k, v in pending_fetches.items():
                eval_results[k] = builder.get(v)

        # Record the policy eval results
        for policy_id, eval_data in to_eval.items():
            actions, rnn_out_cols, pi_info_cols = eval_results[policy_id]
            if len(rnn_in_cols[policy_id]) != len(rnn_out_cols):
                raise ValueError(
                    "Length of RNN in did not match RNN out, got: "
                    "{} vs {}".format(rnn_in_cols[policy_id], rnn_out_cols))
            # Add RNN state info
            for f_i, column in enumerate(rnn_in_cols[policy_id]):
                pi_info_cols["state_in_{}".format(f_i)] = column
            for f_i, column in enumerate(rnn_out_cols):
                pi_info_cols["state_out_{}".format(f_i)] = column
            # Save output rows
            actions = _unbatch_tuple_actions(actions)
            for i, action in enumerate(actions):
                env_id = eval_data[i].env_id
                agent_id = eval_data[i].agent_id
                actions_to_send[env_id][agent_id] = action
                episode = active_episodes[env_id]
                episode._set_rnn_state(agent_id, [c[i] for c in rnn_out_cols])
                episode._set_last_pi_info(
                    agent_id, {k: v[i]
                               for k, v in pi_info_cols.items()})
                if env_id in off_policy_actions and \
                        agent_id in off_policy_actions[env_id]:
                    episode._set_last_action(
                        agent_id, off_policy_actions[env_id][agent_id])
                else:
                    episode._set_last_action(agent_id, action)

        # Return computed actions to ready envs. We also send to envs that have
        # taken off-policy actions; those envs are free to ignore the action.
        async_vector_env.send_actions(dict(actions_to_send))
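_env_runner is a generator: as episodes progress it yields RolloutMetrics entries and built sample batches. A rough sketch of a consumer, purely illustrative of that contract (the function name and max_items cutoff are assumptions):

def drain_env_runner_sketch(rollout_provider, max_items=100):
    # Hypothetical consumer: split yielded items into metrics and batches.
    metrics, batches = [], []
    for _ in range(max_items):
        item = next(rollout_provider)
        if isinstance(item, RolloutMetrics):
            metrics.append(item)
        else:
            batches.append(item)
    return metrics, batches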
Example no. 8
def rollout(agent,
            env_name,
            num_steps,
            num_episodes=0,
            saver=None,
            no_render=True,
            video_dir=None):
    policy_agent_mapping = default_policy_agent_mapping

    if saver is None:
        saver = RolloutSaver()

    if hasattr(agent, "workers") and isinstance(agent.workers, WorkerSet):
        env = agent.workers.local_worker().env
        multiagent = isinstance(env, MultiAgentEnv)
        if agent.workers.local_worker().multiagent:
            policy_agent_mapping = agent.config["multiagent"][
                "policy_mapping_fn"]

        policy_map = agent.workers.local_worker().policy_map
        state_init = {p: m.get_initial_state() for p, m in policy_map.items()}
        use_lstm = {p: len(s) > 0 for p, s in state_init.items()}
    else:
        env = gym.make(env_name)
        multiagent = False
        try:
            policy_map = {DEFAULT_POLICY_ID: agent.policy}
        except AttributeError:
            raise AttributeError(
                "Agent ({}) does not have a `policy` property! This is needed "
                "for performing (trained) agent rollouts.".format(agent))
        use_lstm = {DEFAULT_POLICY_ID: False}

    action_init = {
        p: _flatten_action(m.action_space.sample())
        for p, m in policy_map.items()
    }

    # If monitoring has been requested, manually wrap our environment with a
    # gym monitor, which is set to record every episode.
    if video_dir:
        env = gym.wrappers.Monitor(env=env,
                                   directory=video_dir,
                                   video_callable=lambda x: True,
                                   force=True)

    steps = 0
    episodes = 0
    while keep_going(steps, num_steps, episodes, num_episodes):
        mapping_cache = {}  # in case policy_agent_mapping is stochastic
        saver.begin_rollout()
        obs = env.reset()
        agent_states = DefaultMapping(
            lambda agent_id: state_init[mapping_cache[agent_id]])
        prev_actions = DefaultMapping(
            lambda agent_id: action_init[mapping_cache[agent_id]])
        prev_rewards = collections.defaultdict(lambda: 0.)
        done = False
        reward_total = 0.0
        episode_steps = 0
        while not done and keep_going(steps, num_steps, episodes,
                                      num_episodes):
            multi_obs = obs if multiagent else {_DUMMY_AGENT_ID: obs}
            action_dict = {}
            for agent_id, a_obs in multi_obs.items():
                if a_obs is not None:
                    policy_id = mapping_cache.setdefault(
                        agent_id, policy_agent_mapping(agent_id))
                    p_use_lstm = use_lstm[policy_id]
                    if p_use_lstm:
                        a_action, p_state, _ = agent.compute_action(
                            a_obs,
                            state=agent_states[agent_id],
                            prev_action=prev_actions[agent_id],
                            prev_reward=prev_rewards[agent_id],
                            policy_id=policy_id)
                        agent_states[agent_id] = p_state
                    else:
                        a_action = agent.compute_action(
                            a_obs,
                            prev_action=prev_actions[agent_id],
                            prev_reward=prev_rewards[agent_id],
                            policy_id=policy_id)
                    a_action = _flatten_action(a_action)  # tuple actions
                    action_dict[agent_id] = a_action
                    prev_actions[agent_id] = a_action
            action = action_dict

            action = action if multiagent else action[_DUMMY_AGENT_ID]
            next_obs, reward, done, info = env.step(action)
            episode_steps += 1
            if multiagent:
                for agent_id, r in reward.items():
                    prev_rewards[agent_id] = r
            else:
                prev_rewards[_DUMMY_AGENT_ID] = reward

            if multiagent:
                done = done["__all__"]
                reward_total += sum(reward.values())
            else:
                reward_total += reward
            if not no_render:
                env.render()
            saver.append_step(obs, action, next_obs, reward, done, info)
            steps += 1
            obs = next_obs
        saver.end_rollout()
        print("Episode #{}: reward: {} steps: {}".format(
            episodes, reward_total, episode_steps))
        if done:
            episodes += 1
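Unlike Example no. 3, this variant keeps the worker's own env and, when video_dir is given, wraps it in gym.wrappers.Monitor so recording is handled by gym rather than by collecting frames manually. A hedged usage sketch (trainer class, config, env name, and paths are placeholders):

from ray.rllib.agents.ppo import PPOTrainer  # assumes the pre-2.0 RLlib layout

# placeholder config, env name, and checkpoint path
agent = PPOTrainer(config={'num_workers': 0}, env='CartPole-v0')
agent.restore('/tmp/ray_results/my_experiment/checkpoint_50/checkpoint-50')
rollout(agent, 'CartPole-v0', num_steps=1000, num_episodes=5,
        saver=RolloutSaver(), no_render=True, video_dir='/tmp/rollout_videos')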
Example no. 9
def _process_observations(worker, base_env, policies, batch_builder_pool,
                          active_episodes, unfiltered_obs, rewards, dones,
                          infos, off_policy_actions, horizon, preprocessors,
                          obs_filters, rollout_fragment_length, pack,
                          callbacks, soft_horizon, no_done_at_end):
    """Record new data from the environment and prepare for policy evaluation.

    Returns:
        active_envs: set of non-terminated env ids
        to_eval: map of policy_id to list of agent PolicyEvalData
        outputs: list of metrics and samples to return from the sampler
    """

    active_envs = set()
    to_eval = defaultdict(list)
    outputs = []
    large_batch_threshold = max(1000, rollout_fragment_length * 10) if \
        rollout_fragment_length != float("inf") else 5000

    # For each environment
    for env_id, agent_obs in unfiltered_obs.items():
        new_episode = env_id not in active_episodes
        episode = active_episodes[env_id]
        if not new_episode:
            episode.length += 1
            episode.batch_builder.count += 1
            episode._add_agent_rewards(rewards[env_id])

        if (episode.batch_builder.total() > large_batch_threshold
                and log_once("large_batch_warning")):
            logger.warning(
                "More than {} observations for {} env steps ".format(
                    episode.batch_builder.total(),
                    episode.batch_builder.count) + "are buffered in "
                "the sampler. If this is more than you expected, check that "
                "that you set a horizon on your environment correctly and that"
                " it terminates at some point. "
                "Note: In multi-agent environments, `rollout_fragment_length` "
                "sets the batch size based on environment steps, not the "
                "steps of "
                "individual agents, which can result in unexpectedly large "
                "batches. Also, you may be in evaluation waiting for your Env "
                "to terminate (batch_mode=`complete_episodes`). Make sure it "
                "does at some point.")

        # Check episode termination conditions
        if dones[env_id]["__all__"] or episode.length >= horizon:
            hit_horizon = (episode.length >= horizon
                           and not dones[env_id]["__all__"])
            all_done = True
            atari_metrics = _fetch_atari_metrics(base_env)
            if atari_metrics is not None:
                for m in atari_metrics:
                    outputs.append(
                        m._replace(custom_metrics=episode.custom_metrics))
            else:
                outputs.append(
                    RolloutMetrics(episode.length, episode.total_reward,
                                   dict(episode.agent_rewards),
                                   episode.custom_metrics, {},
                                   episode.hist_data))
        else:
            hit_horizon = False
            all_done = False
            active_envs.add(env_id)

        # For each agent in the environment.
        for agent_id, raw_obs in agent_obs.items():
            policy_id = episode.policy_for(agent_id)
            prep_obs = _get_or_raise(preprocessors,
                                     policy_id).transform(raw_obs)
            if log_once("prep_obs"):
                logger.info("Preprocessed obs: {}".format(summarize(prep_obs)))

            filtered_obs = _get_or_raise(obs_filters, policy_id)(prep_obs)
            if log_once("filtered_obs"):
                logger.info("Filtered obs: {}".format(summarize(filtered_obs)))

            agent_done = bool(all_done or dones[env_id].get(agent_id))
            if not agent_done:
                to_eval[policy_id].append(
                    PolicyEvalData(env_id, agent_id, filtered_obs,
                                   infos[env_id].get(agent_id, {}),
                                   episode.rnn_state_for(agent_id),
                                   episode.last_action_for(agent_id),
                                   rewards[env_id][agent_id] or 0.0))

            last_observation = episode.last_observation_for(agent_id)
            episode._set_last_observation(agent_id, filtered_obs)
            episode._set_last_raw_obs(agent_id, raw_obs)
            episode._set_last_info(agent_id, infos[env_id].get(agent_id, {}))

            # Record transition info if applicable
            if (last_observation is not None and infos[env_id].get(
                    agent_id, {}).get("training_enabled", True)):
                episode.batch_builder.add_values(
                    agent_id,
                    policy_id,
                    t=episode.length - 1,
                    eps_id=episode.episode_id,
                    agent_index=episode._agent_index(agent_id),
                    obs=last_observation,
                    actions=episode.last_action_for(agent_id),
                    rewards=rewards[env_id][agent_id],
                    prev_actions=episode.prev_action_for(agent_id),
                    prev_rewards=episode.prev_reward_for(agent_id),
                    dones=(False if (no_done_at_end
                                     or (hit_horizon and soft_horizon)) else
                           agent_done),
                    infos=infos[env_id].get(agent_id, {}),
                    new_obs=filtered_obs,
                    **episode.last_pi_info_for(agent_id))

        # Invoke the step callback after the step is logged to the episode
        callbacks.on_episode_step(
            worker=worker, base_env=base_env, episode=episode)

        # Cut the batch if we're not packing multiple episodes into one,
        # or if we've exceeded the requested batch size.
        if episode.batch_builder.has_pending_agent_data():
            if dones[env_id]["__all__"] and not no_done_at_end:
                episode.batch_builder.check_missing_dones()
            if (all_done and not pack) or \
                    episode.batch_builder.count >= rollout_fragment_length:
                outputs.append(episode.batch_builder.build_and_reset(episode))
            elif all_done:
                # Make sure postprocessor stays within one episode
                episode.batch_builder.postprocess_batch_so_far(episode)

        if all_done:
            # Handle episode termination
            batch_builder_pool.append(episode.batch_builder)
            # Call each policy's Exploration.on_episode_end method.
            for p in policies.values():
                p.exploration.on_episode_end(
                    policy=p,
                    environment=base_env,
                    episode=episode,
                    tf_sess=getattr(p, "_sess", None))
            # Call custom on_episode_end callback.
            callbacks.on_episode_end(
                worker=worker,
                base_env=base_env,
                policies=policies,
                episode=episode)
            if hit_horizon and soft_horizon:
                episode.soft_reset()
                resetted_obs = agent_obs
            else:
                del active_episodes[env_id]
                resetted_obs = base_env.try_reset(env_id)
            if resetted_obs is None:
                # Reset not supported, drop this env from the ready list
                if horizon != float("inf"):
                    raise ValueError(
                        "Setting episode horizon requires reset() support "
                        "from the environment.")
            elif resetted_obs != ASYNC_RESET_RETURN:
                # Creates a new episode if this is not async return
                # If reset is async, we will get its result in some future poll
                episode = active_episodes[env_id]
                for agent_id, raw_obs in resetted_obs.items():
                    policy_id = episode.policy_for(agent_id)
                    policy = _get_or_raise(policies, policy_id)
                    prep_obs = _get_or_raise(preprocessors,
                                             policy_id).transform(raw_obs)
                    filtered_obs = _get_or_raise(obs_filters,
                                                 policy_id)(prep_obs)
                    episode._set_last_observation(agent_id, filtered_obs)
                    to_eval[policy_id].append(
                        PolicyEvalData(
                            env_id, agent_id, filtered_obs,
                            episode.last_info_for(agent_id) or {},
                            episode.rnn_state_for(agent_id),
                            np.zeros_like(
                                _flatten_action(policy.action_space.sample())),
                            0.0))

    return active_envs, to_eval, outputs
Esempio n. 10
0
def _process_observations(base_env, policies, batch_builder_pool,
                          active_episodes, unfiltered_obs, rewards, dones,
                          infos, off_policy_actions, horizon, preprocessors,
                          obs_filters, unroll_length, pack, callbacks,
                          soft_horizon, no_done_at_end):
    """Record new data from the environment and prepare for policy evaluation.

    Returns:
        active_envs: set of non-terminated env ids
        to_eval: map of policy_id to list of agent PolicyEvalData
        outputs: list of metrics and samples to return from the sampler
    """
    global i
    global tmp_dic
    global traffic_light_node_dict
    i += 1
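    # Reverse lookup into `inter_id_to_index`: given an intersection index,
    # `inter_num_2_id` below returns its intersection id (a key such as
    # "intersection_1_1").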

    def inter_num_2_id(num):
        return list(tmp_dic.keys())[list(tmp_dic.values()).index(num)]

    def read_traffic_light_node_dict():
        path_to_read = os.path.join(record_dir, 'traffic_light_node_dict.conf')
        with open(path_to_read, 'r') as f:
            traffic_light_node_dict = eval(f.read())
            print("Read traffic_light_node_dict")
            return traffic_light_node_dict

    if i <= 1:
        # Read the neighbor (adjacency) information from the config file on the first call only
        record_dir = base_env.envs[0].record_dir
        traffic_light_node_dict = base_env.envs[0].traffic_light_node_dict
        tmp_dic = traffic_light_node_dict['intersection_1_1'][
            'inter_id_to_index']

    active_envs = set()
    to_eval = defaultdict(list)
    outputs = []

    # For each environment
    for env_id, agent_obs in unfiltered_obs.items():
        new_episode = env_id not in active_episodes
        episode = active_episodes[env_id]
        if not new_episode:
            episode.length += 1
            episode.batch_builder.count += 1
            episode._add_agent_rewards(rewards[env_id])

        if (episode.batch_builder.total() > max(1000, unroll_length * 10)
                and log_once("large_batch_warning")):
            logger.warning(
                "More than {} observations for {} env steps ".format(
                    episode.batch_builder.total(),
                    episode.batch_builder.count) + "are buffered in "
                "the sampler. If this is more than you expected, check that "
                "that you set a horizon on your environment correctly. Note "
                "that in multi-agent environments, `sample_batch_size` sets "
                "the batch size based on environment steps, not the steps of "
                "individual agents, which can result in unexpectedly large "
                "batches.")

        # Check episode termination conditions
        if dones[env_id]["__all__"] or episode.length >= horizon:
            hit_horizon = (episode.length >= horizon
                           and not dones[env_id]["__all__"])
            all_done = True
            atari_metrics = _fetch_atari_metrics(base_env)
            if atari_metrics is not None:
                for m in atari_metrics:
                    outputs.append(
                        m._replace(custom_metrics=episode.custom_metrics))
            else:
                outputs.append(
                    RolloutMetrics(episode.length, episode.total_reward,
                                   dict(episode.agent_rewards),
                                   episode.custom_metrics, {}))
        else:
            hit_horizon = False
            all_done = False
            active_envs.add(env_id)

        # For each agent in the environment
        for agent_id, raw_obs in agent_obs.items():
            policy_id = episode.policy_for(agent_id)  # eg: "policy_0"
            # print(policy_id)
            prep_obs = _get_or_raise(preprocessors,
                                     policy_id).transform(raw_obs)
            if log_once("prep_obs"):
                logger.info("Preprocessed obs: {}".format(summarize(prep_obs)))

            filtered_obs = _get_or_raise(obs_filters, policy_id)(prep_obs)
            # NOTE: this performs the online Q-eval, so the neighbor_obs values
            # must be passed to the Q-eval network.
            # Using the road-network relations stored in traffic_light_node_dict,
            # find the neighbors of the current policy_id and store them as
            # policy ids of the form "policy_0".
            neighbor_pid_list = [
                'policy_{}'.format(pid_)
                for pid_ in traffic_light_node_dict[inter_num_2_id(
                    int(policy_id.split('_')[1]))]['adjacency_row']
                if pid_ is not None
            ]
            # print(neighbor_pid_list)
            neighbor_obs = []
            neighbor_obs.append([])

            # Size (1, 5, 15): only this layout can be fed into the neighbor_obs placeholder of shape (batch, 5, 15)
            for neighbor_id in neighbor_pid_list:
                neighbor_prep_obs = _get_or_raise(
                    preprocessors, neighbor_id).transform(raw_obs)
                neighbor_filtered_obs = _get_or_raise(
                    obs_filters, neighbor_id)(neighbor_prep_obs)
                neighbor_obs[0].append(neighbor_filtered_obs)
            neighbor_obs = np.array(neighbor_obs).reshape(
                (len(neighbor_pid_list), len(raw_obs)))  # (5, 29)
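            # NOTE: the reshape above assumes each filtered neighbor obs has the
            # same length as `raw_obs`, yielding a (num_neighbors, obs_dim) array
            # for the Q-eval network's neighbor_obs input. Each neighbor's
            # preprocessor/filter is applied to the current agent's `raw_obs`.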

            # ------------------------------------------------------------------
            if log_once("filtered_obs"):
                logger.info("Filtered obs: {}".format(summarize(filtered_obs)))

            agent_done = bool(all_done or dones[env_id].get(agent_id))
            if not agent_done:
                to_eval[policy_id].append(
                    PolicyEvalData(env_id, agent_id, filtered_obs,
                                   neighbor_obs,
                                   infos[env_id].get(agent_id, {}),
                                   episode.rnn_state_for(agent_id),
                                   episode.last_action_for(agent_id),
                                   rewards[env_id][agent_id] or 0.0))

            last_observation = episode.last_observation_for(agent_id)
            episode._set_last_observation(agent_id, filtered_obs)
            episode._set_last_raw_obs(agent_id, raw_obs)
            episode._set_last_info(agent_id, infos[env_id].get(agent_id, {}))

            # Record transition info if applicable
            if (last_observation is not None and infos[env_id].get(
                    agent_id, {}).get("training_enabled", True)):
                episode.batch_builder.add_values(
                    agent_id,
                    policy_id,
                    t=episode.length - 1,
                    eps_id=episode.episode_id,
                    agent_index=episode._agent_index(agent_id),
                    obs=last_observation,
                    actions=episode.last_action_for(agent_id),
                    rewards=rewards[env_id][agent_id],
                    prev_actions=episode.prev_action_for(agent_id),
                    prev_rewards=episode.prev_reward_for(agent_id),
                    dones=(False if
                           (no_done_at_end or
                            (hit_horizon and soft_horizon)) else agent_done),
                    infos=infos[env_id].get(agent_id, {}),
                    new_obs=filtered_obs,
                    **episode.last_pi_info_for(agent_id))

        # Invoke the step callback after the step is logged to the episode
        if callbacks.get("on_episode_step"):
            callbacks["on_episode_step"]({"env": base_env, "episode": episode})

        # Cut the batch if we're not packing multiple episodes into one,
        # or if we've exceeded the requested batch size.
        if episode.batch_builder.has_pending_data():
            if dones[env_id]["__all__"] and not no_done_at_end:
                episode.batch_builder.check_missing_dones()
            if (all_done and not pack) or \
                    episode.batch_builder.count >= unroll_length:
                outputs.append(episode.batch_builder.build_and_reset(episode))
            elif all_done:
                # Make sure postprocessor stays within one episode
                episode.batch_builder.postprocess_batch_so_far(episode)

        if all_done:
            # Handle episode termination
            batch_builder_pool.append(episode.batch_builder)
            if callbacks.get("on_episode_end"):
                callbacks["on_episode_end"]({
                    "env": base_env,
                    "policy": policies,
                    "episode": episode
                })
            if hit_horizon and soft_horizon:
                episode.soft_reset()
                resetted_obs = agent_obs
            else:
                del active_episodes[env_id]
                resetted_obs = base_env.try_reset(env_id)
            if resetted_obs is None:
                # Reset not supported, drop this env from the ready list
                if horizon != float("inf"):
                    raise ValueError(
                        "Setting episode horizon requires reset() support "
                        "from the environment.")
            elif resetted_obs != ASYNC_RESET_RETURN:
                # Creates a new episode if this is not async return
                # If reset is async, we will get its result in some future poll
                episode = active_episodes[env_id]
                for agent_id, raw_obs in resetted_obs.items():
                    policy_id = episode.policy_for(agent_id)  # eg: "policy_0"
                    policy = _get_or_raise(policies, policy_id)
                    prep_obs = _get_or_raise(preprocessors,
                                             policy_id).transform(raw_obs)
                    filtered_obs = _get_or_raise(obs_filters,
                                                 policy_id)(prep_obs)
                    # print('policy_id' + str(policy_id))
                    # print('filtered_obs' + str(filtered_obs))
                    # NOTE: the episode has terminated and a new episode is being
                    # created. This again performs the online Q-eval, so the
                    # neighbor_obs values must be passed to the Q-eval network.
                    # Using the road-network relations stored in
                    # traffic_light_node_dict, find the neighbors of the current
                    # policy_id and store them as policy ids of the form "policy_0".
                    neighbor_pid_list = [
                        'policy_{}'.format(pid_)
                        for pid_ in traffic_light_node_dict[inter_num_2_id(
                            int(policy_id.split('_')[1]))]['adjacency_row']
                        if pid_ is not None
                    ]
                    # print(neighbor_pid_list)
                    neighbor_obs = []
                    neighbor_obs.append([])

                    # Size (1, 5, 29): only this layout can be fed into the neighbor_obs placeholder of shape (batch, 5, 17)
                    for neighbor_id in neighbor_pid_list:
                        neighbor_prep_obs = _get_or_raise(
                            preprocessors, neighbor_id).transform(raw_obs)
                        neighbor_filtered_obs = _get_or_raise(
                            obs_filters, neighbor_id)(neighbor_prep_obs)
                        neighbor_obs[0].append(neighbor_filtered_obs)
                    neighbor_obs = np.squeeze(np.array(neighbor_obs))

                    # ------------------------------------------------------------------
                    episode._set_last_observation(agent_id, filtered_obs)
                    to_eval[policy_id].append(
                        PolicyEvalData(
                            env_id, agent_id, filtered_obs, neighbor_obs,
                            episode.last_info_for(agent_id) or {},
                            episode.rnn_state_for(agent_id),
                            np.zeros_like(
                                _flatten_action(policy.action_space.sample())),
                            0.0))

    return active_envs, to_eval, outputs
def rollout(agent,
            env_name,
            num_steps,
            num_episodes=0,
            saver=None,
            no_render=True,
            monitor=False):
    policy_agent_mapping = default_policy_agent_mapping

    if saver is None:
        saver = RolloutSaver()

    if hasattr(agent, "workers"):
        env = agent.workers.local_worker().env
        multiagent = isinstance(env, MultiAgentEnv)
        if agent.workers.local_worker().multiagent:
            policy_agent_mapping = agent.config["multiagent"][
                "policy_mapping_fn"]

        policy_map = agent.workers.local_worker().policy_map
        state_init = {p: m.get_initial_state() for p, m in policy_map.items()}
        use_lstm = {p: len(s) > 0 for p, s in state_init.items()}
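        # A policy is treated as recurrent (LSTM) if it returns a non-empty
        # initial RNN state; that state is then threaded through
        # agent.compute_action() in the loop below.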
        action_init = {
            p: _flatten_action(m.action_space.sample())
            for p, m in policy_map.items()
        }
    else:
        env = gym.make(env_name)
        multiagent = False
        use_lstm = {DEFAULT_POLICY_ID: False}

    if monitor and not no_render and saver and saver.outfile is not None:
        # If monitoring has been requested,
        # manually wrap our environment with a gym monitor
        # which is set to record every episode.
        env = gym.wrappers.Monitor(
            env, os.path.join(os.path.dirname(saver.outfile), "monitor"),
            lambda x: True)

    steps = 0
    episodes = 0
    if not no_render:
        shape = env.base_map.shape
        full_obs = [
            np.zeros((shape[0], shape[1], 3), dtype=np.uint8)
            for i in range(num_steps * num_episodes)
        ]
    while episodes < num_episodes:
        mapping_cache = {}  # in case policy_agent_mapping is stochastic
        saver.begin_rollout()
        obs = env.reset()
        agent_states = DefaultMapping(
            lambda agent_id: state_init[mapping_cache[agent_id]])
        prev_actions = DefaultMapping(
            lambda agent_id: action_init[mapping_cache[agent_id]])
        prev_rewards = collections.defaultdict(lambda: 0.)
        done = False
        reward_total = 0.0
        intrinsic_total = 0.0
        env_total = 0.0
        out_file = open('videos/communication_log.txt', 'a')  # append so each episode's log is kept
        out_file.write(f'\n\n episode-{episodes} \n\n')
        while not done and steps < num_steps:
            multi_obs = obs if multiagent else {_DUMMY_AGENT_ID: obs}
            action_dict = {}
            for agent_id, a_obs in multi_obs.items():
                if a_obs is not None:
                    policy_id = mapping_cache.setdefault(
                        agent_id, policy_agent_mapping(agent_id))
                    p_use_lstm = use_lstm[policy_id]
                    if p_use_lstm:
                        a_action, p_state, _ = agent.compute_action(
                            a_obs,
                            state=agent_states[agent_id],
                            prev_action=prev_actions[agent_id],
                            prev_reward=prev_rewards[agent_id],
                            policy_id=policy_id)
                        agent_states[agent_id] = p_state
                    else:
                        a_action = agent.compute_action(
                            a_obs,
                            prev_action=prev_actions[agent_id],
                            prev_reward=prev_rewards[agent_id],
                            policy_id=policy_id)
                    a_action = _flatten_action(a_action)  # tuple actions
                    action_dict[agent_id] = a_action
                    prev_actions[agent_id] = a_action
            action = action_dict

            action = action if multiagent else action[_DUMMY_AGENT_ID]
            next_obs, reward, done, info = env.step(action)
            if multiagent:
                for agent_id, r in reward.items():
                    prev_rewards[agent_id] = r
            else:
                prev_rewards[_DUMMY_AGENT_ID] = reward

            if multiagent:
                done = done["__all__"]
                reward_total += sum(reward.values())
                intrinsic_total += sum([f['intrinsic'] for f in info.values()])
                env_total += sum([f['environmental'] for f in info.values()])
            else:
                reward_total += reward
            if not no_render:
                # env.render()
                rgb_arr = env.map_to_colors()
                full_obs[steps + (num_steps * episodes)] = rgb_arr.astype(
                    np.uint8)
                out_file.write(f'step-{steps}: {action}\n')

            saver.append_step(obs, action, next_obs, reward, done, info)
            steps += 1
            obs = next_obs
        out_file.close()
        saver.end_rollout()
        print("Episode #{}: reward: {}, intrinsic: {}, env: {}".format(
            episodes, reward_total, intrinsic_total, env_total))
        episodes += 1
        steps = 0

    if not no_render:
        path = os.path.abspath(os.path.dirname(__file__)) + '/videos'
        print('saving video to ', path)
        if not os.path.exists(path):
            os.makedirs(path)
        images_path = path + '/images/'
        if not os.path.exists(images_path):
            os.makedirs(images_path)
        utility_funcs.make_video_from_rgb_imgs(full_obs, path)

        # Clean up images
        shutil.rmtree(images_path)
Esempio n. 12
0
def rollout(agent,
            env_name,
            num_steps,
            num_episodes=0,
            saver=RolloutSaver(),
            no_render=True,
            monitor=False):
    policy_agent_mapping = default_policy_agent_mapping

    if hasattr(agent, "workers"):
        env = agent.workers.local_worker().env
        multiagent = isinstance(env, MultiAgentEnv)
        if agent.workers.local_worker().multiagent:
            policy_agent_mapping = agent.config["multiagent"][
                "policy_mapping_fn"]

        policy_map = agent.workers.local_worker().policy_map
        state_init = {p: m.get_initial_state() for p, m in policy_map.items()}
        use_lstm = {p: len(s) > 0 for p, s in state_init.items()}
        action_init = {
            p: _flatten_action(m.action_space.sample())
            for p, m in policy_map.items()
        }
    else:
        env = gym.make(env_name)
        multiagent = False
        use_lstm = {DEFAULT_POLICY_ID: False}

    if monitor and not no_render and saver and saver.outfile is not None:
        # If monitoring has been requested,
        # manually wrap our environment with a gym monitor
        # which is set to record every episode.
        env = gym.wrappers.Monitor(
            env, os.path.join(os.path.dirname(saver.outfile), "monitor"),
            lambda x: True)

    steps = 0
    episodes = 0
    ##################################################################### start Roman
    MeshStates = {}  # initialize the mesh of discretized states (empty dict as a placeholder; structure assumed)
    import cvxpy as cp  # convex-optimization library used for the distance calculation (module name assumed)
    from .state_distance_calculator import is_distance_threshold_exceeded  # function to calculate distance
    ##################################################################### end Roman
    while keep_going(steps, num_steps, episodes, num_episodes):
        mapping_cache = {}  # in case policy_agent_mapping is stochastic
        saver.begin_rollout()
        obs = env.reset()
        agent_states = DefaultMapping(
            lambda agent_id: state_init[mapping_cache[agent_id]])
        prev_actions = DefaultMapping(
            lambda agent_id: action_init[mapping_cache[agent_id]])
        prev_rewards = collections.defaultdict(lambda: 0.)
        done = False
        reward_total = 0.0
        ################################################################# start Roman
        # Number of simulation steps we want to run before logging into our state
        # transition matrix. [(1 dt) = (1 step in simulation)
        # = (env.instance.emv.scene.dt = 0.0165)
        # = (env_instance.env.scene.frameskip = 4) * (env_instance.env.scene.timestep = 0.004125)]
        number_of_remaining_simulation_steps = 20
        ################################################################# end Roman
        while not done and keep_going(steps, num_steps, episodes,
                                      num_episodes):
            multi_obs = obs if multiagent else {_DUMMY_AGENT_ID: obs}
            action_dict = {}
            for agent_id, a_obs in multi_obs.items():
                if a_obs is not None:
                    policy_id = mapping_cache.setdefault(
                        agent_id, policy_agent_mapping(agent_id))
                    p_use_lstm = use_lstm[policy_id]
                    if p_use_lstm:
                        a_action, p_state, _ = agent.compute_action(
                            a_obs,
                            state=agent_states[agent_id],
                            prev_action=prev_actions[agent_id],
                            prev_reward=prev_rewards[agent_id],
                            policy_id=policy_id)
                        agent_states[agent_id] = p_state
                    else:
                        a_action = agent.compute_action(
                            a_obs,
                            prev_action=prev_actions[agent_id],
                            prev_reward=prev_rewards[agent_id],
                            policy_id=policy_id)
                    a_action = _flatten_action(a_action)  # tuple actions
                    action_dict[agent_id] = a_action
                    prev_actions[agent_id] = a_action
            action = action_dict

            action = action if multiagent else action[_DUMMY_AGENT_ID]
            next_obs, reward, done, info = env.step(action)
            number_of_remaining_simulation_steps -= 1 
            print("RLLibStates: ")
            print("Observation: ", next_obs)
            print("Observation type: ", type(next_obs))
            print("Observation Shape: ", next_obs.shape)
            if done:  # time to log this episode into our mesh
                if number_of_remaining_simulation_steps > 0:  # the episode ended early, i.e. we failed
                    # increment the failure state (state 0, or whatever we call it)
                    pass
                else:  # the episode continued longer than our threshold of simulation steps
                    # increment the current state
                    pass
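            # A minimal sketch of the intended bookkeeping, assuming MeshStates
            # maps a discretized-state key to a visit count (the key below is a
            # placeholder):
            #   state_key = 0 if number_of_remaining_simulation_steps > 0 else current_state_key
            #   MeshStates[state_key] = MeshStates.get(state_key, 0) + 1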
            if multiagent:
                for agent_id, r in reward.items():
                    prev_rewards[agent_id] = r
            else:
                prev_rewards[_DUMMY_AGENT_ID] = reward

            if multiagent:
                done = done["__all__"]
                reward_total += sum(reward.values())
            else:
                reward_total += reward
            if not no_render:
                env.render()
            saver.append_step(obs, action, next_obs, reward, done, info)
            steps += 1
            obs = next_obs
        saver.end_rollout()
        print("Episode #{}: reward: {}".format(episodes, reward_total))
        if done:
            episodes += 1
Esempio n. 13
0
def rollout(agent, env_name, num_steps, out=None, no_render=True, render_q=False, save_q=False):
    policy_agent_mapping = default_policy_agent_mapping
    if hasattr(agent, "workers"):
        env = agent.workers.local_worker().env
        multiagent = isinstance(env, MultiAgentEnv)
        if agent.workers.local_worker().multiagent:
            policy_agent_mapping = agent.config["multiagent"][
                "policy_mapping_fn"]

        policy_map = agent.workers.local_worker().policy_map
        state_init = {p: m.get_initial_state() for p, m in policy_map.items()}
        use_lstm = {p: len(s) > 0 for p, s in state_init.items()}
        action_init = {
            p: _flatten_action(m.action_space.sample())
            for p, m in policy_map.items()
        }
    else:
        env = gym.make(env_name)
        multiagent = False
        use_lstm = {DEFAULT_POLICY_ID: False}

    if out is not None:
        rollouts = []
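    # NOTE: `video`, RENDER_WIDTH and RENDER_HEIGHT are assumed to be defined at
    # module level, e.g. (sketch with placeholder values):
    #   RENDER_WIDTH, RENDER_HEIGHT = 640, 480
    #   fourcc = cv2.VideoWriter_fourcc(*"mp4v")
    #   video = cv2.VideoWriter("rollout.mp4", fourcc, 30,
    #                           (RENDER_WIDTH, RENDER_HEIGHT))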

    steps = 0
    while steps < (num_steps or steps + 1):
        mapping_cache = {}  # in case policy_agent_mapping is stochastic
        if out is not None:
            rollout = []
        obs = env.reset()
        agent_states = DefaultMapping(
            lambda agent_id: state_init[mapping_cache[agent_id]])
        prev_actions = DefaultMapping(
            lambda agent_id: action_init[mapping_cache[agent_id]])
        prev_rewards = collections.defaultdict(lambda: 0.)
        done = False
        reward_total = 0.0
        while not done and steps < (num_steps or steps + 1):
            multi_obs = obs if multiagent else {_DUMMY_AGENT_ID: obs}
            action_dict = {}
            for agent_id, a_obs in multi_obs.items():
                if a_obs is not None:
                    policy_id = mapping_cache.setdefault(
                        agent_id, policy_agent_mapping(agent_id))
                    p_use_lstm = use_lstm[policy_id]
                    if p_use_lstm:
                        a_action, p_state, _ = agent.compute_action(
                            a_obs,
                            state=agent_states[agent_id],
                            prev_action=prev_actions[agent_id],
                            prev_reward=prev_rewards[agent_id],
                            policy_id=policy_id)
                        agent_states[agent_id] = p_state
                    else:
                        a_action = agent.compute_action(
                            a_obs,
                            prev_action=prev_actions[agent_id],
                            prev_reward=prev_rewards[agent_id],
                            policy_id=policy_id)
                    a_action = _flatten_action(a_action)  # tuple actions
                    action_dict[agent_id] = a_action
                    prev_actions[agent_id] = a_action
                    # Custom code for getting Q values
                    #q = get_q_value(env, agent, policy_id, a_obs)
                    #print("Q",q)

            action = action_dict

            action = action if multiagent else action[_DUMMY_AGENT_ID]
            print(action)

            next_obs, reward, done, _ = env.step(action)
            if multiagent:
                for agent_id, r in reward.items():
                    prev_rewards[agent_id] = r
            else:
                prev_rewards[_DUMMY_AGENT_ID] = reward

            if multiagent:
                done = done["__all__"]
                reward_total += sum(reward.values())
            else:
                reward_total += reward
            #if not no_render:
            frame = env.render(width=RENDER_WIDTH, height=RENDER_HEIGHT)
            bgr = cv2.cvtColor(frame[:,:,:3], cv2.COLOR_RGB2BGR)
            video.write(bgr)

            if out is not None:
                rollout.append([obs, action, next_obs, reward, done])
            steps += 1
            obs = next_obs
        if out is not None:
            rollouts.append(rollout)
        print("Episode reward", reward_total)
Esempio n. 14
0
def _process_observations(base_env, policies, policies_to_train, dead_policies,
                          policy_config, observation_filter, tf_sess,
                          batch_builder_pool, active_episodes, unfiltered_obs,
                          rewards, dones, infos, off_policy_actions, horizon,
                          preprocessors, obs_filters, unroll_length, pack,
                          callbacks, soft_horizon, no_done_at_end):
    #===MOD===
    """Record new data from the environment and prepare for policy evaluation.

    Returns:
        active_envs: set of non-terminated env ids
        to_eval: map of policy_id to list of agent PolicyEvalData
        outputs: list of metrics and samples to return from the sampler
    """

    active_envs = set()
    to_eval = defaultdict(list)
    outputs = []

    # For each environment
    for env_id, agent_obs in unfiltered_obs.items():
        new_episode = env_id not in active_episodes
        episode = active_episodes[env_id]
        if not new_episode:
            episode.length += 1
            episode.batch_builder.count += 1
            episode._add_agent_rewards(rewards[env_id])

        if (episode.batch_builder.total() > max(1000, unroll_length * 10)
                and log_once("large_batch_warning")):
            logger.warning(
                "More than {} observations for {} env steps ".format(
                    episode.batch_builder.total(),
                    episode.batch_builder.count) + "are buffered in "
                "the sampler. If this is more than you expected, check that "
                "that you set a horizon on your environment correctly. Note "
                "that in multi-agent environments, `sample_batch_size` sets "
                "the batch size based on environment steps, not the steps of "
                "individual agents, which can result in unexpectedly large "
                "batches.")

        # Check episode termination conditions
        if dones[env_id]["__all__"] or episode.length >= horizon:
            # DEBUG
            # print("Trying to terminate.")
            # print("Dones of __all__ is set:", dones[env_id]["__all__"])
            # print("Horizon hit:", episode.length >= horizon)
            hit_horizon = (episode.length >= horizon
                           and not dones[env_id]["__all__"])
            all_done = True
            atari_metrics = _fetch_atari_metrics(base_env)
            if atari_metrics is not None:
                for m in atari_metrics:
                    outputs.append(
                        m._replace(custom_metrics=episode.custom_metrics))
            else:
                outputs.append(
                    RolloutMetrics(episode.length, episode.total_reward,
                                   dict(episode.agent_rewards),
                                   episode.custom_metrics, {}))
        else:
            hit_horizon = False
            all_done = False
            active_envs.add(env_id)

        #===MOD===
        additional_builders_ids = set()
        #===MOD===

        # For each agent in the environment
        for agent_id, raw_obs in agent_obs.items():

            #===MOD===
            policy_id, policy_constructor_tuple = episode.policy_for(agent_id)
            pols_tuple = generate_policies(
                policy_id,
                policy_constructor_tuple,
                policies,
                policies_to_train,
                dead_policies,
                policy_config,
                preprocessors,
                obs_filters,
                observation_filter,
                tf_sess,
            )
            policies, preprocessors, obs_filters, policies_to_train, dead_policies = pols_tuple
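            # `generate_policies` (defined elsewhere in this module) presumably
            # constructs the policy for this agent on demand and returns the
            # updated policy / preprocessor / filter registries unpacked above.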
            #===MOD===

            prep_obs = _get_or_raise(preprocessors,
                                     policy_id).transform(raw_obs)
            if log_once("prep_obs"):
                logger.info("Preprocessed obs: {}".format(summarize(prep_obs)))

            filtered_obs = _get_or_raise(obs_filters, policy_id)(prep_obs)
            if log_once("filtered_obs"):
                logger.info("Filtered obs: {}".format(summarize(filtered_obs)))

            agent_done = bool(all_done or dones[env_id].get(agent_id))
            if not agent_done:
                to_eval[policy_id].append(
                    PolicyEvalData(env_id, agent_id, filtered_obs,
                                   infos[env_id].get(agent_id, {}),
                                   episode.rnn_state_for(agent_id),
                                   episode.last_action_for(agent_id),
                                   rewards[env_id][agent_id] or 0.0))

            last_observation = episode.last_observation_for(agent_id)
            episode._set_last_observation(agent_id, filtered_obs)
            episode._set_last_raw_obs(agent_id, raw_obs)
            episode._set_last_info(agent_id, infos[env_id].get(agent_id, {}))

            # Record transition info if applicable
            if (last_observation is not None and infos[env_id].get(
                    agent_id, {}).get("training_enabled", True)):
                #===MOD===
                additional_builders_ids.add(agent_id)
                #===MOD===
                episode.batch_builder.add_values(
                    agent_id,
                    policy_id,
                    t=episode.length - 1,
                    eps_id=episode.episode_id,
                    agent_index=episode._agent_index(agent_id),
                    obs=last_observation,
                    actions=episode.last_action_for(agent_id),
                    rewards=rewards[env_id][agent_id],
                    prev_actions=episode.prev_action_for(agent_id),
                    prev_rewards=episode.prev_reward_for(agent_id),
                    dones=(False if
                           (no_done_at_end or
                            (hit_horizon and soft_horizon)) else agent_done),
                    infos=infos[env_id].get(agent_id, {}),
                    new_obs=filtered_obs,
                    **episode.last_pi_info_for(agent_id))

            #===MOD===
            if agent_done:
                # Does it make sense to remove agent id from `agent_builders`?
                dead_policies.add(policy_id)
                print("Removing agent id from agent builders: %s" %
                      str(agent_id))
                episode.batch_builder.agent_builders.pop(agent_id)
                if policy_id in to_eval:
                    to_eval.pop(policy_id)
                    # print("Popping policy id from toeval.")
            #===MOD===

        start = time.time()

        #===MOD===
        print("sampler.py: ids added to agent builders:\t",
              additional_builders_ids)
        # Update ``self.policy_map`` in ``MultiAgentSampleBatchBuilder``.
        # TODO: policies is not being pruned in this file.
        episode.batch_builder.policy_map = policies
        print("sampler.py: policies: \t", policies.keys())
        #===MOD===

        # Invoke the step callback after the step is logged to the episode
        if callbacks.get("on_episode_step"):
            callbacks["on_episode_step"]({"env": base_env, "episode": episode})

        # Cut the batch if we're not packing multiple episodes into one,
        # or if we've exceeded the requested batch size.
        if episode.batch_builder.has_pending_data():
            if dones[env_id]["__all__"] and not no_done_at_end:
                episode.batch_builder.check_missing_dones()
            if (all_done and not pack) or \
                    episode.batch_builder.count >= unroll_length:
                outputs.append(episode.batch_builder.build_and_reset(episode))
            elif all_done:
                # Make sure postprocessor stays within one episode
                # KEYERROR
                episode.batch_builder.postprocess_batch_so_far(episode)

        if all_done:
            # Handle episode termination
            batch_builder_pool.append(episode.batch_builder)
            if callbacks.get("on_episode_end"):
                callbacks["on_episode_end"]({
                    "env": base_env,
                    "policy": policies,
                    "episode": episode
                })
            if hit_horizon and soft_horizon:
                episode.soft_reset()
                resetted_obs = agent_obs
            else:
                del active_episodes[env_id]
                resetted_obs = base_env.try_reset(env_id)
            if resetted_obs is None:
                # Reset not supported, drop this env from the ready list
                if horizon != float("inf"):
                    raise ValueError(
                        "Setting episode horizon requires reset() support "
                        "from the environment.")
            elif resetted_obs != ASYNC_RESET_RETURN:
                # print("Executing new epsiode non-async return.")
                time.sleep(1)
                raise NotImplementedError(
                    "Multiple episodes not supported by design.")
                # Creates a new episode if this is not async return
                # If reset is async, we will get its result in some future poll
                episode = active_episodes[env_id]
                for agent_id, raw_obs in resetted_obs.items():

                    #===MOD===
                    policy_id, policy_constructor_tuple = episode.policy_for(
                        agent_id)
                    # with tf_sess.as_default():
                    pols_tuple = generate_policies(
                        policy_id,
                        policy_constructor_tuple,
                        policies,
                        policies_to_train,
                        dead_policies,
                        policy_config,
                        preprocessors,
                        obs_filters,
                        observation_filter,
                        tf_sess,
                    )
                    policies, preprocessors, obs_filters, policies_to_train, dead_policies = pols_tuple
                    #===MOD===

                    policy = _get_or_raise(policies, policy_id)
                    prep_obs = _get_or_raise(preprocessors,
                                             policy_id).transform(raw_obs)
                    filtered_obs = _get_or_raise(obs_filters,
                                                 policy_id)(prep_obs)
                    episode._set_last_observation(agent_id, filtered_obs)
                    to_eval[policy_id].append(
                        PolicyEvalData(
                            env_id, agent_id, filtered_obs,
                            episode.last_info_for(agent_id) or {},
                            episode.rnn_state_for(agent_id),
                            np.zeros_like(
                                _flatten_action(policy.action_space.sample())),
                            0.0))

        #===MOD===
        pols_tuple = (policies, preprocessors, obs_filters, policies_to_train,
                      dead_policies)
        #===MOD===
    #===MOD===
    return active_envs, to_eval, outputs, pols_tuple
Esempio n. 15
0
def rollout(agent,
            env,
            env_name,
            num_steps=None,
            require_frame=False,
            require_trajectory=False,
            require_extra_info=False,
            require_full_frame=False,
            require_env_state=False,
            render_mode="rgb_array",
            num_rollouts=1,
            multiagent_environment=False):
    assert require_frame or require_trajectory or require_extra_info or \
           require_env_state, "You must ask for some output!"

    # if num_steps is None:
    #     num_steps = 3000

    policy_agent_mapping = default_policy_agent_mapping

    if isinstance(agent, SymbolicAgentBase):
        agent = agent.get()['agent']
        print("Successfully restore agent at remote worker "
              "from symbolic agent!")

    if hasattr(agent, "workers"):
        # env = agent.workers.local_worker().env
        multiagent = isinstance(env, MultiAgentEnv)
        if agent.workers.local_worker().multiagent:
            policy_agent_mapping = agent.config["multiagent"][
                "policy_mapping_fn"]

        policy_map = agent.workers.local_worker().policy_map
        state_init = {p: m.get_initial_state() for p, m in policy_map.items()}
        use_lstm = {p: len(s) > 0 for p, s in state_init.items()}
        action_init = {
            p: m.action_space.sample()
            for p, m in policy_map.items()
        }
    else:
        # env = gym.make(env_name)
        multiagent = multiagent_environment
        policy = agent.policy
        state_init = {DEFAULT_POLICY_ID: None}
        use_lstm = {p: None for p, s in state_init.items()}
        action_init = {DEFAULT_POLICY_ID: policy.action_space.sample()}

    steps = 0
    now = time.time()
    start = now
    result_list = []
    # while steps < (num_steps or steps + 1):
    for i in range(num_rollouts):
        if require_trajectory:
            trajectory = []
        if require_frame:
            frames = []
            # assert env_name in ["BipedalWalker-v2"]
            frame_extra_info = {
                "value_function": [],
                "reward": [],
                "done": [],
                "step": [],
                "period_info": []
            }
        if require_extra_info:
            extra_infos = []
        if require_env_state:
            env_states = []

        mapping_cache = {}  # in case policy_agent_mapping is stochastic
        obs = env.reset()

        if require_env_state:
            env_states.append(copy.deepcopy(env.get_state_wrap()))

        agent_states = DefaultMapping(
            lambda agent_id: state_init[mapping_cache[agent_id]])
        prev_actions = DefaultMapping(
            lambda agent_id: action_init[mapping_cache[agent_id]])
        prev_rewards = collections.defaultdict(lambda: 0.)
        done = False
        reward_total = 0.0 if not multiagent else {}

        while not done and steps < (num_steps or steps + 1):
            if steps % LOG_INTERVAL_STEPS == (LOG_INTERVAL_STEPS - 1):
                logging.info("Current Steps: {}, Time Elapsed: {:.2f}s, "
                             "Last {} Steps Time: {:.2f}s".format(
                                 steps,
                                 time.time() - start, LOG_INTERVAL_STEPS,
                                 time.time() - now))
                now = time.time()
            multi_obs = obs if multiagent else {_DUMMY_AGENT_ID: obs}
            action_dict = {}
            value_functions = {}
            for agent_id, a_obs in multi_obs.items():
                if a_obs is not None:
                    policy_id = mapping_cache.setdefault(
                        agent_id, policy_agent_mapping(agent_id))
                    p_use_lstm = use_lstm[policy_id]
                    if p_use_lstm:
                        a_action, p_state, a_info = agent.compute_action(
                            a_obs,
                            state=agent_states[agent_id],
                            prev_action=prev_actions[agent_id],
                            prev_reward=prev_rewards[agent_id],
                            policy_id=policy_id)
                        agent_states[agent_id] = p_state
                    else:
                        # This is a workaround
                        if "ES" in agent._name:
                            a_action = agent.compute_action(a_obs)
                            a_info = {}
                        else:
                            a_action, _, a_info = agent.compute_action(
                                a_obs,
                                prev_action=prev_actions[agent_id],
                                prev_reward=prev_rewards[agent_id],
                                policy_id=policy_id,
                                full_fetch=True)
                    a_action = _flatten_action(a_action)  # tuple actions
                    action_dict[agent_id] = a_action
                    prev_actions[agent_id] = a_action
                    if require_extra_info:
                        extra_infos.append(a_info)
                    # This is a workaround
                    if "ES" not in agent._name:
                        value_functions[agent_id] = a_info["vf_preds"]
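                        # "vf_preds" is the policy's value-function estimate,
                        # available because compute_action was called with
                        # full_fetch=True above.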
            # This is a workaround
            if require_frame and ("ES" not in agent._name):
                frame_extra_info['value_function'].append(
                    value_functions[_DUMMY_AGENT_ID])

            if multiagent_environment:
                action = action_dict
            else:
                action = action_dict[_DUMMY_AGENT_ID]

            next_obs, reward, raw_done, _ = env.step(action)

            if multiagent:
                done = raw_done["__all__"]

                for rewk, rewv in reward.items():
                    if rewk not in reward_total:
                        reward_total[rewk] = 0.0
                    reward_total[rewk] += rewv
                # reward_total += sum(reward.values())
            else:
                done = raw_done
                reward_total += reward

            if require_frame:
                frame_extra_info["done"].append(raw_done)
                frame_extra_info["reward"].append(copy.deepcopy(reward_total))
                frame_extra_info["step"].append(steps)

                # Data required for calculating the period.
                # We observe channels 7 and 12, which represent the speed
                # of the knee joints.
                # This only holds for BipedalWalker-v2.
                if env_name in ENV_NAME_PERIOD_FEATURE_LOOKUP.keys():
                    assert obs.ndim == 1
                    period_feature = ENV_NAME_PERIOD_FEATURE_LOOKUP[env_name]
                    frame_extra_info["period_info"].append(obs[period_feature])

            if multiagent:
                for agent_id, r in reward.items():
                    prev_rewards[agent_id] = r
            else:
                prev_rewards[_DUMMY_AGENT_ID] = reward

            kwargs = {"mode": render_mode if require_full_frame else "cropped"}
            # This copy() is really important!
            # Otherwise see error: pyarrow.lib.ArrowInvalid
            if require_frame:
                frame = env.render(**kwargs).copy()
                frames.append(frame)
            if require_trajectory:
                trajectory.append([obs, action, next_obs, reward, done])
            if require_env_state:
                env_states.append(copy.deepcopy(env.get_state_wrap()))
            steps += 1
            obs = next_obs
        logging.info("Episode reward", reward_total)
        result = {}
        if require_frame:
            result['frames'] = np.stack(frames)
            result['frame_extra_info'] = frame_extra_info
        if require_trajectory:
            result['trajectory'] = trajectory
        if require_extra_info:
            extra_info_dict = {k: [] for k in extra_infos[0].keys()}
            for item in extra_infos:
                for k, v in item.items():
                    extra_info_dict[k].append(v)
            result["extra_info"] = extra_info_dict
        if require_env_state:
            result['env_states'] = env_states

        result_list.append(result)

    if num_rollouts == 1:
        return result_list[0]
    return result_list
def run_rollout(env,
                agent,
                multiagent,
                use_lstm,
                policy_agent_mapping,
                state_init,
                action_init,
                num_rollouts,
                render,
                adv_num=None):

    rewards = []
    step_nums = []

    # actually do the rollout
    for r_itr in range(num_rollouts):
        mapping_cache = {}  # in case policy_agent_mapping is stochastic
        agent_states = DefaultMapping(
            lambda agent_id: state_init[mapping_cache[agent_id]])
        prev_actions = DefaultMapping(
            lambda agent_id: action_init[mapping_cache[agent_id]])
        if adv_num is not None:
            env.curr_adversary = adv_num
        obs = env.reset()
        prev_rewards = collections.defaultdict(lambda: 0.)
        done = False
        reward_total = 0.0
        step_num = 0
        while not done:
            step_num += 1
            if adv_num is not None:
                multi_obs = {
                    'agent': obs['agent'],
                    'adversary{}'.format(adv_num): obs['agent']
                }
            else:
                multi_obs = {
                    'agent': obs['agent']
                } if multiagent else {
                    _DUMMY_AGENT_ID: obs
                }
            action_dict = {}
            for agent_id, a_obs in multi_obs.items():
                if a_obs is not None:
                    policy_id = mapping_cache.setdefault(
                        agent_id, policy_agent_mapping(agent_id))
                    p_use_lstm = use_lstm[policy_id]
                    if p_use_lstm:
                        prev_action = _flatten_action(prev_actions[agent_id])
                        a_action, p_state, _ = agent.compute_action(
                            a_obs,
                            state=agent_states[agent_id],
                            prev_action=prev_action,
                            prev_reward=prev_rewards[agent_id],
                            policy_id=policy_id)
                        agent_states[agent_id] = p_state
                    else:
                        prev_action = _flatten_action(prev_actions[agent_id])
                        a_action = agent.compute_action(
                            a_obs,
                            prev_action=prev_action,
                            prev_reward=prev_rewards[agent_id],
                            policy_id=policy_id)
                    # handle the tuple case
                    if len(a_action) > 1:
                        if isinstance(a_action[0], np.ndarray):
                            a_action[0] = a_action[0].flatten()
                    action_dict[agent_id] = a_action
                    prev_action = _flatten_action(a_action)  # tuple actions
                    prev_actions[agent_id] = prev_action
            action = action_dict

            action = action if multiagent else action[_DUMMY_AGENT_ID]

            if adv_num is not None:
                action = {
                    'agent': action['agent'],
                    'adversary0': action['adversary{}'.format(adv_num)]
                }
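                # The selected adversary's action is remapped onto the
                # 'adversary0' key, which appears to be the key the env expects
                # (see the note below about turning the other adversaries off).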

            # we turn the adversaries off so you only send in the pendulum keys
            next_obs, reward, done, info = env.step(action)
            if render:
                env.render()
            if isinstance(done, dict):
                done = done['__all__']
            if multiagent:
                for agent_id, r in reward.items():
                    prev_rewards[agent_id] = r
            else:
                prev_rewards[_DUMMY_AGENT_ID] = reward

            # we only want the robot reward, not the adversary reward
            reward_total += info['agent']['agent_reward']
            obs = next_obs
        print("Episode reward", reward_total)

        rewards.append(reward_total)
        step_nums.append(step_num)

    env.close()

    print('the average reward is ', np.mean(rewards))
    return rewards, step_nums
Esempio n. 17
0
def _process_observations(base_env, policies, batch_builder_pool,
                          active_episodes, unfiltered_obs, rewards, dones,
                          infos, off_policy_actions, horizon, preprocessors,
                          obs_filters, unroll_length, pack, callbacks):
    """Record new data from the environment and prepare for policy evaluation.

    Returns:
        active_envs: set of non-terminated env ids
        to_eval: map of policy_id to list of agent PolicyEvalData
        outputs: list of metrics and samples to return from the sampler
    """

    active_envs = set()
    to_eval = defaultdict(list)
    outputs = []

    # For each environment
    for env_id, agent_obs in unfiltered_obs.items():
        new_episode = env_id not in active_episodes
        episode = active_episodes[env_id]
        if not new_episode:
            episode.length += 1
            episode.batch_builder.count += 1
            episode._add_agent_rewards(rewards[env_id])

        global _large_batch_warned
        if (not _large_batch_warned and
                episode.batch_builder.total() > max(1000, unroll_length * 10)):
            _large_batch_warned = True
            logger.warning(
                "More than {} observations for {} env steps ".format(
                    episode.batch_builder.total(),
                    episode.batch_builder.count) + "are buffered in "
                "the sampler. If this is more than you expected, check that "
                "that you set a horizon on your environment correctly. Note "
                "that in multi-agent environments, `sample_batch_size` sets "
                "the batch size based on environment steps, not the steps of "
                "individual agents, which can result in unexpectedly large "
                "batches.")

        # Check episode termination conditions
        if dones[env_id]["__all__"] or episode.length >= horizon:
            all_done = True
            atari_metrics = _fetch_atari_metrics(base_env)
            if atari_metrics is not None:
                for m in atari_metrics:
                    outputs.append(
                        m._replace(custom_metrics=episode.custom_metrics))
            else:
                outputs.append(
                    RolloutMetrics(episode.length, episode.total_reward,
                                   dict(episode.agent_rewards),
                                   episode.custom_metrics))
        else:
            all_done = False
            active_envs.add(env_id)

        # For each agent in the environment
        for agent_id, raw_obs in agent_obs.items():
            policy_id = episode.policy_for(agent_id)
            prep_obs = _get_or_raise(preprocessors,
                                     policy_id).transform(raw_obs)
            filtered_obs = _get_or_raise(obs_filters, policy_id)(prep_obs)
            agent_done = bool(all_done or dones[env_id].get(agent_id))
            if not agent_done:
                to_eval[policy_id].append(
                    PolicyEvalData(env_id, agent_id, filtered_obs,
                                   infos[env_id].get(agent_id, {}),
                                   episode.rnn_state_for(agent_id),
                                   episode.last_action_for(agent_id),
                                   rewards[env_id][agent_id] or 0.0))

            last_observation = episode.last_observation_for(agent_id)
            episode._set_last_observation(agent_id, filtered_obs)
            episode._set_last_raw_obs(agent_id, raw_obs)
            episode._set_last_info(agent_id, infos[env_id].get(agent_id, {}))

            # Record transition info if applicable
            if (last_observation is not None and infos[env_id].get(
                    agent_id, {}).get("training_enabled", True)):
                episode.batch_builder.add_values(
                    agent_id,
                    policy_id,
                    t=episode.length - 1,
                    eps_id=episode.episode_id,
                    agent_index=episode._agent_index(agent_id),
                    obs=last_observation,
                    actions=episode.last_action_for(agent_id),
                    rewards=rewards[env_id][agent_id],
                    prev_actions=episode.prev_action_for(agent_id),
                    prev_rewards=episode.prev_reward_for(agent_id),
                    dones=agent_done,
                    infos=infos[env_id].get(agent_id, {}),
                    new_obs=filtered_obs,
                    **episode.last_pi_info_for(agent_id))

        # Invoke the step callback after the step is logged to the episode
        if callbacks.get("on_episode_step"):
            callbacks["on_episode_step"]({"env": base_env, "episode": episode})

        # Cut the batch if we're not packing multiple episodes into one,
        # or if we've exceeded the requested batch size.
        if episode.batch_builder.has_pending_data():
            if dones[env_id]["__all__"]:
                episode.batch_builder.check_missing_dones()
            if (all_done and not pack) or \
                    episode.batch_builder.count >= unroll_length:
                outputs.append(episode.batch_builder.build_and_reset(episode))
            elif all_done:
                # Make sure postprocessor stays within one episode
                episode.batch_builder.postprocess_batch_so_far(episode)

        if all_done:
            # Handle episode termination
            batch_builder_pool.append(episode.batch_builder)
            if callbacks.get("on_episode_end"):
                callbacks["on_episode_end"]({
                    "env": base_env,
                    "policy": policies,
                    "episode": episode
                })
            del active_episodes[env_id]
            resetted_obs = base_env.try_reset(env_id)
            if resetted_obs is None:
                # Reset not supported, drop this env from the ready list
                if horizon != float("inf"):
                    raise ValueError(
                        "Setting episode horizon requires reset() support "
                        "from the environment.")
            else:
                # Creates a new episode
                episode = active_episodes[env_id]
                for agent_id, raw_obs in resetted_obs.items():
                    policy_id = episode.policy_for(agent_id)
                    policy = _get_or_raise(policies, policy_id)
                    prep_obs = _get_or_raise(preprocessors,
                                             policy_id).transform(raw_obs)
                    filtered_obs = _get_or_raise(obs_filters,
                                                 policy_id)(prep_obs)
                    episode._set_last_observation(agent_id, filtered_obs)
                    to_eval[policy_id].append(
                        PolicyEvalData(
                            env_id, agent_id, filtered_obs,
                            episode.last_info_for(agent_id) or {},
                            episode.rnn_state_for(agent_id),
                            np.zeros_like(
                                _flatten_action(policy.action_space.sample())),
                            0.0))

    return active_envs, to_eval, outputs
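
The examples in this collection lean on `_flatten_action` (renamed `flatten_to_single_ndarray` in later Ray releases, as the comments below note) to collapse Tuple/Dict action samples into a single 1-D array. A minimal sketch of that assumed behavior, not Ray's actual implementation:

import numpy as np

def flatten_action_sketch(action):
    """Illustrative stand-in for _flatten_action: flatten nested actions."""
    if isinstance(action, dict):
        return np.concatenate(
            [flatten_action_sketch(v) for v in action.values()])
    if isinstance(action, (tuple, list)):
        return np.concatenate(
            [flatten_action_sketch(a) for a in action])
    return np.asarray(action).ravel()

# e.g. flatten_action_sketch((1, np.array([0.5, -0.5]))) -> array([1. , 0.5, -0.5])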
Example no. 18
0
def rollout(agent,
            env_name,
            num_steps,
            num_episodes=0,
            saver=None,
            no_render=True,
            monitor=False):
    policy_agent_mapping = default_policy_agent_mapping

    if saver is None:
        saver = RolloutSaver()

    if hasattr(agent, "workers"):
        env = agent.workers.local_worker().env
        multiagent = isinstance(env, MultiAgentEnv)
        if agent.workers.local_worker().multiagent:
            policy_agent_mapping = agent.config["multiagent"][
                "policy_mapping_fn"]

        policy_map = agent.workers.local_worker().policy_map
        state_init = {p: m.get_initial_state() for p, m in policy_map.items()}
        use_lstm = {p: len(s) > 0 for p, s in state_init.items()}
        action_init = {
            #p: flatten_to_single_ndarray(m.action_space.sample()) # ray 0.8.5
            p: _flatten_action(m.action_space.sample()) # ray 0.8.4
            for p, m in policy_map.items()
        }
    else:
        env = gym.make(env_name)
        multiagent = False
        use_lstm = {DEFAULT_POLICY_ID: False}

    if monitor and not no_render and saver and saver.outfile is not None:
        # If monitoring has been requested,
        # manually wrap our environment with a gym monitor
        # which is set to record every episode.
        env = gym.wrappers.Monitor(
            env, os.path.join(os.path.dirname(saver.outfile), "monitor"),
            lambda x: True)

    steps = 0
    episodes = 0
    simulation_rewards = []
    simulation_rewards_normalized = []
    simulation_percentage_complete = []
    simulation_steps = []

    while keep_going(steps, num_steps, episodes, num_episodes):
        mapping_cache = {}  # in case policy_agent_mapping is stochastic
        saver.begin_rollout()
        obs = env.reset()
        agent_states = DefaultMapping(
            lambda agent_id: state_init[mapping_cache[agent_id]])
        prev_actions = DefaultMapping(
            lambda agent_id: action_init[mapping_cache[agent_id]])
        prev_rewards = collections.defaultdict(lambda: 0.)
        done = False
        reward_total = 0.0

        episode_steps = 0
        episode_max_steps = 0
        episode_num_agents = 0
        agents_score = collections.defaultdict(lambda: 0.)
        agents_done = set()

        while not done and keep_going(steps, num_steps, episodes,
                                      num_episodes):
            multi_obs = obs if multiagent else {_DUMMY_AGENT_ID: obs}
            action_dict = {}
            for agent_id, a_obs in multi_obs.items():
                if a_obs is not None:
                    policy_id = mapping_cache.setdefault(
                        agent_id, policy_agent_mapping(agent_id))
                    p_use_lstm = use_lstm[policy_id]
                    if p_use_lstm:
                        a_action, p_state, _ = agent.compute_action(
                            a_obs,
                            state=agent_states[agent_id],
                            prev_action=prev_actions[agent_id],
                            prev_reward=prev_rewards[agent_id],
                            policy_id=policy_id)
                        agent_states[agent_id] = p_state
                    else:
                        a_action = agent.compute_action(
                            a_obs,
                            prev_action=prev_actions[agent_id],
                            prev_reward=prev_rewards[agent_id],
                            policy_id=policy_id)
                    #a_action = flatten_to_single_ndarray(a_action)  # ray 0.8.5
                    a_action = _flatten_action(a_action)  # tuple actions # ray 0.8.4
                    action_dict[agent_id] = a_action
                    prev_actions[agent_id] = a_action
            action = action_dict

            action = action if multiagent else action[_DUMMY_AGENT_ID]
            next_obs, reward, done, info = env.step(action)
            if multiagent:
                for agent_id, r in reward.items():
                    prev_rewards[agent_id] = r
            else:
                prev_rewards[_DUMMY_AGENT_ID] = reward

            if multiagent:
                done = done["__all__"]
                reward_total += sum(reward.values())
            else:
                reward_total += reward
            if not no_render:
                env.render()
            saver.append_step(obs, action, next_obs, reward, done, info)
            steps += 1
            obs = next_obs

            for agent_id, agent_info in info.items():
                if episode_max_steps == 0:
                    episode_max_steps = agent_info["max_episode_steps"]
                    episode_num_agents = agent_info["num_agents"]
                episode_steps = max(episode_steps, agent_info["agent_step"])
                agents_score[agent_id] = agent_info["agent_score"]
                if agent_info["agent_done"]:
                    agents_done.add(agent_id)

        episode_score = sum(agents_score.values())
        simulation_rewards.append(episode_score)
        simulation_rewards_normalized.append(episode_score / (episode_max_steps + episode_num_agents))
        simulation_percentage_complete.append(float(len(agents_done)) / episode_num_agents)
        simulation_steps.append(episode_steps)

        saver.end_rollout()
        print(f"Episode #{episodes}: "
              f"score: {episode_score:.2f} "
              f"({np.mean(simulation_rewards):.2f}), "
              f"normalized score: {simulation_rewards_normalized[-1]:.2f} "
              f"({np.mean(simulation_rewards_normalized):.2f}), "
              f"percentage_complete: {simulation_percentage_complete[-1]:.2f} "
              f"({np.mean(simulation_percentage_complete):.2f})")
        if done:
            episodes += 1

    print("Evaluation completed:\n"
          f"Episodes: {episodes}\n"
          f"Mean Reward: {np.round(np.mean(simulation_rewards))}\n"
          f"Mean Normalized Reward: {np.round(np.mean(simulation_rewards_normalized))}\n"
          f"Mean Percentage Complete: {np.round(np.mean(simulation_percentage_complete), 3)}\n"
          f"Mean Steps: {np.round(np.mean(simulation_steps), 2)}")

    return {
        'reward': [float(r) for r in simulation_rewards],
        'reward_mean': np.mean(simulation_rewards),
        'reward_std': np.std(simulation_rewards),
        'normalized_reward': [float(r) for r in simulation_rewards_normalized],
        'normalized_reward_mean': np.mean(simulation_rewards_normalized),
        'normalized_reward_std': np.std(simulation_rewards_normalized),
        'percentage_complete': [float(c) for c in simulation_percentage_complete],
        'percentage_complete_mean': np.mean(simulation_percentage_complete),
        'percentage_complete_std': np.std(simulation_percentage_complete),
        'steps': [float(c) for c in simulation_steps],
        'steps_mean': np.mean(simulation_steps),
        'steps_std': np.std(simulation_steps),
    }
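
`DefaultMapping` above is assumed to behave like `collections.defaultdict`, except that the factory receives the missing key (so it can look up `mapping_cache[agent_id]`). A minimal sketch under that assumption:

import collections

class DefaultMappingSketch(collections.defaultdict):
    """defaultdict variant whose default_factory is called with the key."""

    def __missing__(self, key):
        value = self.default_factory(key)
        self[key] = value
        return value

# mirrors the per-agent caches built in the rollout loop above
state_init = {"default_policy": []}
mapping_cache = {"agent_0": "default_policy"}
agent_states = DefaultMappingSketch(
    lambda agent_id: state_init[mapping_cache[agent_id]])
assert agent_states["agent_0"] == []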
Example no. 19
0
def rollout(agent, env_name, num_steps, no_render=True):
    policy_agent_mapping = default_policy_agent_mapping

    if hasattr(agent, "workers"):
        env = agent.workers.local_worker().env
        multiagent = isinstance(env, MultiAgentEnv)
        if agent.workers.local_worker().multiagent:
            policy_agent_mapping = agent.config["multiagent"][
                "policy_mapping_fn"]

        policy_map = agent.workers.local_worker().policy_map
        state_init = {p: m.get_initial_state() for p, m in policy_map.items()}
        use_lstm = {p: len(s) > 0 for p, s in state_init.items()}
        action_init = {
            p: _flatten_action(m.action_space.sample())
            for p, m in policy_map.items()
        }
    else:
        env = gym.make(env_name)
        multiagent = False
        use_lstm = {DEFAULT_POLICY_ID: False}

    steps = 0
    while steps < (num_steps or steps + 1):
        mapping_cache = {}  # in case policy_agent_mapping is stochastic

        obs = env.reset()
        agent_states = DefaultMapping(
            lambda agent_id: state_init[mapping_cache[agent_id]])
        prev_actions = DefaultMapping(
            lambda agent_id: action_init[mapping_cache[agent_id]])
        prev_rewards = collections.defaultdict(lambda: 0.)
        done = False
        reward_total = 0.0
        while not done and steps < (num_steps or steps + 1):
            multi_obs = obs if multiagent else {_DUMMY_AGENT_ID: obs}
            action_dict = {}
            for agent_id, a_obs in multi_obs.items():
                if a_obs is not None:
                    policy_id = mapping_cache.setdefault(
                        agent_id, policy_agent_mapping(agent_id))
                    p_use_lstm = use_lstm[policy_id]
                    if p_use_lstm:
                        a_action, p_state, _ = agent.compute_action(
                            a_obs,
                            state=agent_states[agent_id],
                            prev_action=prev_actions[agent_id],
                            prev_reward=prev_rewards[agent_id],
                            policy_id=policy_id)
                        agent_states[agent_id] = p_state
                    else:
                        a_action = agent.compute_action(
                            a_obs,
                            prev_action=prev_actions[agent_id],
                            prev_reward=prev_rewards[agent_id],
                            policy_id=policy_id)
                    a_action = _flatten_action(a_action)  # tuple actions
                    action_dict[agent_id] = a_action
                    prev_actions[agent_id] = a_action
            action = action_dict

            action = action if multiagent else action[_DUMMY_AGENT_ID]
            next_obs, reward, done, _ = env.step(action)
            if multiagent:
                for agent_id, r in reward.items():
                    prev_rewards[agent_id] = r
            else:
                prev_rewards[_DUMMY_AGENT_ID] = reward

            if multiagent:
                done = done["__all__"]
                reward_total += sum(reward.values())
            else:
                reward_total += reward
            if not no_render:
                env.render()
            steps += 1
            obs = next_obs
        print("Episode reward", reward_total)
Example no. 20
0
def _process_observations(async_vector_env, policies, batch_builder_pool,
                          active_episodes, unfiltered_obs, rewards, dones,
                          infos, off_policy_actions, horizon, obs_filters,
                          unroll_length, pack, callbacks):
    """Record new data from the environment and prepare for policy evaluation.

    Returns:
        active_envs: set of non-terminated env ids
        to_eval: map of policy_id to list of agent PolicyEvalData
        outputs: list of metrics and samples to return from the sampler
    """

    active_envs = set()
    to_eval = defaultdict(list)
    outputs = []

    # For each environment
    for env_id, agent_obs in unfiltered_obs.items():
        new_episode = env_id not in active_episodes
        episode = active_episodes[env_id]
        if not new_episode:
            episode.length += 1
            episode.batch_builder.count += 1
            episode._add_agent_rewards(rewards[env_id])

        # Check episode termination conditions
        if dones[env_id]["__all__"] or episode.length >= horizon:
            all_done = True
            atari_metrics = _fetch_atari_metrics(async_vector_env)
            if atari_metrics is not None:
                for m in atari_metrics:
                    outputs.append(
                        m._replace(custom_metrics=episode.custom_metrics))
            else:
                outputs.append(
                    RolloutMetrics(episode.length, episode.total_reward,
                                   dict(episode.agent_rewards),
                                   episode.custom_metrics))
        else:
            all_done = False
            active_envs.add(env_id)

        # For each agent in the environment
        for agent_id, raw_obs in agent_obs.items():
            policy_id = episode.policy_for(agent_id)
            filtered_obs = _get_or_raise(obs_filters, policy_id)(raw_obs)
            agent_done = bool(all_done or dones[env_id].get(agent_id))
            if not agent_done:
                to_eval[policy_id].append(
                    PolicyEvalData(env_id, agent_id, filtered_obs,
                                   episode.rnn_state_for(agent_id),
                                   episode.last_action_for(agent_id),
                                   rewards[env_id][agent_id] or 0.0))

            last_observation = episode.last_observation_for(agent_id)
            episode._set_last_observation(agent_id, filtered_obs)

            # Record transition info if applicable
            if last_observation is not None and \
                    infos[env_id][agent_id].get("training_enabled", True):
                episode.batch_builder.add_values(
                    agent_id,
                    policy_id,
                    t=episode.length - 1,
                    eps_id=episode.episode_id,
                    agent_index=episode._agent_index(agent_id),
                    obs=last_observation,
                    actions=episode.last_action_for(agent_id),
                    rewards=rewards[env_id][agent_id],
                    prev_actions=episode.prev_action_for(agent_id),
                    prev_rewards=episode.prev_reward_for(agent_id),
                    dones=agent_done,
                    infos=infos[env_id][agent_id],
                    new_obs=filtered_obs,
                    **episode.last_pi_info_for(agent_id))

        # Invoke the step callback after the step is logged to the episode
        if callbacks.get("on_episode_step"):
            callbacks["on_episode_step"]({
                "env": async_vector_env,
                "episode": episode
            })

        # Cut the batch if we're not packing multiple episodes into one,
        # or if we've exceeded the requested batch size.
        if episode.batch_builder.has_pending_data():
            if (all_done and not pack) or \
                    episode.batch_builder.count >= unroll_length:
                outputs.append(episode.batch_builder.build_and_reset(episode))
            elif all_done:
                # Make sure postprocessor stays within one episode
                episode.batch_builder.postprocess_batch_so_far(episode)

        if all_done:
            # Handle episode termination
            batch_builder_pool.append(episode.batch_builder)
            if callbacks.get("on_episode_end"):
                callbacks["on_episode_end"]({
                    "env": async_vector_env,
                    "episode": episode
                })
            del active_episodes[env_id]
            resetted_obs = async_vector_env.try_reset(env_id)
            if resetted_obs is None:
                # Reset not supported, drop this env from the ready list
                if horizon != float("inf"):
                    raise ValueError(
                        "Setting episode horizon requires reset() support "
                        "from the environment.")
            else:
                # Creates a new episode
                episode = active_episodes[env_id]
                for agent_id, raw_obs in resetted_obs.items():
                    policy_id = episode.policy_for(agent_id)
                    policy = _get_or_raise(policies, policy_id)
                    filtered_obs = _get_or_raise(obs_filters,
                                                 policy_id)(raw_obs)
                    episode._set_last_observation(agent_id, filtered_obs)
                    to_eval[policy_id].append(
                        PolicyEvalData(
                            env_id, agent_id, filtered_obs,
                            episode.rnn_state_for(agent_id),
                            np.zeros_like(
                                _flatten_action(policy.action_space.sample())),
                            0.0))

    return active_envs, to_eval, outputs
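
In this older sampler variant, `PolicyEvalData` is constructed with six positional fields and no per-agent info dict (the other `_process_observations` variants in this collection pass seven). A hedged namedtuple sketch matching these call sites; the real definition lives in RLlib and may differ:

import collections

# Field order inferred from the PolicyEvalData(...) calls above; assumed, not
# copied from RLlib.
PolicyEvalDataSketch = collections.namedtuple(
    "PolicyEvalDataSketch",
    ["env_id", "agent_id", "obs", "rnn_state", "prev_action", "prev_reward"])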
def rollout(agent,
            env_name,
            num_steps,
            out=None,
            no_render=False,
            irfs=True,
            noplot=False,
            num_episodes=1):
    Deltas = []
    Del_irf = []
    Obs_irf = []
    
    policy_agent_mapping = default_policy_agent_mapping

    if hasattr(agent, "workers"):
        env = agent.workers.local_worker().env
        multiagent = isinstance(env, MultiAgentEnv)
        if agent.workers.local_worker().multiagent:
            policy_agent_mapping = agent.config["multiagent"][
                "policy_mapping_fn"]

        policy_map = agent.workers.local_worker().policy_map
        state_init = {p: m.get_initial_state() for p, m in policy_map.items()}
        use_lstm = {p: len(s) > 0 for p, s in state_init.items()}
        action_init = {
            p: m.action_space.sample()
            for p, m in policy_map.items()
        }
    else:
        env = gym.make(env_name)
        multiagent = False
        use_lstm = {DEFAULT_POLICY_ID: False}

    if out is not None:
        rollouts = []
    
    episode = 0

    while episode < num_episodes:
        print(f'Episode {episode} of {num_episodes}')
        mapping_cache = {}  # in case policy_agent_mapping is stochastic
        if out is not None:
            rollout = []
        obs = env.reset()
        agent_states = DefaultMapping(
            lambda agent_id: state_init[mapping_cache[agent_id]])
        prev_actions = DefaultMapping(
            lambda agent_id: action_init[mapping_cache[agent_id]])
        prev_rewards = collections.defaultdict(lambda: 0.)
        done = False
        reward_total = 0.0
        steps = 0
        deltas = []
        
        while not done and steps < (num_steps or steps + 1):
            multi_obs = obs if multiagent else {_DUMMY_AGENT_ID: obs}
            action_dict = {}
            for agent_id, a_obs in multi_obs.items():
                if a_obs is not None:
                    policy_id = mapping_cache.setdefault(
                        agent_id, policy_agent_mapping(agent_id))
                    p_use_lstm = use_lstm[policy_id]
                    if p_use_lstm:
                        a_action, p_state, _ = agent.compute_action(
                            a_obs,
                            state=agent_states[agent_id],
                            prev_action=prev_actions[agent_id],
                            prev_reward=prev_rewards[agent_id],
                            policy_id=policy_id)
                        agent_states[agent_id] = p_state
                    else:
                        a_action = agent.compute_action(
                            a_obs,
                            prev_action=prev_actions[agent_id],
                            prev_reward=prev_rewards[agent_id],
                            policy_id=policy_id)
                    a_action = _flatten_action(a_action)  # tuple actions
                    action_dict[agent_id] = a_action
                    prev_actions[agent_id] = a_action
            action = action_dict
            action = action if multiagent else action[_DUMMY_AGENT_ID]
            next_obs, reward, done, info = env.step(action)
            
            #print(f'Step: {env.local_steps}, action: {action} info: {info}')
            deltas.append([info['agent_0']['delta'], info['agent_1']['delta']])
            
            if multiagent:
                for agent_id, r in reward.items():
                    prev_rewards[agent_id] = r
            else:
                prev_rewards[_DUMMY_AGENT_ID] = reward

            if multiagent:
                done = done["__all__"]
                reward_total += sum(reward.values())
            else:
                reward_total += reward
            if not no_render:
                env.render()
            if out is not None:
                rollout.append([obs, action, next_obs, reward, done])
            steps += 1
            obs = next_obs
        if out is not None:
            rollouts.append(rollout)
        print("Episode reward", reward_total)
        
        if not noplot:
            plt.plot(deltas)
            plt.show()
        
            sns.kdeplot(deltas, shade=True, cbar=True, cmap='Blues')
            plt.show()
            
        # === Code for impulse response functions ===
        
        if irfs:
            del_irf = []
            obs_irf = []
            del_irf.append([info['agent_0']['delta'], info['agent_1']['delta']])
            obs_irf.append(obs['agent_0'])
            
            for count in range(100):
                multi_obs = obs if multiagent else {_DUMMY_AGENT_ID: obs}
                action_dict = {}
                for agent_id, a_obs in multi_obs.items():
                    if a_obs is not None:
                        policy_id = mapping_cache.setdefault(
                            agent_id, policy_agent_mapping(agent_id))
                        p_use_lstm = use_lstm[policy_id]
                        if p_use_lstm:
                            a_action, p_state, _ = agent.compute_action(
                                a_obs,
                                state=agent_states[agent_id],
                                prev_action=prev_actions[agent_id],
                                prev_reward=prev_rewards[agent_id],
                                policy_id=policy_id)
                            agent_states[agent_id] = p_state
                        else:
                            a_action = agent.compute_action(
                                a_obs,
                                prev_action=prev_actions[agent_id],
                                prev_reward=prev_rewards[agent_id],
                                policy_id=policy_id)
                        a_action = _flatten_action(a_action)  # tuple actions
                        if count < 2 and agent_id == 'agent_0':
                            action_dict[agent_id] = 0
                            prev_actions[agent_id] = 0
                        else:
                            action_dict[agent_id] = a_action
                            prev_actions[agent_id] = a_action
                action = action_dict
                
                action = action if multiagent else action[_DUMMY_AGENT_ID]
                next_obs, reward, done, info = env.step(action)
                
                del_irf.append([info['agent_0']['delta'], info['agent_1']['delta']])
                obs_irf.append(obs['agent_0'])
            
            if not noplot:
                plt.plot(del_irf)
                plt.show()
                
                plt.plot(obs_irf)
                plt.show()
            
        episode += 1
        Deltas.append(deltas)
        if irfs:
            Del_irf.append(del_irf)
            Obs_irf.append(obs_irf)
    
    if out is not None:
        pickle.dump(rollouts, open(out, "wb"))
    
    return Deltas, Del_irf, Obs_irf
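
A hedged sketch of how the returned lists might be post-processed; `trained_agent` and "my_multiagent_env" are placeholders, and the averaging assumes `irfs=True` so every episode contributes an impulse-response series:

import numpy as np
import matplotlib.pyplot as plt

deltas, del_irf, obs_irf = rollout(trained_agent, "my_multiagent_env",
                                   num_steps=0, num_episodes=5,
                                   no_render=True, noplot=True)
mean_irf = np.mean(np.array(del_irf), axis=0)   # average IRF over episodes
plt.plot(mean_irf)
plt.show()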
def visualize_adversaries(rllib_config, checkpoint, grid_size, num_rollouts, outdir):
    env, agent, multiagent, use_lstm, policy_agent_mapping, state_init, action_init = \
        instantiate_rollout(rllib_config, checkpoint)

    # figure out how many adversaries you have and initialize their grids
    num_adversaries = env.num_adversaries
    adversary_grid_dict = {}
    kl_grid = np.zeros((num_adversaries, num_adversaries))
    for i in range(num_adversaries):
        adversary_str = 'adversary' + str(i)
        # each adversary grid is a map of agent action versus observation dimension
        adversary_grid = np.zeros((grid_size - 1, grid_size - 1, env.observation_space.low.shape[0], env.adv_action_space.low.shape[0])).astype(int)
        strength_grid = np.linspace(env.adv_action_space.low, env.adv_action_space.high, grid_size).T
        obs_grid = np.linspace(env.observation_space.low, env.observation_space.high, grid_size).T
        adversary_grid_dict[adversary_str] = {'grid': adversary_grid, 'action_bins': strength_grid, 'obs_bins': obs_grid,
                                              'action_list': []}

    total_steps = 0

    # env.should_render = True

    # actually do the rollout
    for r_itr in range(num_rollouts):
        print('On iteration {}'.format(r_itr))
        mapping_cache = {}  # in case policy_agent_mapping is stochastic
        agent_states = DefaultMapping(
            lambda agent_id: state_init[mapping_cache[agent_id]])
        prev_actions = DefaultMapping(
            lambda agent_id: action_init[mapping_cache[agent_id]])
        obs = env.reset()
        prev_rewards = collections.defaultdict(lambda: 0.)
        done = False
        reward_total = 0.0
        step_num = 0
        while not done:
            multi_obs = obs if multiagent else {_DUMMY_AGENT_ID: obs}
            obs = multi_obs['agent'] * env.obs_norm
            if isinstance(env.adv_observation_space, dict):
                multi_obs = {'adversary{}'.format(i): {'obs': obs, 'is_active': np.array([1])} for i in range(env.num_adversaries)}
            else:
                multi_obs = {'adversary{}'.format(i): obs for i in range(env.num_adversaries)}
            multi_obs.update({'agent': obs})
            action_dict = {}
            action_dist_dict = {}
            for agent_id, a_obs in multi_obs.items():
                if a_obs is not None:
                    policy_id = mapping_cache.setdefault(
                        agent_id, policy_agent_mapping(agent_id))
                    policy = agent.get_policy(policy_id)
                    p_use_lstm = use_lstm[policy_id]
                    if p_use_lstm:
                        prev_action = _flatten_action(prev_actions[agent_id])
                        a_action, p_state, _ = agent.compute_action(
                            a_obs,
                            state=agent_states[agent_id],
                            prev_action=prev_action,
                            prev_reward=prev_rewards[agent_id],
                            policy_id=policy_id)
                        agent_states[agent_id] = p_state

                        if isinstance(a_obs, dict):
                            flat_obs = np.concatenate([val for val in a_obs.values()])[np.newaxis, :]
                        else:
                            flat_obs = _flatten_action(a_obs)[np.newaxis, :]

                        logits, _ = policy.model.from_batch({"obs": flat_obs,
                                                             "prev_action": prev_action})
                    else:
                        if isinstance(a_obs, dict):
                            flat_obs = np.concatenate([val for val in a_obs.values()])[np.newaxis, :]
                        else:
                            flat_obs = _flatten_action(a_obs)[np.newaxis, :]
                        logits, _ = policy.model.from_batch({"obs": flat_obs})
                        prev_action = _flatten_action(prev_actions[agent_id])
                        flat_action = _flatten_action(a_obs)
                        a_action = agent.compute_action(
                            flat_action,
                            prev_action=prev_action,
                            prev_reward=prev_rewards[agent_id],
                            policy_id=policy_id)

                    # handle the tuple case
                    if len(a_action) > 1:
                        if isinstance(a_action[0], np.ndarray):
                            a_action[0] = a_action[0].flatten()
                    action_dict[agent_id] = a_action
                    action_dist_dict[agent_id] = DiagGaussian(logits, None)
                    prev_action = _flatten_action(a_action)  # tuple actions
                    prev_actions[agent_id] = prev_action

                    # Now store the agent action in the corresponding grid
                    if agent_id != 'agent':
                        action_bins = adversary_grid_dict[agent_id]['action_bins']
                        obs_bins = adversary_grid_dict[agent_id]['obs_bins']

                        heat_map = adversary_grid_dict[agent_id]['grid']
                        for action_loop_index, action in enumerate(a_action):
                            adversary_grid_dict[agent_id]['action_list'].append(a_action[0])
                            action_index = np.digitize(action, action_bins[action_loop_index, :]) - 1
                            # digitize will set the right edge of the box to the wrong value
                            if action_index == heat_map.shape[0]:
                                action_index -= 1
                            for obs_loop_index, obs_elem in enumerate(obs):
                                obs_index = np.digitize(obs_elem, obs_bins[obs_loop_index, :]) - 1
                                if obs_index == heat_map.shape[1]:
                                    obs_index -= 1

                                heat_map[action_index, obs_index, obs_loop_index, action_loop_index] += 1

            for agent_id in multi_obs.keys():
                if agent_id != 'agent':
                    # Now iterate through the agents and compute the kl_diff

                    curr_id = int(agent_id.split('adversary')[1])
                    your_action_dist = action_dist_dict[agent_id]
                    # mean, log_std = np.split(your_logits[0], 2)
                    for i in range(num_adversaries):
                        # KL diff of something with itself is zero
                        if i == curr_id:
                            pass
                        # otherwise just compute the kl difference between the agents
                        else:
                            other_action_dist = action_dist_dict['adversary{}'.format(i)]
                            # other_mean, other_log_std = np.split(other_logits.numpy()[0], 2)
                            kl_diff = your_action_dist.kl(other_action_dist)
                            kl_grid[curr_id, i] += kl_diff

            action = action_dict

            action = action if multiagent else action[_DUMMY_AGENT_ID]

            # we turn the adversaries off so you only send in the pendulum keys
            new_dict = {}
            new_dict.update({'agent': action['agent']})
            next_obs, reward, done, info = env.step(new_dict)
            if isinstance(done, dict):
                done = done['__all__']
            step_num += 1
            if multiagent:
                for agent_id, r in reward.items():
                    prev_rewards[agent_id] = r
            else:
                prev_rewards[_DUMMY_AGENT_ID] = reward

            # we only want the robot reward, not the adversary reward
            reward_total += info['agent']['agent_reward']
            obs = next_obs
        total_steps += step_num

    file_path = os.path.dirname(os.path.abspath(__file__))
    output_file_path = os.path.join(file_path, outdir)
    if not os.path.exists(output_file_path):
        try:
            os.makedirs(os.path.dirname(output_file_path))
        except OSError as exc:
            if exc.errno != errno.EEXIST:
                raise

    # Plot the heatmap of the actions
    for adversary, adv_dict in adversary_grid_dict.items():
        heat_map = adv_dict['grid']
        action_bins = adv_dict['action_bins']
        obs_bins = adv_dict['obs_bins']
        action_list = adv_dict['action_list']

        plt.figure()
        sns.distplot(action_list)
        output_str = '{}/{}'.format(outdir, adversary + 'action_histogram.png')
        plt.savefig(output_str)

        # x_label, y_label = env.transform_adversary_actions(bins)
        # ax = sns.heatmap(heat_map, annot=True, fmt="d")
        xtitles = ['x', 'xdot', 'theta', 'thetadot']
        ytitles = ['ax', 'ay']
        for obs_idx in range(heat_map.shape[-2]):
            for a_idx in range(heat_map.shape[-1]):
                plt.figure()
                # increasing the row index implies moving down on the y axis
                sns.heatmap(heat_map[:, :, obs_idx, a_idx],
                            yticklabels=np.round(action_bins[a_idx], 1),
                            xticklabels=np.round(obs_bins[obs_idx], 1))
                plt.ylabel(ytitles[a_idx])
                plt.xlabel(xtitles[obs_idx])
                output_str = '{}/{}'.format(outdir, adversary + 'action_heatmap_{}_{}.png'.format(xtitles[obs_idx], ytitles[a_idx]))
                plt.savefig(output_str)

    # Plot the kl difference between agents
    plt.figure()
    sns.heatmap(kl_grid / total_steps)
    output_str = '{}/{}'.format(outdir, 'kl_heatmap.png')
    plt.savefig(output_str)
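
The `np.digitize(...) - 1` indexing and the right-edge adjustment above rely on `np.digitize` returning 1-based bin positions, with `len(bins)` for values at or beyond the last edge; a small demonstration with a 5-edge (4-box) grid:

import numpy as np

bins = np.linspace(-1.0, 1.0, 5)      # 5 edges -> 4 boxes, like grid_size = 5
print(np.digitize(-1.0, bins) - 1)    # 0: first box
print(np.digitize(0.99, bins) - 1)    # 3: last box
print(np.digitize(1.0, bins) - 1)     # 4: right edge, clipped back to 3 above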
Example no. 23
0
def rollout(agent,
            env_name,
            num_steps,
            num_episodes=0,
            saver=RolloutSaver(),
            no_render=True,
            monitor=False):
    policy_agent_mapping = default_policy_agent_mapping

    if hasattr(agent, "workers"):
        env = agent.workers.local_worker().env
        multiagent = isinstance(env, MultiAgentEnv)
        if agent.workers.local_worker().multiagent:
            policy_agent_mapping = agent.config["multiagent"][
                "policy_mapping_fn"]

        policy_map = agent.workers.local_worker().policy_map
        state_init = {p: m.get_initial_state() for p, m in policy_map.items()}
        use_lstm = {p: len(s) > 0 for p, s in state_init.items()}
        action_init = {
            p: _flatten_action(m.action_space.sample())
            for p, m in policy_map.items()
        }
    else:
        env = gym.make(env_name)
        multiagent = False
        use_lstm = {DEFAULT_POLICY_ID: False}

    if monitor and not no_render and saver and saver.outfile is not None:
        # If monitoring has been requested,
        # manually wrap our environment with a gym monitor
        # which is set to record every episode.
        env = gym.wrappers.Monitor(
            env, os.path.join(os.path.dirname(saver.outfile), "monitor"),
            lambda x: True)

    steps = 0
    episodes = 0
    while keep_going(steps, num_steps, episodes, num_episodes):
        mapping_cache = {}  # in case policy_agent_mapping is stochastic
        saver.begin_rollout()
        obs = env.reset()
        agent_states = DefaultMapping(
            lambda agent_id: state_init[mapping_cache[agent_id]])
        prev_actions = DefaultMapping(
            lambda agent_id: action_init[mapping_cache[agent_id]])
        prev_rewards = collections.defaultdict(lambda: 0.)
        done = False
        reward_total = 0.0
        while not done and keep_going(steps, num_steps, episodes,
                                      num_episodes):
            multi_obs = obs if multiagent else {_DUMMY_AGENT_ID: obs}
            action_dict = {}
            for agent_id, a_obs in multi_obs.items():
                if a_obs is not None:
                    policy_id = mapping_cache.setdefault(
                        agent_id, policy_agent_mapping(agent_id))
                    p_use_lstm = use_lstm[policy_id]
                    if p_use_lstm:
                        a_action, p_state, _ = agent.compute_action(
                            a_obs,
                            state=agent_states[agent_id],
                            prev_action=prev_actions[agent_id],
                            prev_reward=prev_rewards[agent_id],
                            policy_id=policy_id)
                        agent_states[agent_id] = p_state
                    else:
                        a_action = agent.compute_action(
                            a_obs,
                            prev_action=prev_actions[agent_id],
                            prev_reward=prev_rewards[agent_id],
                            policy_id=policy_id)
                    a_action = _flatten_action(a_action)  # tuple actions
                    action_dict[agent_id] = a_action
                    prev_actions[agent_id] = a_action
            action = action_dict

            action = action if multiagent else action[_DUMMY_AGENT_ID]
            next_obs, reward, done, info = env.step(action)
            if multiagent:
                for agent_id, r in reward.items():
                    prev_rewards[agent_id] = r
            else:
                prev_rewards[_DUMMY_AGENT_ID] = reward

            if multiagent:
                done = done["__all__"]
                reward_total += sum(reward.values())
            else:
                reward_total += reward
            if not no_render:
                env.render()
            saver.append_step(obs, action, next_obs, reward, done, info)
            steps += 1
            obs = next_obs
        saver.end_rollout()
        print("Episode #{}: reward: {}".format(episodes, reward_total))
        if done:
            episodes += 1
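
`keep_going` is not shown in this collection; it is assumed to treat a zero limit as "unlimited" and to stop once either configured limit is reached. A minimal sketch consistent with how it is called above:

def keep_going_sketch(steps, num_steps, episodes, num_episodes):
    """Continue while neither limit (0 means unlimited) has been reached."""
    if num_episodes and episodes >= num_episodes:
        return False
    if num_steps and steps >= num_steps:
        return False
    return True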
Example no. 24
0
def run_bottleneck(args, inflow_rate, num_trials):
    result_dir = args.result_dir if args.result_dir[-1] != '/' \
        else args.result_dir[:-1]

    config = get_rllib_pkl(result_dir)

    # Run on only one cpu for rendering purposes
    config['num_workers'] = 0

    flow_params = get_flow_params(config)

    # Determine agent and checkpoint
    config_run = config['env_config']['run'] if 'run' in config['env_config'] \
        else None
    if args.run and config_run:
        if args.run != config_run:
            print('visualizer_rllib.py: error: run argument '
                  + '\'{}\' passed in '.format(args.run)
                  + 'differs from the one stored in params.json '
                  + '\'{}\''.format(config_run))
            sys.exit(1)
    if args.run:
        agent_cls = get_agent_class(args.run)
    elif config_run:
        agent_cls = get_agent_class(config_run)
    else:
        print('visualizer_rllib.py: error: could not find flow parameter '
              '\'run\' in params.json, '
              'add argument --run to provide the algorithm or model used '
              'to train the results\n e.g. '
              'python ./visualizer_rllib.py /tmp/ray/result_dir 1 --run PPO')
        sys.exit(1)

    # if using a custom model
    if config['model']['custom_model'] == "cc_model":
        if config['model']['use_lstm']:
            ModelCatalog.register_custom_model("cc_model", CentralizedCriticModelRNN)
        else:
            ModelCatalog.register_custom_model("cc_model", CentralizedCriticModel)
        from flow.agents.centralized_PPO import CCTrainer
        agent_cls = CCTrainer
    elif config['model']['custom_model'] == "GRU":
        ModelCatalog.register_custom_model("GRU", GRU)
    elif config['model']['custom_model'] == "FeedForward":
        ModelCatalog.register_custom_model("FeedForward", FeedForward)

    # If we trained by imitating
    if "imitation_weight" in config['model']['custom_options'].keys():
        from flow.agents.ImitationPPO import ImitationTrainer
        agent_cls = ImitationTrainer

    sim_params = flow_params['sim']
    sim_params.restart_instance = False
    dir_path = os.path.dirname(os.path.realpath(__file__))
    emission_path = '{0}/test_time_rollout/'.format(dir_path)
    sim_params.emission_path = emission_path if args.gen_emission else None

    # pick your rendering mode
    if args.render_mode == 'sumo_web3d':
        sim_params.num_clients = 2
        sim_params.render = False
    elif args.render_mode == 'drgb':
        sim_params.render = 'drgb'
        sim_params.pxpm = 4
    elif args.render_mode == 'sumo_gui':
        sim_params.render = False
    elif args.render_mode == 'no_render':
        sim_params.render = False
    if args.save_render:
        sim_params.render = 'drgb'
        sim_params.pxpm = 4
        sim_params.save_render = True

    # Start the environment with the gui turned on and a path for the
    # emission file
    env_params = flow_params['env']
    # TODO(@evinitsky) remove this this is a backwards compatibility hack
    if 'life_penalty' not in env_params.additional_params.keys():
        env_params.additional_params['life_penalty'] = -3
    if args.evaluate:
        env_params.evaluate = True

    # lower the horizon if testing
    if args.horizon:
        config['horizon'] = args.horizon
        env_params.horizon = args.horizon

    # Create and register a gym+rllib env
    create_env, env_name = make_create_env(params=flow_params, version=0)
    register_env(env_name, create_env)

    # create the agent that will be used to compute the actions
    agent = agent_cls(env=env_name, config=config)
    checkpoint = result_dir + '/checkpoint_' + args.checkpoint_num
    checkpoint = checkpoint + '/checkpoint-' + args.checkpoint_num
    agent.restore(checkpoint)

    policy_agent_mapping = default_policy_agent_mapping
    if hasattr(agent, "workers"):
        env = agent.workers.local_worker().env
        multiagent = isinstance(env, MultiAgentEnv)
        if agent.workers.local_worker().multiagent:
            policy_agent_mapping = agent.config["multiagent"][
                "policy_mapping_fn"]

        policy_map = agent.workers.local_worker().policy_map
        state_init = {p: m.get_initial_state() for p, m in policy_map.items()}
        use_lstm = {p: len(s) > 0 for p, s in state_init.items()}
        action_init = {
            p: m.action_space.sample()
            for p, m in policy_map.items()
        }
    else:
        env = gym.make(env_name)
        multiagent = False
        use_lstm = {DEFAULT_POLICY_ID: False}

    if args.render_mode == 'sumo_gui':
        env.sim_params.render = True  # set to True after initializing agent and env

    # Simulate and collect metrics
    outflow_arr = []
    final_outflows = []
    final_inflows = []
    mean_speed = []
    std_speed = []
    mean_rewards = []
    per_agent_rew = collections.defaultdict(lambda: 0.0)

    # keep track of the last 500 points of velocity data for lane 0
    # and 1 in edge 4
    velocity_arr = []
    vel = []
    mapping_cache = {}  # in case policy_agent_mapping is stochastic

    for j in range(num_trials):
        agent_states = DefaultMapping(
            lambda agent_id: state_init[mapping_cache[agent_id]])
        prev_actions = DefaultMapping(
            lambda agent_id: action_init[mapping_cache[agent_id]])
        prev_rewards = collections.defaultdict(lambda: 0.0)
        done = False
        reward_total = 0.0
        obs = env.reset(inflow_rate)
        k = 0
        while k < env_params.horizon and not done:
            vehicles = env.unwrapped.k.vehicle
            vel.append(np.mean(vehicles.get_speed(vehicles.get_ids())))
            # only record lane-speed data over the final `end_len` steps
            if k >= env_params.horizon - args.end_len:
                vehs_on_four = vehicles.get_ids_by_edge('4')
                lanes = vehicles.get_lane(vehs_on_four)
                lane_dict = {veh_id: lane for veh_id, lane in
                             zip(vehs_on_four, lanes)}
                sort_by_lane = sorted(vehs_on_four,
                                      key=lambda x: lane_dict[x])
                num_zeros = lanes.count(0)
                if num_zeros > 0:
                    speed_on_zero = np.mean(vehicles.get_speed(
                        sort_by_lane[0:num_zeros]))
                else:
                    speed_on_zero = 0.0
                if num_zeros < len(vehs_on_four):
                    speed_on_one = np.mean(vehicles.get_speed(
                        sort_by_lane[num_zeros:]))
                else:
                    speed_on_one = 0.0
                velocity_arr.append(
                    [inflow_rate,
                     speed_on_zero,
                     speed_on_one])
            multi_obs = obs if multiagent else {_DUMMY_AGENT_ID: obs}
            action_dict = {}
            for agent_id, a_obs in multi_obs.items():
                if a_obs is not None:
                    policy_id = mapping_cache.setdefault(
                        agent_id, policy_agent_mapping(agent_id))
                    p_use_lstm = use_lstm[policy_id]
                    if p_use_lstm:
                        a_action, p_state, _ = agent.compute_action(
                            a_obs,
                            state=agent_states[agent_id],
                            prev_action=prev_actions[agent_id],
                            prev_reward=prev_rewards[agent_id],
                            policy_id=policy_id)
                        agent_states[agent_id] = p_state
                        print(agent_id, a_action)
                    else:
                        a_action = agent.compute_action(
                            a_obs,
                            prev_action=prev_actions[agent_id],
                            prev_reward=prev_rewards[agent_id],
                            policy_id=policy_id)
                    a_action = _flatten_action(a_action)  # tuple actions
                    action_dict[agent_id] = a_action
                    prev_actions[agent_id] = a_action
                    print(agent_id, a_action)
            action = action_dict

            action = action if multiagent else action[_DUMMY_AGENT_ID]
            # print(action)
            next_obs, reward, done, _ = env.step(action)

            if multiagent:
                for agent_id, r in reward.items():
                    prev_rewards[agent_id] = r
                    per_agent_rew[agent_id] += r
            else:
                prev_rewards[_DUMMY_AGENT_ID] = reward

            if multiagent:
                done = done["__all__"]
                reward_total += sum(reward.values())
            else:
                reward_total += reward
            k += 1
            obs = next_obs

        vehicles = env.unwrapped.k.vehicle
        outflow = vehicles.get_outflow_rate(500)
        final_outflows.append(outflow)
        inflow = vehicles.get_inflow_rate(500)
        final_inflows.append(inflow)
        outflow_arr.append([inflow_rate, outflow, outflow/inflow_rate])
        mean_speed.append(np.mean(vel))
        std_speed.append(np.std(vel))
        mean_rewards.append([inflow, np.mean(list(per_agent_rew.values()))])
    return [outflow_arr, velocity_arr, mean_speed, std_speed, mean_rewards]
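
A hedged sketch of sweeping inflow rates with the function above; `args` stands for the parsed CLI namespace this script is assumed to build elsewhere:

import numpy as np

mean_outflow_by_inflow = {}
for inflow in [1000, 1500, 2000]:   # hypothetical inflow rates
    outflow_arr, velocity_arr, mean_speed, std_speed, mean_rewards = \
        run_bottleneck(args, inflow, num_trials=3)
    # each outflow_arr row is [inflow_rate, outflow, outflow / inflow_rate]
    mean_outflow_by_inflow[inflow] = np.mean([row[1] for row in outflow_arr])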
Example no. 25
0
def _process_observations(base_env, policies, batch_builder_pool,
                          active_episodes, unfiltered_obs, rewards, dones,
                          infos, off_policy_actions, horizon, preprocessors,
                          obs_filters, unroll_length, pack, callbacks):
    """Record new data from the environment and prepare for policy evaluation.

    Returns:
        active_envs: set of non-terminated env ids
        to_eval: map of policy_id to list of agent PolicyEvalData
        outputs: list of metrics and samples to return from the sampler
    """

    active_envs = set()
    to_eval = defaultdict(list)
    outputs = []

    # For each environment
    for env_id, agent_obs in unfiltered_obs.items():
        new_episode = env_id not in active_episodes
        episode = active_episodes[env_id]
        if not new_episode:
            episode.length += 1
            episode.batch_builder.count += 1
            episode._add_agent_rewards(rewards[env_id])

        if (episode.batch_builder.total() > max(1000, unroll_length * 10)
                and log_once("large_batch_warning")):
            logger.warning(
                "More than {} observations for {} env steps ".format(
                    episode.batch_builder.total(),
                    episode.batch_builder.count) + "are buffered in "
                "the sampler. If this is more than you expected, check that "
                "that you set a horizon on your environment correctly. Note "
                "that in multi-agent environments, `sample_batch_size` sets "
                "the batch size based on environment steps, not the steps of "
                "individual agents, which can result in unexpectedly large "
                "batches.")

        # Check episode termination conditions
        if dones[env_id]["__all__"] or episode.length >= horizon:
            all_done = True
            atari_metrics = _fetch_atari_metrics(base_env)
            if atari_metrics is not None:
                for m in atari_metrics:
                    outputs.append(
                        m._replace(custom_metrics=episode.custom_metrics))
            else:
                outputs.append(
                    RolloutMetrics(episode.length, episode.total_reward,
                                   dict(episode.agent_rewards),
                                   episode.custom_metrics))
        else:
            all_done = False
            active_envs.add(env_id)

        # For each agent in the environment
        for agent_id, raw_obs in agent_obs.items():
            policy_id = episode.policy_for(agent_id)
            prep_obs = _get_or_raise(preprocessors,
                                     policy_id).transform(raw_obs)
            if log_once("prep_obs"):
                logger.info("Preprocessed obs: {}".format(summarize(prep_obs)))

            filtered_obs = _get_or_raise(obs_filters, policy_id)(prep_obs)
            if log_once("filtered_obs"):
                logger.info("Filtered obs: {}".format(summarize(filtered_obs)))

            agent_done = bool(all_done or dones[env_id].get(agent_id))
            if not agent_done:
                to_eval[policy_id].append(
                    PolicyEvalData(env_id, agent_id, filtered_obs,
                                   infos[env_id].get(agent_id, {}),
                                   episode.rnn_state_for(agent_id),
                                   episode.last_action_for(agent_id),
                                   rewards[env_id][agent_id] or 0.0))

            last_observation = episode.last_observation_for(agent_id)
            episode._set_last_observation(agent_id, filtered_obs)
            episode._set_last_raw_obs(agent_id, raw_obs)
            episode._set_last_info(agent_id, infos[env_id].get(agent_id, {}))

            # Record transition info if applicable
            if (last_observation is not None and infos[env_id].get(
                    agent_id, {}).get("training_enabled", True)):
                episode.batch_builder.add_values(
                    agent_id,
                    policy_id,
                    t=episode.length - 1,
                    eps_id=episode.episode_id,
                    agent_index=episode._agent_index(agent_id),
                    obs=last_observation,
                    actions=episode.last_action_for(agent_id),
                    rewards=rewards[env_id][agent_id],
                    prev_actions=episode.prev_action_for(agent_id),
                    prev_rewards=episode.prev_reward_for(agent_id),
                    dones=agent_done,
                    infos=infos[env_id].get(agent_id, {}),
                    new_obs=filtered_obs,
                    **episode.last_pi_info_for(agent_id))

        # Invoke the step callback after the step is logged to the episode
        if callbacks.get("on_episode_step"):
            callbacks["on_episode_step"]({"env": base_env, "episode": episode})

        # Cut the batch if we're not packing multiple episodes into one,
        # or if we've exceeded the requested batch size.
        if episode.batch_builder.has_pending_data():
            if dones[env_id]["__all__"]:
                episode.batch_builder.check_missing_dones()
            if (all_done and not pack) or \
                    episode.batch_builder.count >= unroll_length:
                outputs.append(episode.batch_builder.build_and_reset(episode))
            elif all_done:
                # Make sure postprocessor stays within one episode
                episode.batch_builder.postprocess_batch_so_far(episode)

        if all_done:
            # Handle episode termination
            batch_builder_pool.append(episode.batch_builder)
            if callbacks.get("on_episode_end"):
                callbacks["on_episode_end"]({
                    "env": base_env,
                    "policy": policies,
                    "episode": episode
                })
            del active_episodes[env_id]
            resetted_obs = base_env.try_reset(env_id)
            if resetted_obs is None:
                # Reset not supported, drop this env from the ready list
                if horizon != float("inf"):
                    raise ValueError(
                        "Setting episode horizon requires reset() support "
                        "from the environment.")
            else:
                # Creates a new episode
                episode = active_episodes[env_id]
                for agent_id, raw_obs in resetted_obs.items():
                    policy_id = episode.policy_for(agent_id)
                    policy = _get_or_raise(policies, policy_id)
                    prep_obs = _get_or_raise(preprocessors,
                                             policy_id).transform(raw_obs)
                    filtered_obs = _get_or_raise(obs_filters,
                                                 policy_id)(prep_obs)
                    episode._set_last_observation(agent_id, filtered_obs)
                    to_eval[policy_id].append(
                        PolicyEvalData(
                            env_id, agent_id, filtered_obs,
                            episode.last_info_for(agent_id) or {},
                            episode.rnn_state_for(agent_id),
                            np.zeros_like(
                                _flatten_action(policy.action_space.sample())),
                            0.0))

    return active_envs, to_eval, outputs
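
`_get_or_raise`, used throughout these samplers, is assumed to be a dict lookup that fails loudly for an unknown policy id. A minimal sketch:

def get_or_raise_sketch(mapping, policy_id):
    """Return mapping[policy_id]; raise a descriptive error if it is missing."""
    if policy_id not in mapping:
        raise ValueError("Could not find policy for id: {}".format(policy_id))
    return mapping[policy_id]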
Example no. 26
0
def rollout(agent,
            env_name,
            num_steps,
            num_episodes=0,
            video_dir=None,
            config=None,
            level_seed=None):
    policy_agent_mapping = default_policy_agent_mapping
    env, multiagent, policy_map, use_lstm, state_init = get_env(
        agent,
        env_name,
        config,
        level_seed=0 if level_seed is None else level_seed,
    )
    action_init = {
        p: _flatten_action(m.action_space.sample())
        for p, m in policy_map.items()
    }

    vis_info = collections.defaultdict(list)
    steps = 0
    episodes = 0
    all_ep_total_reward = 0
    seeds = []
    while keep_going(steps, num_steps, episodes, num_episodes):
        mapping_cache = {}  # in case policy_agent_mapping is stochastic
        env, multiagent, policy_map, use_lstm, _ = get_env(
            agent,
            env_name,
            config,
            level_seed=episodes if level_seed is None else level_seed,
        )
        obs = env.reset()
        agent_states = DefaultMapping(
            lambda agent_id: state_init[mapping_cache[agent_id]])
        prev_actions = DefaultMapping(
            lambda agent_id: action_init[mapping_cache[agent_id]])
        prev_rewards = collections.defaultdict(lambda: 0.)
        done = False
        reward_total = 0.0
        episode_steps = 0
        while not done and keep_going(steps, num_steps, episodes,
                                      num_episodes):
            multi_obs = obs if multiagent else {_DUMMY_AGENT_ID: obs}
            action_dict = {}
            for agent_id, a_obs in multi_obs.items():
                if a_obs is not None:
                    policy_id = mapping_cache.setdefault(
                        agent_id, policy_agent_mapping(agent_id))
                    p_use_lstm = use_lstm[policy_id]
                    if p_use_lstm:
                        a_action, p_state, _ = agent.compute_action(
                            a_obs,
                            state=agent_states[agent_id],
                            prev_action=prev_actions[agent_id],
                            prev_reward=prev_rewards[agent_id],
                            policy_id=policy_id)
                        agent_states[agent_id] = p_state
                    else:
                        a_action = agent.compute_action(
                            a_obs,
                            prev_action=prev_actions[agent_id],
                            prev_reward=prev_rewards[agent_id],
                            policy_id=policy_id)
                    a_action = _flatten_action(a_action)  # tuple actions
                    action_dict[agent_id] = a_action
                    prev_actions[agent_id] = a_action
            action = action_dict

            action = action if multiagent else action[_DUMMY_AGENT_ID]
            next_obs, reward, done, info = env.step(action)

            if done:
                seeds.append(info["level_seed"])
                print(seeds)

            policy_model = agent.workers.local_worker().get_policy().model
            if hasattr(policy_model, "object_masks"):
                obj_masks = policy_model.object_masks().cpu().numpy()
                vis_info["obj_masks"].append(obj_masks)
            vis_info["obs"].append(obs)
            vis_info["reward"].append(reward)

            episode_steps += 1
            if multiagent:
                for agent_id, r in reward.items():
                    prev_rewards[agent_id] = r
            else:
                prev_rewards[_DUMMY_AGENT_ID] = reward

            if multiagent:
                done = done["__all__"]
                reward_total += sum(reward.values())
            else:
                reward_total += reward
            env.render()
            steps += 1
            obs = next_obs
        print("Episode #{}: reward: {} steps: {}".format(
            episodes, reward_total, episode_steps))
        all_ep_total_reward += reward_total
        if done:
            episodes += 1

    print(f"Average episode reward: {all_ep_total_reward / episodes:.4f}")
    return vis_info
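
A hedged usage sketch for the returned buffers; `trained_agent`, "my_env", and `trainer_config` are placeholders, and a single-agent env is assumed so the collected rewards are scalars:

vis_info = rollout(trained_agent, "my_env", num_steps=0, num_episodes=2,
                   config=trainer_config)
print("frames collected:", len(vis_info["obs"]))
print("total reward over rollouts:", sum(vis_info["reward"]))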