Example #1
    def __init__(self, env,
                 eval_id,
                 fetch_parameter,
                 session_config,
                 separate_plots=False):
        """
        Display "reward" and "step_per_s" curves on Tensorboard

        Args:
            env:
            eval_id:
            fetch_parameter: lambda function that pulls from parameter server
            session_config: to construct AgentTensorplex
            - interval: log to Tensorplex every N episodes.
            - average_episodes: average rewards/speed over the last N episodes
            separate_plots: True to put reward plot in a separate section on
                Tensorboard, False to put all plots together
        """
        super().__init__(env)
        self.tensorplex = get_tensorplex_client(
            '{}/{}'.format('eval', eval_id),
            session_config
        )
        interval = session_config['tensorplex']['update_schedule']['eval_env']
        self._periodic = PeriodicTracker(interval)
        self._avg = interval
        self._separate_plots = separate_plots
        self._throttle_sleep = \
            session_config['tensorplex']['update_schedule']['eval_env_sleep']
        self._fetch_parameter = fetch_parameter
        self._fetch_parameter()  # if this eval is late to the party
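
Every example on this page drives its periodic behavior through PeriodicTracker(interval) and track_increment(). As a reading aid, here is a minimal sketch of the behavior those call sites imply; it is an assumption for illustration, not the actual Surreal class (note that Example #12 passes an increment amount, so the counter accepts an optional step size).

# Hypothetical sketch of PeriodicTracker, inferred from the call sites on this
# page; the real Surreal implementation may differ.
class PeriodicTrackerSketch:
    def __init__(self, period):
        assert period > 0
        self.period = period
        self._count = 0

    def track_increment(self, n=1):
        # Advance the counter by n; return True whenever the period is reached,
        # then wrap the counter around for the next cycle.
        self._count += n
        if self._count >= self.period:
            self._count %= self.period
            return True
        return False
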
Example #2
    def __init__(self, env,
                 agent_id,
                 session_config,
                 separate_plots=True):
        """
        Display "reward" and "step_per_s" curves on Tensorboard

        Args:
            env:
            agent_id: int.
            session_config: to construct AgentTensorplex
            - interval: log to Tensorplex every N episodes.
            - average_episodes: average rewards/speed over the last N episodes
            separate_plots: True to put reward plot in a separate section on
                Tensorboard, False to put all plots together
        """
        super().__init__(env)
        U.assert_type(agent_id, int)
        self.tensorplex = get_tensorplex_client(
            '{}/{}'.format('agent', agent_id),
            session_config
        )
        interval = session_config['tensorplex']['update_schedule']['training_env']
        self._periodic = PeriodicTracker(interval)
        self._avg = interval
        self._separate_plots = separate_plots
Example #3
 def __init__(self, learner_config, env_config, session_config):
     super().__init__(learner_config, env_config, session_config)
     self.q_func, self.action_dim = build_ffqfunc(self.learner_config,
                                                  self.env_config)
     self.algo = self.learner_config.algo
     self.q_target = self.q_func.clone()
     self.optimizer = torch.optim.Adam(self.q_func.parameters(),
                                       lr=self.algo.lr,
                                       eps=1e-4)
     self.target_update_tracker = PeriodicTracker(
         period=self.algo.target_network_update_freq, )
Example #4
 def __init__(self, *,
              host,
              port,
              flush_iteration):
     """
     Args:
         flush_iteration: how many send() calls before we flush the buffer
     """
     U.assert_type(flush_iteration, int)
     self._client = ZmqSender(host=host,
                              port=port)
     self._exp_buffer = ExpBuffer()
     self._flush_tracker = PeriodicTracker(flush_iteration)
Example #5
class EvalTensorplexMonitor(EpisodeMonitor):
    def __init__(self, env,
                 eval_id,
                 fetch_parameter,
                 session_config,
                 separate_plots=False):
        """
        Display "reward" and "step_per_s" curves on Tensorboard

        Args:
            env:
            eval_id:
            fetch_parameter: lambda function that pulls from parameter server
            session_config: to construct AgentTensorplex
            - interval: log to Tensorplex every N episodes.
            - average_episodes: average rewards/speed over the last N episodes
            separate_plots: True to put reward plot in a separate section on
                Tensorboard, False to put all plots together
        """
        super().__init__(env)
        self.tensorplex = get_tensorplex_client(
            '{}/{}'.format('eval', eval_id),
            session_config
        )
        interval = session_config['tensorplex']['update_schedule']['eval_env']
        self._periodic = PeriodicTracker(interval)
        self._avg = interval
        self._separate_plots = separate_plots
        self._throttle_sleep = \
            session_config['tensorplex']['update_schedule']['eval_env_sleep']
        self._fetch_parameter = fetch_parameter
        self._fetch_parameter()  # if this eval is late to the party

    def _get_tag(self, tag):
        if self._separate_plots:
            return ':' + tag  # see Tensorplex tag semantics
        else:
            return tag

    def _step(self, action):
        ob, r, done, info = super()._step(action)
        if done and self._periodic.track_increment():
            scalar_values = {
                self._get_tag('reward'):
                    U.mean(self.episode_rewards[-self._avg:]),
                'step_per_s':
                    self.step_per_sec(self._avg),
            }
            self.tensorplex.add_scalars(
                scalar_values,
                global_step=self.num_episodes
            )
            time.sleep(self._throttle_sleep)
            self._fetch_parameter()
        return ob, r, done, info
Example #6
 def __init__(self, env,
              update_interval=10,
              average_over=10,
              extra_rows=None):
     """
     Args:
         update_interval: print every N episodes
         average_over: average rewards/speed over the last N episodes
         extra_rows: an OrderedDict {'row caption': function(total_steps, num_episodes)}
             used to add extra rows to the printed table.
     """
     super().__init__(env)
     self._periodic = PeriodicTracker(update_interval)
     self._avg = average_over
     if extra_rows is None:
         self._extra_rows = OrderedDict()
     else:
         assert isinstance(extra_rows, OrderedDict), \
             'extra_rows spec {"row caption": function(total_steps, ' \
             'num_episodes)} must be an OrderedDict'
         self._extra_rows = extra_rows
Example #7
class TrainingTensorplexMonitor(EpisodeMonitor):
    def __init__(self, env,
                 agent_id,
                 session_config,
                 separate_plots=True):
        """
        Display "reward" and "step_per_s" curves on Tensorboard

        Args:
            env:
            agent_id: int.
            session_config: to construct AgentTensorplex
            - interval: log to Tensorplex every N episodes.
            - average_episodes: average rewards/speed over the last N episodes
            separate_plots: True to put reward plot in a separate section on
                Tensorboard, False to put all plots together
        """
        super().__init__(env)
        U.assert_type(agent_id, int)
        self.tensorplex = get_tensorplex_client(
            '{}/{}'.format('agent', agent_id),
            session_config
        )
        interval = session_config['tensorplex']['update_schedule']['training_env']
        self._periodic = PeriodicTracker(interval)
        self._avg = interval
        self._separate_plots = separate_plots

    def _get_tag(self, tag):
        if self._separate_plots:
            return ':' + tag  # see Tensorplex tag semantics
        else:
            return tag

    def _step(self, action):
        ob, r, done, info = super()._step(action)
        if done and self._periodic.track_increment():
            scalar_values = {
                self._get_tag('reward'):
                    U.mean(self.episode_rewards[-self._avg:]),
                'step_per_s':
                    self.step_per_sec(self._avg),
            }
            self.tensorplex.add_scalars(
                scalar_values,
                global_step=self.num_episodes
            )
        return ob, r, done, info
Example #8
class ConsoleMonitor(EpisodeMonitor):
    def __init__(self, env,
                 update_interval=10,
                 average_over=10,
                 extra_rows=None):
        """
        Args:
            update_interval: print every N episodes
            average_over: average rewards/speed over the last N episodes
            extra_rows: an OrderedDict {'row caption': function(total_steps, num_episodes)}
                used to add extra rows to the printed table.
        """
        super().__init__(env)
        self._periodic = PeriodicTracker(update_interval)
        self._avg = average_over
        if extra_rows is None:
            self._extra_rows = OrderedDict()
        else:
            assert isinstance(extra_rows, OrderedDict), \
                'extra_rows spec {"row caption": function(total_steps, ' \
                'num_episodes)} must be an OrderedDict'
            self._extra_rows = extra_rows

    def _step(self, action):
        ob, r, done, info = super()._step(action)
        if done and self._periodic.track_increment():
            info_table = []
            avg_reward = U.mean(self.episode_rewards[-self._avg:])
            info_table.append(['Last {} rewards'.format(self._avg),
                               U.fformat(avg_reward, 3)])
            avg_speed = self.step_per_sec(self._avg)
            info_table.append(['Speed iter/s',
                               U.fformat(avg_speed, 1)])
            info_table.append(['Total steps', self.total_steps])
            info_table.append(['Episodes', self.num_episodes])
            for row_caption, row_func in self._extra_rows.items():
                row_value = row_func(self.total_steps, self.num_episodes)
                info_table.append([row_caption, str(row_value)])
            # `fancy_grid` doesn't work in terminals that don't display unicode
            print(tabulate(info_table, tablefmt='simple', numalign='left'))
        return ob, r, done, info
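
The extra_rows argument lets callers append custom rows to the printed table. A hedged usage sketch follows; the row caption and the lambda are hypothetical, chosen only to show the function(total_steps, num_episodes) contract:

from collections import OrderedDict

# Illustrative only: the caption and callable are made up, not library code.
extra_rows = OrderedDict([
    ('Avg steps/episode',
     lambda total_steps, num_episodes: total_steps // max(num_episodes, 1)),
])
env = ConsoleMonitor(env,
                     update_interval=10,
                     average_over=10,
                     extra_rows=extra_rows)
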
Example #9
class ExpSender(object):
    """
    `send()` logic can be overridden to support
    more complicated agent experiences,
    such as multiagent, self-play, etc.
    """
    def __init__(self, *,
                 host,
                 port,
                 flush_iteration):
        """
        Args:
            flush_iteration: how many send() calls before we flush the buffer
        """
        U.assert_type(flush_iteration, int)
        self._client = ZmqSender(host=host,
                                 port=port)
        self._exp_buffer = ExpBuffer()
        self._flush_tracker = PeriodicTracker(flush_iteration)

    def send(self, hash_dict, nonhash_dict):
        """
        Args:
            hash_dict: Large/Heavy data that should be deduplicated
                       by the caching mechanism
            nonhash_dict: Small data that we can afford to keep copies of
        """
        self._exp_buffer.add(
            hash_dict=hash_dict,
            nonhash_dict=nonhash_dict,
        )
        if self._flush_tracker.track_increment():
            exp_binary = self._exp_buffer.flush()
            self._client.send(exp_binary)
            return U.binary_hash(exp_binary)
        else:
            return None
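
Typical usage might look like the sketch below. The host/port values, the transition generator, and the dict keys are all hypothetical; they only illustrate the hash_dict / nonhash_dict split and the periodic flush every flush_iteration calls:

# Hypothetical usage sketch; connection values, keys, and the generator are illustrative.
sender = ExpSender(host='localhost', port=7001, flush_iteration=100)
for obs, action, reward, obs_next, done in collected_transitions():
    exp_hash = sender.send(
        hash_dict={'obs': obs, 'obs_next': obs_next},  # heavy data, deduplicated by the cache
        nonhash_dict={'action': action, 'reward': reward, 'done': done},
    )
    # exp_hash is None except on every 100th call, when the buffered
    # experiences are flushed over ZMQ and their binary hash is returned.
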
Example #10
 def _setup_parameter_pull(self):
     self._fetch_parameter_mode = self.session_config.agent.fetch_parameter_mode
     self._fetch_parameter_interval = self.session_config.agent.fetch_parameter_interval
     self._fetch_parameter_tracker = PeriodicTracker(
         self._fetch_parameter_interval)
Example #11
class Agent(object, metaclass=U.AutoInitializeMeta):
    """
        Important: When extending this class, make sure to follow the init method
        signature so that orchestrating functions can properly initialize custom agents.

        TODO: Extend the initialization to allow custom non-config per-agent settings.
            This would allow a heterogeneous agent population.
    """
    def __init__(self,
                 learner_config,
                 env_config,
                 session_config,
                 agent_id,
                 agent_mode,
                 render=False):
        """
            Initialize the agent class.
        """
        self.learner_config = learner_config
        self.env_config = env_config
        self.session_config = session_config

        assert agent_mode in AGENT_MODES
        self.agent_mode = agent_mode
        self.agent_id = agent_id

        if self.agent_mode not in [
                'eval_deterministic_local', 'eval_stochastic_local'
        ]:
            self._setup_parameter_pull()
            self._setup_logging()

        self.current_episode = 0
        self.cumulative_steps = 0
        self.current_step = 0

        self.actions_since_param_update = 0
        self.episodes_since_param_update = 0

        self.render = render

    #######
    # Internal initialization methods
    #######
    def _initialize(self):
        """
            Implements the AutoInitializeMeta metaclass hook.
            self.module_dict() can only be called after the subclass has constructed its modules.
        """
        if self.agent_mode not in [
                'eval_deterministic_local', 'eval_stochastic_local'
        ]:
            host, port = os.environ['SYMPH_PS_FRONTEND_HOST'], os.environ[
                'SYMPH_PS_FRONTEND_PORT']
            self._module_dict = self.module_dict()
            if not isinstance(self._module_dict, ModuleDict):
                self._module_dict = ModuleDict(self._module_dict)
            self._ps_client = ParameterClient(
                host=host,
                port=port,
            )

    def _setup_parameter_pull(self):
        self._fetch_parameter_mode = self.session_config.agent.fetch_parameter_mode
        self._fetch_parameter_interval = self.session_config.agent.fetch_parameter_interval
        self._fetch_parameter_tracker = PeriodicTracker(
            self._fetch_parameter_interval)

    def _setup_logging(self):
        """
            Creates tensorplex logger and loggerplex logger
            Initializes bookkeeping values
        """
        if self.agent_mode == 'training':
            logger_name = 'agent-{}'.format(self.agent_id)
            self.tensorplex = self._get_tensorplex('{}/{}'.format(
                'agent', self.agent_id))
        else:
            logger_name = 'eval-{}'.format(self.agent_id)
            self.tensorplex = self._get_tensorplex('{}/{}'.format(
                'eval', self.agent_id))

        self.log = get_loggerplex_client(logger_name, self.session_config)
        # record how long the current parameters have been used
        self.actions_since_param_update = 0
        self.episodes_since_param_update = 0
        # Weighted Average over ~100 parameter updates.
        self.actions_per_param_update = U.MovingAverageRecorder(decay=0.99)
        self.episodes_per_param_update = U.MovingAverageRecorder(decay=0.99)

    def _get_tensorplex(self, name):
        """
            Get the periodic tensorplex object
        Args:
            @name: The name of the collection of metrics
        """
        tp = get_tensorplex_client(name, self.session_config)
        periodic_tp = PeriodicTensorplex(
            tensorplex=tp,
            period=self.session_config.tensorplex.update_schedule.agent,
            is_average=True,
            keep_full_history=False)
        return periodic_tp

    #######
    # Exposed abstract methods
    # Override in subclass, no need to call super().act etc.
    # Enough for basic usage
    #######
    def act(self, obs):
        """
        Abstract method for taking actions.
        You should check `self.agent_mode` inside this method and adapt the act()
        logic for training vs. evaluation.

        Args:
            obs: typically a single observation; make sure to vectorize it
                before passing it to the torch `model`.

        Returns:
            action to be executed in the env
        """
        raise NotImplementedError

    def module_dict(self):
        """
        Returns:
            a dict of name -> surreal.utils.pytorch.Module
        """
        raise NotImplementedError

    #######
    # Advanced exposed methods
    # Override in subclass, NEED to call super().on_parameter_fetched() etc.
    # User need to take care of agent mode
    # For advanced usage
    #######
    def on_parameter_fetched(self, params, info):
        """
            Called when a new parameter is fetched.
        """
        if self.agent_mode == 'training':
            # The time it takes for parameters to travel from learner to agent
            delay = time.time() - info['time']
            self.actions_per_param_update.add_value(
                self.actions_since_param_update)
            self.episodes_per_param_update.add_value(
                self.episodes_since_param_update)
            self.tensorplex.add_scalars({
                '.core/parameter_publish_delay_s':
                delay,
                '.core/actions_per_param_update':
                self.actions_per_param_update.cur_value(),
                '.core/episodes_per_param_update':
                self.episodes_per_param_update.cur_value()
            })
            self.actions_since_param_update = 0
            self.episodes_since_param_update = 0
        return params

    def pre_action(self, obs):
        """
            Called by the agent main script before act() is called
        """
        if self.agent_mode == 'training':
            if self._fetch_parameter_mode == 'step' and \
                    self._fetch_parameter_tracker.track_increment():
                self.fetch_parameter()

    def post_action(self, obs, action, obs_next, reward, done, info):
        """
            Called by the agent main script after act() is called
        """
        self.current_step += 1
        self.cumulative_steps += 1
        if self.agent_mode == 'training':
            self.actions_since_param_update += 1
            if done:
                self.episodes_since_param_update += 1

    def pre_episode(self):
        """
            Called by agent process.
            Can be used to reset internal states before an episode starts
        """
        if self.agent_mode == 'training':
            if self._fetch_parameter_mode == 'episode' and \
                    self._fetch_parameter_tracker.track_increment():
                self.fetch_parameter()

    def post_episode(self):
        """
            Called by agent process.
            Can be used to reset internal states after an episode ends,
            i.e. after post_action() is called with done = True
        """
        self.current_episode += 1

    #######
    # Main loops.
    # Customize this to fully customize the agent process
    #######
    def main(self):
        """
            Default main loop: runs main_setup() once, then main_loop() forever.
        """
        self.main_setup()
        while True:
            self.main_loop()

    def main_setup(self):
        """
            Setup before constant looping
        """
        env = self.get_env()
        env = self.prepare_env(env)
        self.env = env
        if self.agent_mode == "training":
            self.fetch_parameter()

    def main_loop(self):
        """
            One loop of agent, runs one episode of the environment
        """
        env = self.env
        self.pre_episode()
        obs, info = env.reset()
        total_reward = 0.0
        while True:
            if self.render:
                env.unwrapped.render()  # TODO: figure out why it needs to be unwrapped
            self.pre_action(obs)
            action = self.act(obs)
            obs_next, reward, done, info = env.step(action)
            total_reward += reward
            self.post_action(obs, action, obs_next, reward, done, info)
            obs = obs_next
            if done:
                break
        self.post_episode()

        if self.agent_mode in [
                'eval_deterministic_local', 'eval_stochastic_local'
        ]:
            return

        if self.current_episode % 20 == 0:
            self.log.info('Episode {} reward {}'.format(
                self.current_episode, total_reward))

    def get_env(self):
        """
        Returns an instance of an EnvBase subclass, created from self.env_config
        """
        if self.agent_mode in ['eval_deterministic', 'eval_stochastic']:
            env, _ = make_env(self.env_config, mode='eval')
        else:
            env, _ = make_env(self.env_config)
        return env

    def prepare_env(self, env):
        """
            Applies custom wrapper to the environment as necessary
        Args:
            @env: subclass of EnvBase

        Returns:
            @env: The (possibly wrapped) environment
        """
        if self.agent_mode == 'training':
            return self.prepare_env_agent(env)
        else:
            return self.prepare_env_eval(env)

    def prepare_env_agent(self, env):
        """
            Applies custom wrapper to the environment as necessary
            Only changes agent behavior
        """
        # This has to go first as it alters step() return value
        limit_episode_length = self.env_config.limit_episode_length
        if limit_episode_length > 0:
            env = MaxStepWrapper(env, limit_episode_length)
        env = TrainingTensorplexMonitor(env,
                                        agent_id=self.agent_id,
                                        session_config=self.session_config,
                                        separate_plots=True)
        return env

    def prepare_env_eval(self, env):
        """
            Applies custom wrapper to the environment as necessary
            Only changes eval behavior
        """
        limit_episode_length = self.env_config.limit_episode_length
        if limit_episode_length > 0:
            env = MaxStepWrapper(env, limit_episode_length)

        if self.agent_mode not in [
                'eval_deterministic_local', 'eval_stochastic_local'
        ]:
            env = EvalTensorplexMonitor(
                env,
                eval_id=self.agent_id,
                fetch_parameter=self.fetch_parameter,
                session_config=self.session_config,
            )

        env_category = self.env_config.env_name.split(':')[0]
        if self.env_config.video.record_video and self.agent_id == 0:
            # gym video recording not supported due to bug in OpenAI gym
            # https://github.com/openai/gym/issues/1050
            env = VideoWrapper(env, self.env_config, self.session_config)
        return env

    def main_agent(self):
        """
            Main loop run by the agent script
            Override if you want to customize agent behavior completely
        """
        self.main()

    def main_eval(self):
        """
            Main loop run by the eval script
            Override if you want to customize eval behavior completely
        """
        self.main()

    #######
    # Exposed public methods
    #######
    def fetch_parameter(self):
        """
            Fetches parameters from the parameter server and adds some logging
        """
        params, info = self._ps_client.fetch_parameter_with_info()
        if params:
            params = U.deserialize(params)
            params = self.on_parameter_fetched(params, info)
            self._module_dict.load(params)

    def fetch_parameter_info(self):
        """
            Fetch information about the parameters currently held by the parameter server
        """
        return self._ps_client.fetch_info()

    def set_agent_mode(self, agent_mode):
        """
        Args:
            agent_mode: one of AGENT_MODES, e.g. 'training', 'eval_deterministic',
                'eval_stochastic', or their '_local' eval variants
        """
        assert agent_mode in AGENT_MODES
        self.agent_mode = agent_mode
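
To make the two abstract methods concrete, a minimal subclass might look like the sketch below. The linear network, the obs_dim/action_dim config fields, and the sampling logic are assumptions for demonstration only; module_dict() is documented to return surreal.utils.pytorch.Module objects, so the plain torch.nn.Module here is just a stand-in.

import torch
import torch.nn as nn

class TinyDiscreteAgent(Agent):
    """Hypothetical minimal Agent subclass (illustration only)."""

    def __init__(self, learner_config, env_config, session_config,
                 agent_id, agent_mode, render=False):
        super().__init__(learner_config, env_config, session_config,
                         agent_id, agent_mode, render=render)
        # obs_dim / action_dim are assumed config fields for this sketch.
        self.model = nn.Linear(env_config.obs_dim, env_config.action_dim)

    def act(self, obs):
        # Vectorize the single observation before feeding the model.
        obs = torch.as_tensor(obs, dtype=torch.float32).unsqueeze(0)
        with torch.no_grad():
            logits = self.model(obs)
        if self.agent_mode.startswith('eval_deterministic'):
            return int(logits.argmax(dim=-1))
        return int(torch.distributions.Categorical(logits=logits).sample())

    def module_dict(self):
        # Parameters published by the learner are loaded into these modules.
        return {'model': self.model}
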
Example #12
class DQNLearner(Learner):
    def __init__(self, learner_config, env_config, session_config):
        super().__init__(learner_config, env_config, session_config)
        self.q_func, self.action_dim = build_ffqfunc(self.learner_config,
                                                     self.env_config)
        self.algo = self.learner_config.algo
        self.q_target = self.q_func.clone()
        self.optimizer = torch.optim.Adam(self.q_func.parameters(),
                                          lr=self.algo.lr,
                                          eps=1e-4)
        self.target_update_tracker = PeriodicTracker(
            period=self.algo.target_network_update_freq, )

    def _update_target(self):
        self.q_target.copy_from(self.q_func)

    def _run_optimizer(self, loss):
        self.optimizer.zero_grad()
        loss.backward()
        norm_clip = self.algo.grad_norm_clipping
        if norm_clip is not None:
            self.q_func.clip_grad_norm(norm_clip)
            # torch.nn.utils.net_clip_grad_norm(
            #     self.q_func.parameters(),
            #     max_norm=norm_clip
            # )
        self.optimizer.step()

    def _optimize(self, obs, actions, rewards, obs_next, dones, weights):
        # Compute Q(s_t, a)
        # columns of actions taken
        batch_size = obs.size(0)
        assert (U.shape(actions) == U.shape(rewards) == U.shape(dones) ==
                (batch_size, 1))
        q_t_at_action = self.q_func(obs).gather(1, actions)
        q_tp1 = self.q_target(obs_next)
        # Double Q
        if self.algo.double_q:
            # select argmax action using online weights instead of q_target
            q_tp1_online = self.q_func(obs_next)
            q_tp1_online_argmax = q_tp1_online.max(1, keepdim=True)[1]
            q_tp1_best = q_tp1.gather(1, q_tp1_online_argmax)
        else:
            # Mnih et al. 2015 Nature paper
            # use target network for both policy and value selection
            q_tp1_best = q_tp1.max(1, keepdim=True)[0]
        # Q values for terminal states are 0
        q_tp1_best = (1.0 - dones) * q_tp1_best
        # .detach() stops gradient and makes the Variable forget its creator
        q_tp1_best = q_tp1_best.detach()
        # RHS of bellman equation
        q_expected = rewards + self.algo.gamma * q_tp1_best
        td_error = q_t_at_action - q_expected
        # torch_where
        raw_loss = U.huber_loss_per_element(td_error)
        weighted_loss = torch.mean(weights * raw_loss)
        self._run_optimizer(weighted_loss)
        return td_error

    def learn(self, batch_exp):
        weights = (U.torch_ones_like(batch_exp.rewards))
        td_errors = self._optimize(
            batch_exp.obs,
            batch_exp.actions,
            batch_exp.rewards,
            batch_exp.obs_next,
            batch_exp.dones,
            weights,
        )
        batch_size = batch_exp.obs.size(0)
        if self.target_update_tracker.track_increment(batch_size):
            # Update target network periodically.
            self._update_target()
        mean_td_error = U.to_scalar(torch.mean(torch.abs(td_errors)))
        self.tensorplex.add_scalars({'td_error': mean_td_error})

    def default_config(self):
        return {
            'model': {
                'convs': '_list_',
                'fc_hidden_sizes': '_list_',
                'dueling': '_bool_'
            },
            'algo': {
                'lr': 1e-3,
                'optimizer': 'Adam',
                'grad_norm_clipping': 10,
                'gamma': .99,
                'target_network_update_freq': '_int_',
                'double_q': True,
                'exploration': {
                    'schedule': 'linear',
                    'steps': '_int_',
                    'final_eps': 0.01,
                },
                'prioritized': {
                    'enabled': False,
                    'alpha': 0.6,
                    'beta0': 0.4,
                    'beta_anneal_iters': None,
                    'eps': 1e-6
                },
            },
        }

    def module_dict(self):
        return {'q_func': self.q_func}

    """