Code Example #1
class DQN(OffPolicyRLModel):
    """
    The DQN model class. DQN paper: https://arxiv.org/pdf/1312.5602.pdf

    :param policy: (DQNPolicy or str) The policy model to use (MlpPolicy, CnnPolicy, LnMlpPolicy, ...)
    :param env: (Gym environment or str) The environment to learn from (if registered in Gym, can be str)
    :param gamma: (float) discount factor
    :param learning_rate: (float) learning rate for adam optimizer
    :param buffer_size: (int) size of the replay buffer
    :param exploration_fraction: (float) fraction of entire training period over which the exploration rate is
            annealed
    :param exploration_final_eps: (float) final value of random action probability
    :param train_freq: (int) update the model every `train_freq` steps.
    :param batch_size: (int) size of a batch sampled from the replay buffer for training
    :param checkpoint_freq: (int) how often to save the model. This is so that the best version is restored at the
            end of the training. If you do not wish to restore the best version
            at the end of the training, set this variable to None.
    :param checkpoint_path: (str) replacement path used if you need to log somewhere other than the default
            temporary directory.
    :param learning_starts: (int) how many steps of the model to collect transitions for before learning starts
    :param target_network_update_freq: (int) update the target network every `target_network_update_freq` steps.
    :param prioritized_replay: (bool) if True, a prioritized replay buffer will be used.
    :param prioritized_replay_alpha: (float) alpha parameter for prioritized replay buffer
    :param prioritized_replay_beta0: (float) initial value of beta for prioritized replay buffer
    :param prioritized_replay_beta_iters: (int) number of iterations over which beta will be annealed from initial
            value to 1.0. If set to None, this equals max_timesteps.
    :param prioritized_replay_eps: (float) epsilon to add to the TD errors when updating priorities.
    :param param_noise: (bool) Whether or not to apply noise to the parameters of the policy.
    :param verbose: (int) the verbosity level: 0 none, 1 training information, 2 tensorflow debug
    :param tensorboard_log: (str) the log location for tensorboard (if None, no logging)
    :param _init_setup_model: (bool) Whether or not to build the network at the creation of the instance
    """
    def __init__(self,
                 policy,
                 env,
                 gamma=0.99,
                 learning_rate=5e-4,
                 buffer_size=50000,
                 exploration_fraction=0.1,
                 exploration_final_eps=0.02,
                 train_freq=1,
                 batch_size=32,
                 checkpoint_freq=10000,
                 checkpoint_path=None,
                 learning_starts=1000,
                 target_network_update_freq=500,
                 prioritized_replay=False,
                 prioritized_replay_alpha=0.6,
                 prioritized_replay_beta0=0.4,
                 prioritized_replay_beta_iters=None,
                 prioritized_replay_eps=1e-6,
                 param_noise=False,
                 verbose=0,
                 tensorboard_log=None,
                 _init_setup_model=True):

        # TODO: replay_buffer refactoring
        super(DQN, self).__init__(policy=policy,
                                  env=env,
                                  replay_buffer=None,
                                  verbose=verbose,
                                  policy_base=DQNPolicy,
                                  requires_vec_env=False)

        self.checkpoint_path = checkpoint_path
        self.param_noise = param_noise
        self.learning_starts = learning_starts
        self.train_freq = train_freq
        self.prioritized_replay = prioritized_replay
        self.prioritized_replay_eps = prioritized_replay_eps
        self.batch_size = batch_size
        self.target_network_update_freq = target_network_update_freq
        self.checkpoint_freq = checkpoint_freq
        self.prioritized_replay_alpha = prioritized_replay_alpha
        self.prioritized_replay_beta0 = prioritized_replay_beta0
        self.prioritized_replay_beta_iters = prioritized_replay_beta_iters
        self.exploration_final_eps = exploration_final_eps
        self.exploration_fraction = exploration_fraction
        self.buffer_size = buffer_size
        self.learning_rate = learning_rate
        self.gamma = gamma
        self.tensorboard_log = tensorboard_log

        self.graph = None
        self.sess = None
        self._train_step = None
        self.step_model = None
        self.update_target = None
        self.act = None
        self.proba_step = None
        self.replay_buffer = None
        self.beta_schedule = None
        self.exploration = None
        self.params = None
        self.summary = None
        self.episode_reward = None

        if _init_setup_model:
            self.setup_model()

    def setup_model(self):
        with SetVerbosity(self.verbose):
            assert not isinstance(self.action_space, gym.spaces.Box), \
                "Error: DQN cannot output a gym.spaces.Box action space."

            # If the policy is wrapped in functools.partial (e.g. to disable dueling),
            # unwrap it to check the class type
            if isinstance(self.policy, partial):
                test_policy = self.policy.func
            else:
                test_policy = self.policy
            assert issubclass(test_policy, DQNPolicy), "Error: the input policy for the DQN model must be " \
                                                       "an instance of DQNPolicy."

            self.graph = tf.Graph()
            with self.graph.as_default():
                self.sess = tf_util.make_session(graph=self.graph)

                optimizer = tf.train.AdamOptimizer(
                    learning_rate=self.learning_rate)

                self.act, self._train_step, self.update_target, self.step_model = deepq.build_train(
                    q_func=self.policy,
                    ob_space=self.observation_space,
                    ac_space=self.action_space,
                    optimizer=optimizer,
                    gamma=self.gamma,
                    grad_norm_clipping=10,
                    param_noise=self.param_noise,
                    sess=self.sess)
                self.proba_step = self.step_model.proba_step
                self.params = find_trainable_variables("deepq")

                # Initialize the parameters and copy them to the target network.
                tf_util.initialize(self.sess)
                self.update_target(sess=self.sess)

                self.summary = tf.summary.merge_all()

    def learn(self,
              total_timesteps,
              callback=None,
              seed=None,
              log_interval=100,
              tb_log_name="DQN"):
        with SetVerbosity(self.verbose), TensorboardWriter(
                self.graph, self.tensorboard_log, tb_log_name) as writer:
            self._setup_learn(seed)

            # Create the replay buffer
            if self.prioritized_replay:
                self.replay_buffer = PrioritizedReplayBuffer(
                    self.buffer_size, alpha=self.prioritized_replay_alpha)
                if self.prioritized_replay_beta_iters is None:
                    prioritized_replay_beta_iters = total_timesteps
                else:
                    prioritized_replay_beta_iters = self.prioritized_replay_beta_iters
                self.beta_schedule = LinearSchedule(
                    prioritized_replay_beta_iters,
                    initial_p=self.prioritized_replay_beta0,
                    final_p=1.0)
            else:
                self.replay_buffer = ReplayBuffer(self.buffer_size)
                self.beta_schedule = None
            # Create the schedule for exploration starting from 1.
            self.exploration = LinearSchedule(
                schedule_timesteps=int(self.exploration_fraction *
                                       total_timesteps),
                initial_p=1.0,
                final_p=self.exploration_final_eps)

            episode_rewards = [0.0]
            obs = self.env.reset()
            reset = True
            self.episode_reward = np.zeros((1, ))

            for step in range(total_timesteps):
                if callback is not None:
                    callback(locals(), globals())
                # Take action and update exploration to the newest value
                kwargs = {}
                if not self.param_noise:
                    update_eps = self.exploration.value(step)
                    update_param_noise_threshold = 0.
                else:
                    update_eps = 0.
                    # Compute the threshold such that the KL divergence between perturbed and non-perturbed
                    # policy is comparable to eps-greedy exploration with eps = exploration.value(t).
                    # See Appendix C.1 in Parameter Space Noise for Exploration, Plappert et al., 2017
                    # for detailed explanation.
                    update_param_noise_threshold = \
                        -np.log(1. - self.exploration.value(step) +
                                self.exploration.value(step) / float(self.env.action_space.n))
                    kwargs['reset'] = reset
                    kwargs[
                        'update_param_noise_threshold'] = update_param_noise_threshold
                    kwargs['update_param_noise_scale'] = True
                with self.sess.as_default():
                    action = self.act(np.array(obs)[None],
                                      update_eps=update_eps,
                                      **kwargs)[0]
                env_action = action
                reset = False
                new_obs, rew, done, _ = self.env.step(env_action)
                # Store transition in the replay buffer.
                self.replay_buffer.add(obs, action, rew, new_obs, float(done))
                obs = new_obs

                if writer is not None:
                    ep_rew = np.array([rew]).reshape((1, -1))
                    ep_done = np.array([done]).reshape((1, -1))
                    self.episode_reward = total_episode_reward_logger(
                        self.episode_reward, ep_rew, ep_done, writer, step)

                episode_rewards[-1] += rew
                if done:
                    if not isinstance(self.env, VecEnv):
                        obs = self.env.reset()
                    episode_rewards.append(0.0)
                    reset = True

                if step > self.learning_starts and step % self.train_freq == 0:
                    # Minimize the error in Bellman's equation on a batch sampled from replay buffer.
                    if self.prioritized_replay:
                        experience = self.replay_buffer.sample(
                            self.batch_size,
                            beta=self.beta_schedule.value(step))
                        (obses_t, actions, rewards, obses_tp1, dones, weights,
                         batch_idxes) = experience
                    else:
                        obses_t, actions, rewards, obses_tp1, dones = self.replay_buffer.sample(
                            self.batch_size)
                        weights, batch_idxes = np.ones_like(rewards), None
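
                    # Note: the calls to self._train_step below pass obses_tp1 twice;
                    # in this build_train graph the target network and the online
                    # network used for double-Q action selection appear to each have
                    # their own next-observation placeholder, and both are fed the
                    # same batch of next observations.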

                    if writer is not None:
                        # run loss backprop with summary, but once every 100 steps save the metadata
                        # (memory, compute time, ...)
                        if (1 + step) % 100 == 0:
                            run_options = tf.RunOptions(
                                trace_level=tf.RunOptions.FULL_TRACE)
                            run_metadata = tf.RunMetadata()
                            summary, td_errors = self._train_step(
                                obses_t,
                                actions,
                                rewards,
                                obses_tp1,
                                obses_tp1,
                                dones,
                                weights,
                                sess=self.sess,
                                options=run_options,
                                run_metadata=run_metadata)
                            writer.add_run_metadata(run_metadata,
                                                    'step%d' % step)
                        else:
                            summary, td_errors = self._train_step(
                                obses_t,
                                actions,
                                rewards,
                                obses_tp1,
                                obses_tp1,
                                dones,
                                weights,
                                sess=self.sess)
                        writer.add_summary(summary, step)
                    else:
                        _, td_errors = self._train_step(obses_t,
                                                        actions,
                                                        rewards,
                                                        obses_tp1,
                                                        obses_tp1,
                                                        dones,
                                                        weights,
                                                        sess=self.sess)

                    if self.prioritized_replay:
                        new_priorities = np.abs(
                            td_errors) + self.prioritized_replay_eps
                        self.replay_buffer.update_priorities(
                            batch_idxes, new_priorities)

                if step > self.learning_starts and step % self.target_network_update_freq == 0:
                    # Update target network periodically.
                    self.update_target(sess=self.sess)

                if len(episode_rewards[-101:-1]) == 0:
                    mean_100ep_reward = -np.inf
                else:
                    mean_100ep_reward = round(
                        float(np.mean(episode_rewards[-101:-1])), 1)

                num_episodes = len(episode_rewards)
                if self.verbose >= 1 and done and log_interval is not None and len(
                        episode_rewards) % log_interval == 0:
                    logger.record_tabular("steps", step)
                    logger.record_tabular("episodes", num_episodes)
                    logger.record_tabular("mean 100 episode reward",
                                          mean_100ep_reward)
                    logger.record_tabular(
                        "% time spent exploring",
                        int(100 * self.exploration.value(step)))
                    logger.dump_tabular()

        return self

    def predict(self, observation, state=None, mask=None, deterministic=True):
        observation = np.array(observation)
        vectorized_env = self._is_vectorized_observation(
            observation, self.observation_space)

        observation = observation.reshape((-1, ) +
                                          self.observation_space.shape)
        with self.sess.as_default():
            actions, _, _ = self.step_model.step(observation,
                                                 deterministic=deterministic)

        if not vectorized_env:
            actions = actions[0]

        return actions, None

    def action_probability(self, observation, state=None, mask=None):
        observation = np.array(observation)
        vectorized_env = self._is_vectorized_observation(
            observation, self.observation_space)

        observation = observation.reshape((-1, ) +
                                          self.observation_space.shape)
        actions_proba = self.proba_step(observation, state, mask)

        if not vectorized_env:
            if state is not None:
                raise ValueError(
                    "Error: The environment must be vectorized when using recurrent policies."
                )
            actions_proba = actions_proba[0]

        return actions_proba

    def save(self, save_path):
        # params
        data = {
            "checkpoint_path": self.checkpoint_path,
            "param_noise": self.param_noise,
            "learning_starts": self.learning_starts,
            "train_freq": self.train_freq,
            "prioritized_replay": self.prioritized_replay,
            "prioritized_replay_eps": self.prioritized_replay_eps,
            "batch_size": self.batch_size,
            "target_network_update_freq": self.target_network_update_freq,
            "checkpoint_freq": self.checkpoint_freq,
            "prioritized_replay_alpha": self.prioritized_replay_alpha,
            "prioritized_replay_beta0": self.prioritized_replay_beta0,
            "prioritized_replay_beta_iters":
            self.prioritized_replay_beta_iters,
            "exploration_final_eps": self.exploration_final_eps,
            "exploration_fraction": self.exploration_fraction,
            "learning_rate": self.learning_rate,
            "gamma": self.gamma,
            "verbose": self.verbose,
            "observation_space": self.observation_space,
            "action_space": self.action_space,
            "policy": self.policy,
            "n_envs": self.n_envs,
            "_vectorize_action": self._vectorize_action
        }

        params = self.sess.run(self.params)

        self._save_to_file(save_path, data=data, params=params)

    @classmethod
    def load(cls, load_path, env=None, **kwargs):
        data, params = cls._load_from_file(load_path)

        model = cls(policy=data["policy"], env=env, _init_setup_model=False)
        model.__dict__.update(data)
        model.__dict__.update(kwargs)
        model.set_env(env)
        model.setup_model()

        restores = []
        for param, loaded_p in zip(model.params, params):
            restores.append(param.assign(loaded_p))
        model.sess.run(restores)

        return model
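
Usage sketch (not part of the source above): a minimal way to train, save and reload this model, assuming the standard stable-baselines packaging and a registered Gym environment; the import paths and the CartPole environment id are illustrative assumptions, not taken from the listing.

import gym
from stable_baselines.deepq.policies import MlpPolicy
from stable_baselines import DQN  # assumed import path; adjust to wherever this class is defined

env = gym.make("CartPole-v1")

# Construct the model with the defaults documented in the constructor above
model = DQN(MlpPolicy, env, gamma=0.99, learning_rate=5e-4,
            buffer_size=50000, exploration_fraction=0.1,
            exploration_final_eps=0.02, verbose=1)

# Train, persist and reload the agent, then query the greedy policy
model.learn(total_timesteps=10000)
model.save("dqn_cartpole")
model = DQN.load("dqn_cartpole", env=env)

obs = env.reset()
action, _ = model.predict(obs, deterministic=True)
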
Code Example #2
File: dqn.py  Project: ondrejba/stable-baselines
class DQN(OffPolicyRLModel):
    """
    The DQN model class.
    DQN paper: https://arxiv.org/abs/1312.5602
    Dueling DQN: https://arxiv.org/abs/1511.06581
    Double-Q Learning: https://arxiv.org/abs/1509.06461
    Prioritized Experience Replay: https://arxiv.org/abs/1511.05952

    :param policy: (DQNPolicy or str) The policy model to use (MlpPolicy, CnnPolicy, LnMlpPolicy, ...)
    :param env: (Gym environment or str) The environment to learn from (if registered in Gym, can be str)
    :param gamma: (float) discount factor
    :param learning_rate: (float) learning rate for adam optimizer
    :param buffer_size: (int) size of the replay buffer
    :param exploration_fraction: (float) fraction of entire training period over which the exploration rate is
            annealed
    :param exploration_final_eps: (float) final value of random action probability
    :param exploration_initial_eps: (float) initial value of random action probability
    :param train_freq: (int) update the model every `train_freq` steps.
    :param batch_size: (int) size of a batch sampled from the replay buffer for training
    :param double_q: (bool) Whether to enable Double-Q learning or not.
    :param learning_starts: (int) how many steps of the model to collect transitions for before learning starts
    :param target_network_update_freq: (int) update the target network every `target_network_update_freq` steps.
    :param prioritized_replay: (bool) if True, a prioritized replay buffer will be used.
    :param prioritized_replay_alpha: (float) alpha parameter for prioritized replay buffer.
        It determines how much prioritization is used, with alpha=0 corresponding to the uniform case.
    :param prioritized_replay_beta0: (float) initial value of beta for prioritized replay buffer
    :param prioritized_replay_beta_iters: (int) number of iterations over which beta will be annealed from initial
            value to 1.0. If set to None, this equals max_timesteps.
    :param prioritized_replay_eps: (float) epsilon to add to the TD errors when updating priorities.
    :param param_noise: (bool) Whether or not to apply noise to the parameters of the policy.
    :param verbose: (int) the verbosity level: 0 none, 1 training information, 2 tensorflow debug
    :param tensorboard_log: (str) the log location for tensorboard (if None, no logging)
    :param _init_setup_model: (bool) Whether or not to build the network at the creation of the instance
    :param full_tensorboard_log: (bool) enable additional logging when using tensorboard
        WARNING: this logging can take a lot of space quickly
    :param seed: (int) Seed for the pseudo-random generators (python, numpy, tensorflow).
        If None (default), use random seed. Note that if you want completely deterministic
        results, you must set `n_cpu_tf_sess` to 1.
    :param n_cpu_tf_sess: (int) The number of threads for TensorFlow operations.
        If None, the number of CPUs of the current machine will be used.
    """
    def __init__(self, policy, env, gamma=0.99, learning_rate=5e-4, buffer_size=50000, exploration_fraction=0.1,
                 exploration_final_eps=0.02, exploration_initial_eps=1.0, train_freq=1, batch_size=32, double_q=True,
                 learning_starts=1000, target_network_update_freq=500, prioritized_replay=False,
                 prioritized_replay_alpha=0.6, prioritized_replay_beta0=0.4, prioritized_replay_beta_iters=None,
                 prioritized_replay_eps=1e-6, param_noise=False,
                 n_cpu_tf_sess=None, verbose=0, tensorboard_log=None,
                 _init_setup_model=True, policy_kwargs=None, full_tensorboard_log=False, seed=None,
                 use_rmsprop=False, rmsprop_alpha=0.95, rmsprop_epsilon=0.01, exploration_offset=0):

        # TODO: replay_buffer refactoring
        super(DQN, self).__init__(policy=policy, env=env, replay_buffer=None, verbose=verbose, policy_base=DQNPolicy,
                                  requires_vec_env=False, policy_kwargs=policy_kwargs, seed=seed, n_cpu_tf_sess=n_cpu_tf_sess)

        self.param_noise = param_noise
        self.learning_starts = learning_starts
        self.train_freq = train_freq
        self.prioritized_replay = prioritized_replay
        self.prioritized_replay_eps = prioritized_replay_eps
        self.batch_size = batch_size
        self.target_network_update_freq = target_network_update_freq
        self.prioritized_replay_alpha = prioritized_replay_alpha
        self.prioritized_replay_beta0 = prioritized_replay_beta0
        self.prioritized_replay_beta_iters = prioritized_replay_beta_iters
        self.exploration_final_eps = exploration_final_eps
        self.exploration_initial_eps = exploration_initial_eps
        self.exploration_fraction = exploration_fraction
        self.buffer_size = buffer_size
        self.learning_rate = learning_rate
        self.gamma = gamma
        self.tensorboard_log = tensorboard_log
        self.full_tensorboard_log = full_tensorboard_log
        self.double_q = double_q
        self.use_rmsprop = use_rmsprop
        self.rmsprop_alpha = rmsprop_alpha
        self.rmsprop_epsilon = rmsprop_epsilon
        self.exploration_offset = exploration_offset

        self.graph = None
        self.sess = None
        self._train_step = None
        self.step_model = None
        self.update_target = None
        self.act = None
        self.proba_step = None
        self.replay_buffer = None
        self.beta_schedule = None
        self.exploration = None
        self.params = None
        self.summary = None
        self.episode_reward = None

        if _init_setup_model:
            self.setup_model()

    def _get_pretrain_placeholders(self):
        policy = self.step_model
        return policy.obs_ph, tf.placeholder(tf.int32, [None]), policy.q_values

    def setup_model(self):

        with SetVerbosity(self.verbose):
            assert not isinstance(self.action_space, gym.spaces.Box), \
                "Error: DQN cannot output a gym.spaces.Box action space."

            # If the policy is wrapped in functools.partial (e.g. to disable dueling),
            # unwrap it to check the class type
            if isinstance(self.policy, partial):
                test_policy = self.policy.func
            else:
                test_policy = self.policy
            assert issubclass(test_policy, DQNPolicy), "Error: the input policy for the DQN model must be " \
                                                       "an instance of DQNPolicy."

            self.graph = tf.Graph()
            with self.graph.as_default():
                self.set_random_seed(self.seed)
                self.sess = tf_util.make_session(num_cpu=self.n_cpu_tf_sess, graph=self.graph)

                if self.use_rmsprop:
                    optimizer = tf.train.RMSPropOptimizer(
                        learning_rate=self.learning_rate, decay=self.rmsprop_alpha, epsilon=self.rmsprop_epsilon,
                        centered=True
                    )
                else:
                    optimizer = tf.train.AdamOptimizer(learning_rate=self.learning_rate)

                self.act, self._train_step, self.update_target, self.step_model = build_train(
                    q_func=partial(self.policy, **self.policy_kwargs),
                    ob_space=self.observation_space,
                    ac_space=self.action_space,
                    optimizer=optimizer,
                    gamma=self.gamma,
                    grad_norm_clipping=10,
                    param_noise=self.param_noise,
                    sess=self.sess,
                    full_tensorboard_log=self.full_tensorboard_log,
                    double_q=self.double_q
                )
                self.proba_step = self.step_model.proba_step
                self.params = tf_util.get_trainable_vars("deepq")

                # Initialize the parameters and copy them to the target network.
                tf_util.initialize(self.sess)
                self.update_target(sess=self.sess)

                self.summary = tf.summary.merge_all()

    def learn(self, total_timesteps, callback=None, log_interval=100, tb_log_name="DQN",
              reset_num_timesteps=True, replay_wrapper=None):

        new_tb_log = self._init_num_timesteps(reset_num_timesteps)

        with SetVerbosity(self.verbose), TensorboardWriter(self.graph, self.tensorboard_log, tb_log_name, new_tb_log) \
                as writer:
            self._setup_learn()

            # Create the replay buffer
            if self.prioritized_replay:
                self.replay_buffer = PrioritizedReplayBuffer(self.buffer_size, alpha=self.prioritized_replay_alpha)
                if self.prioritized_replay_beta_iters is None:
                    prioritized_replay_beta_iters = total_timesteps
                else:
                    prioritized_replay_beta_iters = self.prioritized_replay_beta_iters
                self.beta_schedule = LinearSchedule(prioritized_replay_beta_iters,
                                                    initial_p=self.prioritized_replay_beta0,
                                                    final_p=1.0)
            else:
                self.replay_buffer = ReplayBuffer(self.buffer_size)
                self.beta_schedule = None

            if replay_wrapper is not None:
                assert not self.prioritized_replay, "Prioritized replay buffer is not supported by HER"
                self.replay_buffer = replay_wrapper(self.replay_buffer)

            # Create the schedule for exploration starting from 1.
            self.exploration = LinearSchedule(schedule_timesteps=int(self.exploration_fraction * total_timesteps),
                                              initial_p=self.exploration_initial_eps,
                                              final_p=self.exploration_final_eps)

            episode_rewards = [0.0]
            episode_successes = []
            obs = self.env.reset()
            reset = True
            self.episode_reward = np.zeros((1,))

            for _ in range(total_timesteps):
                if callback is not None:
                    # Only stop training if return value is False, not when it is None. This is for backwards
                    # compatibility with callbacks that have no return statement.
                    if callback(locals(), globals()) is False:
                        break
                # Take action and update exploration to the newest value
                kwargs = {}
                if not self.param_noise:
                    update_eps = self.exploration.value(self.num_timesteps - self.exploration_offset)
                    update_param_noise_threshold = 0.
                else:
                    update_eps = 0.
                    # Compute the threshold such that the KL divergence between perturbed and non-perturbed
                    # policy is comparable to eps-greedy exploration with eps = exploration.value(t).
                    # See Appendix C.1 in Parameter Space Noise for Exploration, Plappert et al., 2017
                    # for detailed explanation.
                    update_param_noise_threshold = \
                        -np.log(1. - self.exploration.value(self.num_timesteps - self.exploration_offset) +
                                self.exploration.value(self.num_timesteps - self.exploration_offset) / float(self.env.action_space.n))
                    kwargs['reset'] = reset
                    kwargs['update_param_noise_threshold'] = update_param_noise_threshold
                    kwargs['update_param_noise_scale'] = True
                with self.sess.as_default():
                    action = self.act(np.array(obs)[None], update_eps=update_eps, **kwargs)[0]
                env_action = action
                reset = False
                new_obs, rew, done, info = self.env.step(env_action)
                # Store transition in the replay buffer.
                self.replay_buffer.add(obs, action, rew, new_obs, float(done))
                obs = new_obs

                if writer is not None:
                    ep_rew = np.array([rew]).reshape((1, -1))
                    ep_done = np.array([done]).reshape((1, -1))
                    self.episode_reward = total_episode_reward_logger(self.episode_reward, ep_rew, ep_done, writer,
                                                                      self.num_timesteps)

                episode_rewards[-1] += rew
                if done:
                    maybe_is_success = info.get('is_success')
                    if maybe_is_success is not None:
                        episode_successes.append(float(maybe_is_success))
                    if not isinstance(self.env, VecEnv):
                        obs = self.env.reset()
                    episode_rewards.append(0.0)
                    reset = True

                # Do not train if the warmup phase is not over
                # or if there are not enough samples in the replay buffer
                can_sample = self.replay_buffer.can_sample(self.batch_size)
                if can_sample and self.num_timesteps > self.learning_starts \
                        and self.num_timesteps % self.train_freq == 0:
                    # Minimize the error in Bellman's equation on a batch sampled from replay buffer.
                    # pytype:disable=bad-unpacking
                    if self.prioritized_replay:
                        assert self.beta_schedule is not None, \
                               "BUG: should be LinearSchedule when self.prioritized_replay True"
                        experience = self.replay_buffer.sample(self.batch_size,
                                                               beta=self.beta_schedule.value(self.num_timesteps))
                        (obses_t, actions, rewards, obses_tp1, dones, weights, batch_idxes) = experience
                    else:
                        obses_t, actions, rewards, obses_tp1, dones = self.replay_buffer.sample(self.batch_size)
                        weights, batch_idxes = np.ones_like(rewards), None
                    # pytype:enable=bad-unpacking

                    if writer is not None:
                        # run loss backprop with summary, but once every 100 steps save the metadata
                        # (memory, compute time, ...)
                        if (1 + self.num_timesteps) % 100 == 0:
                            run_options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE)
                            run_metadata = tf.RunMetadata()
                            summary, td_errors = self._train_step(obses_t, actions, rewards, obses_tp1, obses_tp1,
                                                                  dones, weights, sess=self.sess, options=run_options,
                                                                  run_metadata=run_metadata)
                            writer.add_run_metadata(run_metadata, 'step%d' % self.num_timesteps)
                        else:
                            summary, td_errors = self._train_step(obses_t, actions, rewards, obses_tp1, obses_tp1,
                                                                  dones, weights, sess=self.sess)
                        writer.add_summary(summary, self.num_timesteps)
                    else:
                        _, td_errors = self._train_step(obses_t, actions, rewards, obses_tp1, obses_tp1, dones, weights,
                                                        sess=self.sess)

                    if self.prioritized_replay:
                        new_priorities = np.abs(td_errors) + self.prioritized_replay_eps
                        assert isinstance(self.replay_buffer, PrioritizedReplayBuffer)
                        self.replay_buffer.update_priorities(batch_idxes, new_priorities)

                if can_sample and self.num_timesteps > self.learning_starts and \
                        self.num_timesteps % self.target_network_update_freq == 0:
                    # Update target network periodically.
                    self.update_target(sess=self.sess)

                if len(episode_rewards[-101:-1]) == 0:
                    mean_100ep_reward = -np.inf
                else:
                    mean_100ep_reward = round(float(np.mean(episode_rewards[-101:-1])), 1)

                num_episodes = len(episode_rewards)
                if self.verbose >= 1 and done and log_interval is not None and len(episode_rewards) % log_interval == 0:
                    logger.record_tabular("steps", self.num_timesteps)
                    logger.record_tabular("episodes", num_episodes)
                    if len(episode_successes) > 0:
                        logger.logkv("success rate", np.mean(episode_successes[-100:]))
                    logger.record_tabular("mean 100 episode reward", mean_100ep_reward)
                    logger.record_tabular("% time spent exploring",
                                          int(100 * self.exploration.value(self.num_timesteps - self.exploration_offset)))
                    logger.dump_tabular()

                self.num_timesteps += 1

        return episode_rewards

    def predict(self, observation, state=None, mask=None, deterministic=True):
        observation = np.array(observation)
        vectorized_env = self._is_vectorized_observation(observation, self.observation_space)

        observation = observation.reshape((-1,) + self.observation_space.shape)
        with self.sess.as_default():
            actions, _, _ = self.step_model.step(observation, deterministic=deterministic)

        if not vectorized_env:
            actions = actions[0]

        return actions, None

    def action_probability(self, observation, state=None, mask=None, actions=None, logp=False):
        observation = np.array(observation)
        vectorized_env = self._is_vectorized_observation(observation, self.observation_space)

        observation = observation.reshape((-1,) + self.observation_space.shape)
        actions_proba = self.proba_step(observation, state, mask)

        if actions is not None:  # comparing the action distribution to the given actions
            actions = np.array([actions])
            assert isinstance(self.action_space, gym.spaces.Discrete)
            actions = actions.reshape((-1,))
            assert observation.shape[0] == actions.shape[0], "Error: batch sizes differ for actions and observations."
            actions_proba = actions_proba[np.arange(actions.shape[0]), actions]
            # normalize action proba shape
            actions_proba = actions_proba.reshape((-1, 1))
            if logp:
                actions_proba = np.log(actions_proba)

        if not vectorized_env:
            if state is not None:
                raise ValueError("Error: The environment must be vectorized when using recurrent policies.")
            actions_proba = actions_proba[0]

        return actions_proba

    def get_parameter_list(self):
        return self.params

    def save(self, save_path, cloudpickle=False):
        # params
        data = {
            "double_q": self.double_q,
            "param_noise": self.param_noise,
            "learning_starts": self.learning_starts,
            "train_freq": self.train_freq,
            "prioritized_replay": self.prioritized_replay,
            "prioritized_replay_eps": self.prioritized_replay_eps,
            "batch_size": self.batch_size,
            "target_network_update_freq": self.target_network_update_freq,
            "prioritized_replay_alpha": self.prioritized_replay_alpha,
            "prioritized_replay_beta0": self.prioritized_replay_beta0,
            "prioritized_replay_beta_iters": self.prioritized_replay_beta_iters,
            "exploration_final_eps": self.exploration_final_eps,
            "exploration_fraction": self.exploration_fraction,
            "learning_rate": self.learning_rate,
            "gamma": self.gamma,
            "verbose": self.verbose,
            "observation_space": self.observation_space,
            "action_space": self.action_space,
            "policy": self.policy,
            "n_envs": self.n_envs,
            "n_cpu_tf_sess": self.n_cpu_tf_sess,
            "seed": self.seed,
            "_vectorize_action": self._vectorize_action,
            "policy_kwargs": self.policy_kwargs
        }

        params_to_save = self.get_parameters()

        self._save_to_file(save_path, data=data, params=params_to_save, cloudpickle=cloudpickle)
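
Usage sketch (not part of the source above): this fork additionally exposes Double-Q learning, prioritized experience replay and explicit seeding, so a hedged configuration example could look like the following; the import path and environment id are again assumptions.

from stable_baselines import DQN  # assumed import path for the ondrejba/stable-baselines fork

# Prioritized replay with the documented defaults; a fixed seed together with
# n_cpu_tf_sess=1 is what the docstring recommends for deterministic results.
model = DQN("MlpPolicy", "CartPole-v1",
            double_q=True,
            prioritized_replay=True,
            prioritized_replay_alpha=0.6,
            prioritized_replay_beta0=0.4,
            seed=0,
            n_cpu_tf_sess=1,
            verbose=1)

# In this variant learn() returns the list of per-episode rewards
episode_rewards = model.learn(total_timesteps=50000)
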
Code Example #3
class DqnAtml(DQN):
    def setup_model(self):

        with SetVerbosity(self.verbose):
            assert not isinstance(self.action_space, gym.spaces.Box), \
                "Error: DQN cannot output a gym.spaces.Box action space."

            # If the policy is wrapped in functools.partial (e.g. to disable dueling),
            # unwrap it to check the class type
            if isinstance(self.policy, partial):
                test_policy = self.policy.func
            else:
                test_policy = self.policy
            assert issubclass(test_policy, DQNPolicy), "Error: the input policy for the DQN model must be " \
                                                       "an instance of DQNPolicy."

            self.graph = tf.Graph()
            with self.graph.as_default():
                self.sess = tf_util.make_session(graph=self.graph)

                optimizer = tf.train.AdamOptimizer(
                    learning_rate=self.learning_rate)

                self.act, self._train_step, self.update_target, self.step_model = build_train_atml(
                    q_func=partial(self.policy, **self.policy_kwargs),
                    ob_space=self.observation_space,
                    ac_space=self.action_space,
                    optimizer=optimizer,
                    gamma=self.gamma,
                    grad_norm_clipping=10,
                    param_noise=self.param_noise,
                    sess=self.sess,
                    full_tensorboard_log=self.full_tensorboard_log)
                self.proba_step = self.step_model.proba_step
                self.params = tf_util.get_trainable_vars("deepq")

                # Initialize the parameters and copy them to the target network.
                tf_util.initialize(self.sess)
                self.update_target(sess=self.sess)

                self.summary = tf.summary.merge_all()

    def get_actions_vec(self, actions_prims, actions_inputs, actions_mf):
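        # Embed the primitive-action ids with the step model's embedding matrix,
        # appending a row of -1s as a stand-in for invalid/padded actions, then
        # concatenate the embeddings with the action-input and meta-feature blocks
        # and flatten the result into a single vector.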
        with self.sess.as_default():
            self.embedd_matrix = self.step_model.embedding.get_weights()
        invalid_action = np.zeros(self.embedd_matrix[0].shape[1]) - 1
        self.embedd_matrix = np.vstack([self.embedd_matrix[0], invalid_action])

        embedded_steps = self.embedd_matrix[actions_prims.astype(int)]
        actions_inputs = actions_inputs.reshape(len(actions_prims), -1)
        actions_mf = actions_mf.reshape(len(actions_prims), -1)

        concat_actions = np.concatenate(
            (embedded_steps, actions_inputs, actions_mf), axis=1)
        flatten_act = concat_actions.reshape(-1)

        return flatten_act

    def process_state_vec(self, obs, state_info):
        # transform actions representation with embeddings
        with self.sess.as_default():
            self.embedd_matrix = self.step_model.embedding.get_weights()
        ind1 = state_info['grid_prims_size']
        ind2 = ind1 + state_info['relations_size']
        ind3 = ind2 + state_info['ff_state_size']
        ind4 = ind3 + state_info['action_prims']
        ind5 = ind4 + state_info['action_inputs']
        ind6 = ind5 + state_info['action_mf']
        cells_num = state_info['cells_num']

        actions_prims = obs[ind3:ind4]
        actions_inputs = obs[ind4:ind5]
        actions_mf = obs[ind5:]
        flatten_act = self.get_actions_vec(actions_prims, actions_inputs,
                                           actions_mf)
        final_obs = np.concatenate((obs[:ind3], flatten_act))

        return final_obs

    def hierarchical_step(self, obs, ds_rewards, cnt, kwargs, update_eps):
        register = False
        while not register:
            with self.sess.as_default():
                action = self.predict(np.array(obs)[None])[0][0]
            env_action = action
            reset = False
            new_obs, rew, done, info = self.env.step(env_action)
            level = info.get('hier_level')
            register = info.get('register')

            self.actions_container.append(env_action)
            self.actions_weights.append(level)

            if rew < 0 or register:
                with self.sess.as_default():
                    action = self.act(np.array(obs)[None],
                                      update_eps=update_eps,
                                      **kwargs)[0]
                if rew < 0 and not register:
                    self.actions_container = self.actions_container[:-1]
                    self.actions_weights = self.actions_weights[:-1]
                    rep_action = np.zeros(self.action_space.n)
                    rep_action[action] = 1.0

                if register:
                    if rew > 0:
                        ds_rewards.append([cnt, rew])
                        cnt += 1
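                    # Turn the sequence of primitive actions taken during this
                    # macro-step into a soft replay target: one-hot encode each
                    # action, weight it by its hierarchy level relative to the final
                    # level, and normalize the weighted sum into a distribution.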
                    self.actions_container = np.array(self.actions_container)
                    self.actions_weights = np.array(
                        self.actions_weights) / level
                    b = np.zeros(
                        (len(self.actions_container), self.action_space.n))
                    b[np.arange(len(self.actions_container)),
                      self.actions_container.astype(int)] = 1
                    act_replay = np.sum((self.actions_weights * b.T).T, axis=0)
                    rep_action = act_replay / np.sum(act_replay)
                    self.actions_container = []
                    self.actions_weights = []
                self.replay_buffer.add(obs, rep_action, rew, new_obs,
                                       float(done))
                break
            obs = new_obs
        obs = new_obs
        return obs, new_obs, rew, action, done, reset

    def learn(self,
              total_timesteps,
              callback=None,
              seed=None,
              log_interval=100,
              tb_log_name="DQN",
              reset_num_timesteps=True,
              initial_p=1.0):

        self.actions_weights = []
        self.actions_container = []

        new_tb_log = self._init_num_timesteps(reset_num_timesteps)

        cnt = 0
        ds_rewards = [[0, 0]]
        with SetVerbosity(self.verbose), TensorboardWriter(self.graph, self.tensorboard_log, tb_log_name, new_tb_log) \
                as writer:
            self._setup_learn()

            # Create the replay buffer
            if self.prioritized_replay:
                self.replay_buffer = PrioritizedReplayBuffer(
                    self.buffer_size, alpha=self.prioritized_replay_alpha)
                if self.prioritized_replay_beta_iters is None:
                    prioritized_replay_beta_iters = total_timesteps
                else:
                    prioritized_replay_beta_iters = self.prioritized_replay_beta_iters
                self.beta_schedule = LinearSchedule(
                    prioritized_replay_beta_iters,
                    initial_p=self.prioritized_replay_beta0,
                    final_p=1.0)
            else:
                self.replay_buffer = ReplayBuffer(self.buffer_size)
                self.beta_schedule = None
            # Create the schedule for exploration starting from 1.
            self.exploration = LinearSchedule(
                schedule_timesteps=int(self.exploration_fraction *
                                       total_timesteps),
                initial_p=initial_p,
                final_p=self.exploration_final_eps)

            episode_rewards = [0.0]
            obs = self.env.reset()

            reset = True
            self.episode_reward = np.zeros((1, ))

            for _ in range(total_timesteps):
                if callback is not None:
                    # Only stop training if return value is False, not when it is None. This is for backwards
                    # compatibility with callbacks that have no return statement.
                    if callback(locals(), globals()) is False:
                        break
                # Take action and update exploration to the newest value
                kwargs = {}
                if not self.param_noise:
                    update_eps = self.exploration.value(self.num_timesteps)
                    update_param_noise_threshold = 0.
                else:
                    update_eps = 0.
                    # Compute the threshold such that the KL divergence between perturbed and non-perturbed
                    # policy is comparable to eps-greedy exploration with eps = exploration.value(t).
                    # See Appendix C.1 in Parameter Space Noise for Exploration, Plappert et al., 2017
                    # for detailed explanation.
                    update_param_noise_threshold = \
                        -np.log(1. - self.exploration.value(self.num_timesteps) +
                                self.exploration.value(self.num_timesteps) / float(self.env.action_space.n))
                    kwargs['reset'] = reset
                    kwargs[
                        'update_param_noise_threshold'] = update_param_noise_threshold
                    kwargs['update_param_noise_scale'] = True
                ''' Hierarchical Step (Start) '''

                obs, new_obs, rew, action, done, reset = self.hierarchical_step(
                    obs, ds_rewards, cnt, kwargs, update_eps)
                ''' Hierarchical Step (End) '''

                if writer is not None:
                    ep_rew = np.array([rew]).reshape((1, -1))
                    ep_done = np.array([done]).reshape((1, -1))
                    self.episode_reward = total_episode_reward_logger(
                        self.episode_reward, ep_rew, ep_done, writer,
                        self.num_timesteps)

                episode_rewards[-1] += rew
                if done:
                    if not isinstance(self.env, VecEnv):
                        obs = self.env.reset()
                    episode_rewards.append(0.0)
                    reset = True

                if self.num_timesteps > self.learning_starts and self.num_timesteps % self.train_freq == 0:
                    # Minimize the error in Bellman's equation on a batch sampled from replay buffer.
                    if self.prioritized_replay:
                        experience = self.replay_buffer.sample(
                            self.batch_size,
                            beta=self.beta_schedule.value(self.num_timesteps))
                        (obses_t, actions, rewards, obses_tp1, dones, weights,
                         batch_idxes) = experience
                    else:
                        obses_t, actions, rewards, obses_tp1, dones = self.replay_buffer.sample(
                            self.batch_size)
                        weights, batch_idxes = np.ones_like(rewards), None

                    if writer is not None:
                        # run loss backprop with summary, but once every 100 steps save the metadata
                        # (memory, compute time, ...)
                        if (1 + self.num_timesteps) % 100 == 0:
                            run_options = tf.RunOptions(
                                trace_level=tf.RunOptions.FULL_TRACE)
                            run_metadata = tf.RunMetadata()
                            summary, td_errors = self._train_step(
                                obses_t,
                                actions,
                                rewards,
                                obses_tp1,
                                obses_tp1,
                                dones,
                                weights,
                                sess=self.sess,
                                options=run_options,
                                run_metadata=run_metadata)
                            writer.add_run_metadata(
                                run_metadata, 'step%d' % self.num_timesteps)
                        else:
                            summary, td_errors = self._train_step(
                                obses_t,
                                actions,
                                rewards,
                                obses_tp1,
                                obses_tp1,
                                dones,
                                weights,
                                sess=self.sess)
                        writer.add_summary(summary, self.num_timesteps)
                    else:
                        _, td_errors = self._train_step(obses_t,
                                                        actions,
                                                        rewards,
                                                        obses_tp1,
                                                        obses_tp1,
                                                        dones,
                                                        weights,
                                                        sess=self.sess)

                    if self.prioritized_replay:
                        try:
                            new_priorities = np.array([
                                abs(x) for x in td_errors.tolist()
                            ]) + self.prioritized_replay_eps
                            self.replay_buffer.update_priorities(
                                batch_idxes, new_priorities)
                        except AssertionError:
                            print(td_errors)

                if self.num_timesteps > self.learning_starts and \
                        self.num_timesteps % self.target_network_update_freq == 0:
                    # Update target network periodically.
                    self.update_target(sess=self.sess)

                if len(episode_rewards[-101:-1]) == 0:
                    mean_100ep_reward = -np.inf
                else:
                    mean_100ep_reward = round(
                        float(np.mean(episode_rewards[-101:-1])), 1)

                num_episodes = len(episode_rewards)
                if self.verbose >= 1 and done and log_interval is not None and len(
                        episode_rewards) % log_interval == 0:
                    logger.record_tabular("steps", self.num_timesteps)
                    logger.record_tabular("episodes", num_episodes)
                    logger.record_tabular("mean 100 episode reward",
                                          mean_100ep_reward)
                    logger.record_tabular(
                        "% time spent exploring",
                        int(100 * self.exploration.value(self.num_timesteps)))
                    logger.dump_tabular()

                self.num_timesteps += 1
        return self, ds_rewards
Code Example #4
class MADQN(OffPolicyRLModel):
    """
    The DQN model class.
    DQN paper: https://arxiv.org/abs/1312.5602
    Dueling DQN: https://arxiv.org/abs/1511.06581
    Double-Q Learning: https://arxiv.org/abs/1509.06461
    Prioritized Experience Replay: https://arxiv.org/abs/1511.05952

    :param policy: (DQNPolicy or str) The policy model to use (MlpPolicy, CnnPolicy, LnMlpPolicy, ...)
    :param env: (Gym environment or str) The environment to learn from (if registered in Gym, can be str)
    :param gamma: (float) discount factor
    :param learning_rate: (float) learning rate for adam optimizer
    :param buffer_size: (int) size of the replay buffer
    :param exploration_fraction: (float) fraction of entire training period over which the exploration rate is
            annealed
    :param exploration_final_eps: (float) final value of random action probability
    :param exploration_initial_eps: (float) initial value of random action probability
    :param train_freq: (int) update the model every `train_freq` steps.
    :param batch_size: (int) size of a batch sampled from the replay buffer for training
    :param double_q: (bool) Whether to enable Double-Q learning or not.
    :param learning_starts: (int) how many steps of the model to collect transitions for before learning starts
    :param target_network_update_freq: (int) update the target network every `target_network_update_freq` steps.
    :param prioritized_replay: (bool) if True, a prioritized replay buffer will be used.
    :param prioritized_replay_alpha: (float) alpha parameter for prioritized replay buffer.
        It determines how much prioritization is used, with alpha=0 corresponding to the uniform case.
    :param prioritized_replay_beta0: (float) initial value of beta for prioritized replay buffer
    :param prioritized_replay_beta_iters: (int) number of iterations over which beta will be annealed from initial
            value to 1.0. If set to None, this equals max_timesteps.
    :param prioritized_replay_eps: (float) epsilon to add to the TD errors when updating priorities.
    :param param_noise: (bool) Whether or not to apply noise to the parameters of the policy.
    :param verbose: (int) the verbosity level: 0 none, 1 training information, 2 tensorflow debug
    :param tensorboard_log: (str) the log location for tensorboard (if None, no logging)
    :param _init_setup_model: (bool) Whether or not to build the network at the creation of the instance
    :param full_tensorboard_log: (bool) enable additional logging when using tensorboard
        WARNING: this logging can take a lot of space quickly
    :param seed: (int) Seed for the pseudo-random generators (python, numpy, tensorflow).
        If None (default), use random seed. Note that if you want completely deterministic
        results, you must set `n_cpu_tf_sess` to 1.
    :param n_cpu_tf_sess: (int) The number of threads for TensorFlow operations.
        If None, the number of CPUs of the current machine will be used.
    """
    def __init__(self,
                 policy,
                 env,
                 gamma=0.99,
                 learning_rate=5e-4,
                 buffer_size=50000,
                 exploration_fraction=0.1,
                 exploration_final_eps=0.02,
                 exploration_initial_eps=1.0,
                 train_freq=1,
                 batch_size=32,
                 double_q=True,
                 learning_starts=1000,
                 target_network_update_freq=500,
                 prioritized_replay=False,
                 prioritized_replay_alpha=0.6,
                 prioritized_replay_beta0=0.4,
                 prioritized_replay_beta_iters=None,
                 prioritized_replay_eps=1e-6,
                 param_noise=False,
                 n_cpu_tf_sess=None,
                 verbose=0,
                 tensorboard_log=None,
                 _init_setup_model=True,
                 policy_kwargs=None,
                 full_tensorboard_log=False,
                 seed=None,
                 num_agents=1):  # MA-MOD

        # TODO: replay_buffer refactoring
        super(MADQN, self).__init__(policy=policy,
                                    env=env,
                                    replay_buffer=None,
                                    verbose=verbose,
                                    policy_base=DQNPolicy,
                                    requires_vec_env=False,
                                    policy_kwargs=policy_kwargs,
                                    seed=seed,
                                    n_cpu_tf_sess=n_cpu_tf_sess)
        # print("POLICY TYPE", policy)
        if self.observation_space:
            obs_sp_low = self.observation_space.low[0, :]
            obs_sp_high = self.observation_space.high[0, :]
            self.observation_space = gym.spaces.Box(low=obs_sp_low,
                                                    high=obs_sp_high)

        self.param_noise = param_noise
        self.learning_starts = learning_starts
        self.train_freq = train_freq
        self.prioritized_replay = prioritized_replay
        self.prioritized_replay_eps = prioritized_replay_eps
        self.batch_size = batch_size
        self.target_network_update_freq = target_network_update_freq
        self.prioritized_replay_alpha = prioritized_replay_alpha
        self.prioritized_replay_beta0 = prioritized_replay_beta0
        self.prioritized_replay_beta_iters = prioritized_replay_beta_iters
        self.exploration_final_eps = exploration_final_eps
        self.exploration_initial_eps = exploration_initial_eps
        self.exploration_fraction = exploration_fraction
        self.buffer_size = buffer_size
        self.learning_rate = learning_rate
        self.gamma = gamma
        self.tensorboard_log = tensorboard_log
        self.full_tensorboard_log = full_tensorboard_log
        self.double_q = double_q
        self.num_agents = num_agents

        self.graph = None
        self.sess = None
        self._train_step = []  # MA-MOD
        self.step_model = []  # MA-MOD
        self.update_target = []  # MA-MOD
        self.act = []  # MA-MOD
        self.proba_step = []  # MA-MOD
        self.replay_buffer = None  # TODO: Possibly try a separate replay buffer per agent. If everything is symmetric, one buffer is OK.
        # If the agents share the same value function, one buffer is fine. If they have separate value functions but share one replay buffer, they learn from the same data.
        self.beta_schedule = None
        self.exploration = None
        self.params = None
        self.summary = None

        if _init_setup_model:
            self.setup_model()

    def _get_pretrain_placeholders(self):
        assert False, "MAKE SURE THIS FUNCTION ISN'T CALLED"
        policy = self.step_model
        return policy.obs_ph, tf.placeholder(tf.int32, [None]), policy.q_values

    def setup_model(self):

        with SetVerbosity(self.verbose):
            for i in range(self.num_agents):
                assert not isinstance(self.action_space, gym.spaces.Box), \
                    "Error: DQN cannot output a gym.spaces.Box action space."

            # If the policy is wrapped in functools.partial (e.g. to disable dueling),
            # unwrap it to check the class type
            if isinstance(self.policy, partial):
                test_policy = self.policy.func
            else:
                test_policy = self.policy
            # print(test_policy.type)
            assert issubclass(test_policy, DQNPolicy), \
                "Error: the input policy for the DQN model must be a subclass of DQNPolicy."

            self.graph = tf.Graph()
            with self.graph.as_default():
                self.set_random_seed(self.seed)
                self.sess = tf_util.make_session(num_cpu=self.n_cpu_tf_sess,
                                                 graph=self.graph)
                self.params = []

                print("AC SPC", self.action_space)
                for i in range(self.num_agents):
                    with tf.variable_scope("agent" + str(i)):
                        optimizer = tf.train.AdamOptimizer(
                            learning_rate=self.learning_rate)
                        act, _train_step, update_target, step_model = build_train(
                            q_func=partial(self.policy, **self.policy_kwargs),
                            ob_space=self.observation_space,
                            ac_space=self.action_space,
                            optimizer=optimizer,
                            gamma=self.gamma,
                            grad_norm_clipping=10,
                            param_noise=self.param_noise,
                            sess=self.sess,
                            full_tensorboard_log=False,  # self.full_tensorboard_log
                            double_q=self.double_q)
                        self.act.append(act)
                        self._train_step.append(_train_step)
                        self.step_model.append(step_model)
                        self.proba_step.append(step_model.proba_step)
                        self.update_target.append(update_target)
                        self.params.extend(
                            tf_util.get_trainable_vars("agent" + str(i) +
                                                       "/deepq"))

                print(self.params)

                # Initialize the parameters and copy them to the target network.
                tf_util.initialize(self.sess)  # TODO: copy this file, make two versions of the algorithm.
                for i in range(self.num_agents):
                    # TODO: Not sure, seems like the best thing to do is to try using each agent's own target first.
                    self.update_target[i](sess=self.sess)

                # self.summary = tf.summary.merge_all()

    def learn(self,
              total_timesteps,
              callback=None,
              log_interval=100,
              tb_log_name="DQN",
              reset_num_timesteps=True,
              replay_wrapper=None):

        new_tb_log = self._init_num_timesteps(reset_num_timesteps)
        # callback = self._init_callback(callback)

        # with SetVerbosity(self.verbose), TensorboardWriter(self.graph, self.tensorboard_log, tb_log_name, new_tb_log) \
        #         as writer:
        self._setup_learn()

        # Create the replay buffer
        if self.prioritized_replay:
            self.replay_buffer = PrioritizedReplayBuffer(
                self.buffer_size, alpha=self.prioritized_replay_alpha)
            if self.prioritized_replay_beta_iters is None:
                prioritized_replay_beta_iters = total_timesteps
            else:
                prioritized_replay_beta_iters = self.prioritized_replay_beta_iters
            self.beta_schedule = LinearSchedule(
                prioritized_replay_beta_iters,
                initial_p=self.prioritized_replay_beta0,
                final_p=1.0)
        else:
            self.replay_buffer = ReplayBuffer(self.buffer_size)
            self.beta_schedule = None

        if replay_wrapper is not None:
            assert not self.prioritized_replay, "Prioritized replay buffer is not supported by HER"
            self.replay_buffer = replay_wrapper(self.replay_buffer)

        # Create the schedule for exploration starting from 1.
        self.exploration = LinearSchedule(
            schedule_timesteps=int(self.exploration_fraction *
                                   total_timesteps),
            initial_p=self.exploration_initial_eps,
            final_p=self.exploration_final_eps)

        episode_rewards = [[0.0] * self.num_agents]  #MA-MOD
        episode_successes = []

        #callback.on_training_start(locals(), globals())
        #callback.on_rollout_start()

        reset = True
        obs = self.env.reset()

        for _ in range(total_timesteps):
            # Take action and update exploration to the newest value
            kwargs = {}
            if not self.param_noise:
                update_eps = self.exploration.value(self.num_timesteps)
                update_param_noise_threshold = 0.
            else:
                update_eps = 0.
                # Compute the threshold such that the KL divergence between perturbed and non-perturbed
                # policy is comparable to eps-greedy exploration with eps = exploration.value(t).
                # See Appendix C.1 in Parameter Space Noise for Exploration, Plappert et al., 2017
                # for detailed explanation.
                update_param_noise_threshold = \
                    -np.log(1. - self.exploration.value(self.num_timesteps) +
                            self.exploration.value(self.num_timesteps) / float(self.env.action_space.n))
                kwargs['reset'] = reset
                kwargs['update_param_noise_threshold'] = update_param_noise_threshold
                kwargs['update_param_noise_scale'] = True

            with self.sess.as_default():
                env_action = []  # MA-MOD
                for i in range(self.num_agents):  # MA-MOD. This is fine for one policy.
                    # TODO: Is this the correct way to get the correct agent obs?
                    action = self.act[i](
                        np.array(obs[i])[None], update_eps=update_eps, **kwargs)[0]
                    env_action.append(action)
            reset = False
            # NOUPDATE - env.step should take a vector of actions
            new_obs, rew, done, info = self.env.step(env_action)
            # Observation layout in this multi-agent setup:
            #   agent 1 observes (x_me, x_opp) = (x_1, x_2)
            #   agent 2 observes (x_me, x_opp) = (x_2, x_1)
            #   the env state itself has shape (n_agents, state_dim)
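            # For example (hypothetical shapes): with num_agents = 2 and a
            # per-agent state of dimension 4, obs has shape (2, 4), obs[i] is
            # agent i's observation, and env_action is a list with one
            # discrete action per agent.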

            self.num_timesteps += 1

            # Stop training if return value is False
            # if callback.on_step() is False:
            #    break

            # Store transition in the replay buffer.
            # Loop for replay buffer -- either separate or joined. obs[agent_index], action[agent_index], reward[agent_index]
            # Joey: Does this look right to you?
            # print(obs, action, rew, new_obs, done)
            #print("obs",obs[0])
            #print(action)
            #print("ac", action[0])
            #print("rew", rew[0])
            #print("done", done[0])
            for num_agent in range(self.num_agents):
                self.replay_buffer.add(obs[num_agent], env_action[num_agent],
                                       rew[num_agent], new_obs[num_agent],
                                       float(done[num_agent]))
            obs = new_obs

            # if writer is not None:
            #     ep_rew = np.array([rew]).reshape((1, -1))
            #     ep_done = np.array([done]).reshape((1, -1))
            #     tf_util.total_episode_reward_logger(self.episode_reward, ep_rew, ep_done, writer,
            #                                         self.num_timesteps)

            # TODO: current episode_rewards is a list, make it a list of lists where each list is the reward for each agent in all timesteps
            #     append the newest reward to the end of each list for each agent
            for num_agent in range(self.num_agents):  # MA-MOD
                episode_rewards[-1][num_agent] += rew[num_agent]
            # Close the episode once, after all agents' rewards have been accumulated,
            # so the new per-agent reward list is only appended a single time.
            if done.any():
                maybe_is_success = info.get('is_success')
                if maybe_is_success is not None:
                    episode_successes.append(float(maybe_is_success))
                if not isinstance(self.env, VecEnv):
                    obs = self.env.reset()
                episode_rewards.append([0.0] * self.num_agents)
                reset = True

            # Do not train if the warmup phase is not over
            # or if there are not enough samples in the replay buffer
            can_sample = self.replay_buffer.can_sample(self.batch_size)
            if can_sample and self.num_timesteps > self.learning_starts \
                    and self.num_timesteps % self.train_freq == 0:

                # callback.on_rollout_end()

                for i in range(self.num_agents):  # MA-MOD
                    # Minimize the error in Bellman's equation on a batch sampled from replay buffer.
                    # pytype:disable=bad-unpacking
                    if self.prioritized_replay:
                        assert self.beta_schedule is not None, \
                                "BUG: should be LinearSchedule when self.prioritized_replay True"
                        experience = self.replay_buffer.sample(
                            self.batch_size,
                            beta=self.beta_schedule.value(self.num_timesteps))
                        (obses_t, actions, rewards, obses_tp1, dones, weights,
                         batch_idxes) = experience
                    else:
                        obses_t, actions, rewards, obses_tp1, dones = self.replay_buffer.sample(
                            self.batch_size)
                        weights, batch_idxes = np.ones_like(rewards), None
                    # pytype:enable=bad-unpacking

                    # if writer is not None:
                    #     # run loss backprop with summary, but once every 100 steps save the metadata
                    #     # (memory, compute time, ...)
                    #     if (1 + self.num_timesteps) % 100 == 0:
                    #         run_options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE)
                    #         run_metadata = tf.RunMetadata()
                    #         summary, td_errors = self._train_step[i](obses_t, actions, rewards, obses_tp1, obses_tp1,
                    #                                               dones, weights, sess=self.sess, options=run_options,
                    #                                               run_metadata=run_metadata)
                    #         writer.add_run_metadata(run_metadata, 'step%d_agent%d' % (self.num_timesteps, i))
                    #     else:
                    #         summary, td_errors = self._train_step[i](obses_t, actions, rewards, obses_tp1, obses_tp1,
                    #                                               dones, weights, sess=self.sess)
                    #     writer.add_summary(summary, self.num_timesteps)
                    # else:
                    td_errors = self._train_step[i](obses_t,
                                                    actions,
                                                    rewards,
                                                    obses_tp1,
                                                    obses_tp1,
                                                    dones,
                                                    weights,
                                                    sess=self.sess)

                if self.prioritized_replay:  # NOUPDATE - not inside main agent for loop
                    new_priorities = np.abs(
                        td_errors) + self.prioritized_replay_eps  # NOUPDATE
                    assert isinstance(self.replay_buffer,
                                      PrioritizedReplayBuffer)
                    self.replay_buffer.update_priorities(
                        batch_idxes, new_priorities)

                # callback.on_rollout_start()

            if can_sample and self.num_timesteps > self.learning_starts and \
                    self.num_timesteps % self.target_network_update_freq == 0:
                # Update target network periodically.
                for i in range(self.num_agents):
                    self.update_target[i](sess=self.sess)  # MA-MOD

            if len(episode_rewards[-101:-1]) == 0:  # MA-MOD
                mean_100ep_reward = -np.inf
            else:
                mean_100ep_reward = round(
                    float(np.mean(episode_rewards[-101:-1])), 1)  #MA-MOD

            # below is what's logged in terminal.
            num_episodes = len(episode_rewards)  #MA-MOD
            if self.verbose >= 1 and done.any() and log_interval is not None \
                    and len(episode_rewards) % log_interval == 0:  # MA-MOD
                logger.record_tabular("steps", self.num_timesteps)
                logger.record_tabular("episodes", num_episodes)
                if len(episode_successes) > 0:
                    logger.logkv("success rate",
                                 np.mean(episode_successes[-100:]))
                logger.record_tabular("mean 100 episode reward",
                                      mean_100ep_reward)
                logger.record_tabular(
                    "% time spent exploring",
                    int(100 * self.exploration.value(self.num_timesteps)))
                logger.dump_tabular()

        return self

    def predict(
            self,
            observation,
            agent_idx,
            state=None,
            mask=None,
            deterministic=True):  # MA-MOD - added `agent_idx` as a parameter
        observation = np.array(observation)
        vectorized_env = self._is_vectorized_observation(
            observation, self.observation_space)

        observation = observation.reshape((-1, ) +
                                          self.observation_space.shape)
        with self.sess.as_default():
            actions, _, _ = self.step_model[agent_idx].step(
                observation, deterministic=deterministic)

        if not vectorized_env:
            actions = actions[0]

        return actions, None

    # No one ever calls this, so we don't need it?
    def action_probability(self,
                           observation,
                           state=None,
                           mask=None,
                           actions=None,
                           logp=False):
        print("Should not be called")
        return None
        '''
        observation = np.array(observation)
        vectorized_env = self._is_vectorized_observation(observation, self.observation_space)

        observation = observation.reshape((-1,) + self.observation_space.shape)
        actions_proba = self.proba_step(observation, state, mask)

        if actions is not None:  # comparing the action distribution, to given actions
            actions = np.array([actions])
            assert isinstance(self.action_space, gym.spaces.Discrete)
            actions = actions.reshape((-1,))
            assert observation.shape[0] == actions.shape[0], "Error: batch sizes differ for actions and observations."
            actions_proba = actions_proba[np.arange(actions.shape[0]), actions]
            # normalize action proba shape
            actions_proba = actions_proba.reshape((-1, 1))
            if logp:
                actions_proba = np.log(actions_proba)

        if not vectorized_env:
            if state is not None:
                raise ValueError("Error: The environment must be vectorized when using recurrent policies.")
            actions_proba = actions_proba[0]

        return actions_proba
        '''

    def get_parameter_list(self):
        print(self.params)
        return self.params

    def save(self, save_path, cloudpickle=False):
        # params
        data = {
            "double_q": self.double_q,
            "param_noise": self.param_noise,
            "learning_starts": self.learning_starts,
            "train_freq": self.train_freq,
            "prioritized_replay": self.prioritized_replay,
            "prioritized_replay_eps": self.prioritized_replay_eps,
            "batch_size": self.batch_size,
            "target_network_update_freq": self.target_network_update_freq,
            "prioritized_replay_alpha": self.prioritized_replay_alpha,
            "prioritized_replay_beta0": self.prioritized_replay_beta0,
            "prioritized_replay_beta_iters":
            self.prioritized_replay_beta_iters,
            "exploration_final_eps": self.exploration_final_eps,
            "exploration_fraction": self.exploration_fraction,
            "learning_rate": self.learning_rate,
            "gamma": self.gamma,
            "verbose": self.verbose,
            "observation_space": self.observation_space,
            "action_space": self.action_space,
            "policy": self.policy,
            "n_envs": self.n_envs,
            "n_cpu_tf_sess": self.n_cpu_tf_sess,
            "seed": self.seed,
            "_vectorize_action": self._vectorize_action,
            "policy_kwargs": self.policy_kwargs,
            "num_agents": self.num_agents
        }

        params_to_save = self.get_parameters()
        # print(params_to_save)

        self._save_to_file(save_path,
                           data=data,
                           params=params_to_save,
                           cloudpickle=cloudpickle)
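
A minimal usage sketch for the MADQN class above (not part of the original snippet). The policy import is the standard stable-baselines one; `MyMultiAgentEnv` is a hypothetical environment that returns per-agent observations, rewards and dones, as the training loop above assumes.

from stable_baselines.deepq.policies import MlpPolicy

env = MyMultiAgentEnv()  # hypothetical multi-agent environment (assumption)
model = MADQN(MlpPolicy, env, num_agents=2, verbose=1)
model.learn(total_timesteps=100000)

obs = env.reset()
# predict() takes an extra agent_idx argument in this multi-agent variant
action_0, _ = model.predict(obs[0], agent_idx=0)
action_1, _ = model.predict(obs[1], agent_idx=1)
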
Code example #5
class DQNUniqueActions(DQN):
    """Subclass with a modified "learn" method which only allows actions
    to be taken once per episode.
    """
    def learn(self,
              total_timesteps,
              callback=None,
              log_interval=100,
              tb_log_name="DQN",
              reset_num_timesteps=True,
              replay_wrapper=None):

        new_tb_log = self._init_num_timesteps(reset_num_timesteps)

        with SetVerbosity(self.verbose), TensorboardWriter(self.graph, self.tensorboard_log, tb_log_name, new_tb_log) \
                as writer:
            self._setup_learn()

            # Create the replay buffer
            if self.prioritized_replay:
                self.replay_buffer = PrioritizedReplayBuffer(
                    self.buffer_size, alpha=self.prioritized_replay_alpha)
                if self.prioritized_replay_beta_iters is None:
                    prioritized_replay_beta_iters = total_timesteps
                else:
                    prioritized_replay_beta_iters = self.prioritized_replay_beta_iters
                self.beta_schedule = LinearSchedule(
                    prioritized_replay_beta_iters,
                    initial_p=self.prioritized_replay_beta0,
                    final_p=1.0)
            else:
                self.replay_buffer = ReplayBuffer(self.buffer_size)
                self.beta_schedule = None

            if replay_wrapper is not None:
                assert not self.prioritized_replay, "Prioritized replay buffer is not supported by HER"
                self.replay_buffer = replay_wrapper(self.replay_buffer)

            # Create the schedule for exploration starting from 1.
            self.exploration = LinearSchedule(
                schedule_timesteps=int(self.exploration_fraction *
                                       total_timesteps),
                initial_p=self.exploration_initial_eps,
                final_p=self.exploration_final_eps)

            episode_rewards = [0.0]
            episode_successes = []
            obs = self.env.reset()
            reset = True

            ############################################################
            # MODIFICATION:
            # Track list of actions taken each episode. This is
            # intentionally not a set so that we can use np.isin.
            action_list = list()
            ############################################################

            for _ in range(total_timesteps):
                if callback is not None:
                    # Only stop training if return value is False, not when it is None. This is for backwards
                    # compatibility with callbacks that have no return statement.
                    if callback(locals(), globals()) is False:
                        break
                # Take action and update exploration to the newest value
                kwargs = {}
                if not self.param_noise:
                    update_eps = self.exploration.value(self.num_timesteps)
                    update_param_noise_threshold = 0.
                else:
                    update_eps = 0.
                    # Compute the threshold such that the KL divergence between perturbed and non-perturbed
                    # policy is comparable to eps-greedy exploration with eps = exploration.value(t).
                    # See Appendix C.1 in Parameter Space Noise for Exploration, Plappert et al., 2017
                    # for detailed explanation.
                    update_param_noise_threshold = \
                        -np.log(1. - self.exploration.value(self.num_timesteps) +
                                self.exploration.value(self.num_timesteps) / float(self.env.action_space.n))
                    kwargs['reset'] = reset
                    kwargs['update_param_noise_threshold'] = update_param_noise_threshold
                    kwargs['update_param_noise_scale'] = True
                with self.sess.as_default():
                    ####################################################
                    # MODIFICATION:
                    # Rename variable from original, since it's now
                    # going to come back as an array due to the
                    # modified build_act function being used to
                    # construct everything.
                    action_arr = self.act(np.array(obs)[None],
                                          update_eps=update_eps,
                                          **kwargs)[0]
                    ####################################################
                    # ORIGINAL:
                    # action = self.act(np.array(obs)[None], update_eps=update_eps, **kwargs)[0]

                ########################################################
                # MODIFICATION:
                # Get the best action that has not yet been taken this
                # episode.
                action = \
                    action_arr[np.argmin(np.isin(action_arr, action_list))]
                # Add this action to the list.
                action_list.append(action)
                ########################################################
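                # For example (hypothetical values): if action_arr is [4, 2, 7]
                # and action_list is [4], np.isin(action_arr, action_list) is
                # [True, False, False], so np.argmin picks index 1 and action 2
                # is taken, i.e. the highest-ranked action not yet used this
                # episode.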

                env_action = action
                reset = False
                new_obs, rew, done, info = self.env.step(env_action)
                # Store transition in the replay buffer.
                self.replay_buffer.add(obs, action, rew, new_obs, float(done))
                obs = new_obs

                if writer is not None:
                    ep_rew = np.array([rew]).reshape((1, -1))
                    ep_done = np.array([done]).reshape((1, -1))
                    total_episode_reward_logger(self.episode_reward, ep_rew,
                                                ep_done, writer,
                                                self.num_timesteps)

                episode_rewards[-1] += rew
                if done:
                    ####################################################
                    # MODIFICATION:
                    # Clear the list.
                    action_list.clear()
                    ####################################################
                    maybe_is_success = info.get('is_success')
                    if maybe_is_success is not None:
                        episode_successes.append(float(maybe_is_success))
                    if not isinstance(self.env, VecEnv):
                        obs = self.env.reset()
                    episode_rewards.append(0.0)
                    reset = True

                # Do not train if the warmup phase is not over
                # or if there are not enough samples in the replay buffer
                can_sample = self.replay_buffer.can_sample(self.batch_size)
                if can_sample and self.num_timesteps > self.learning_starts \
                        and self.num_timesteps % self.train_freq == 0:
                    # Minimize the error in Bellman's equation on a batch sampled from replay buffer.
                    # pytype:disable=bad-unpacking
                    if self.prioritized_replay:
                        assert self.beta_schedule is not None, \
                               "BUG: should be LinearSchedule when self.prioritized_replay True"
                        experience = self.replay_buffer.sample(
                            self.batch_size,
                            beta=self.beta_schedule.value(self.num_timesteps))
                        (obses_t, actions, rewards, obses_tp1, dones, weights,
                         batch_idxes) = experience
                    else:
                        obses_t, actions, rewards, obses_tp1, dones = self.replay_buffer.sample(
                            self.batch_size)
                        weights, batch_idxes = np.ones_like(rewards), None
                    # pytype:enable=bad-unpacking

                    if writer is not None:
                        # run loss backprop with summary, but once every 100 steps save the metadata
                        # (memory, compute time, ...)
                        if (1 + self.num_timesteps) % 100 == 0:
                            run_options = tf.RunOptions(
                                trace_level=tf.RunOptions.FULL_TRACE)
                            run_metadata = tf.RunMetadata()
                            summary, td_errors = self._train_step(
                                obses_t,
                                actions,
                                rewards,
                                obses_tp1,
                                obses_tp1,
                                dones,
                                weights,
                                sess=self.sess,
                                options=run_options,
                                run_metadata=run_metadata)
                            writer.add_run_metadata(
                                run_metadata, 'step%d' % self.num_timesteps)
                        else:
                            summary, td_errors = self._train_step(
                                obses_t,
                                actions,
                                rewards,
                                obses_tp1,
                                obses_tp1,
                                dones,
                                weights,
                                sess=self.sess)
                        writer.add_summary(summary, self.num_timesteps)
                    else:
                        _, td_errors = self._train_step(obses_t,
                                                        actions,
                                                        rewards,
                                                        obses_tp1,
                                                        obses_tp1,
                                                        dones,
                                                        weights,
                                                        sess=self.sess)

                    if self.prioritized_replay:
                        new_priorities = np.abs(
                            td_errors) + self.prioritized_replay_eps
                        assert isinstance(self.replay_buffer,
                                          PrioritizedReplayBuffer)
                        self.replay_buffer.update_priorities(
                            batch_idxes, new_priorities)

                if can_sample and self.num_timesteps > self.learning_starts and \
                        self.num_timesteps % self.target_network_update_freq == 0:
                    # Update target network periodically.
                    self.update_target(sess=self.sess)

                if len(episode_rewards[-101:-1]) == 0:
                    mean_100ep_reward = -np.inf
                else:
                    mean_100ep_reward = round(
                        float(np.mean(episode_rewards[-101:-1])), 1)

                num_episodes = len(episode_rewards)
                if self.verbose >= 1 and done and log_interval is not None and len(
                        episode_rewards) % log_interval == 0:
                    logger.record_tabular("steps", self.num_timesteps)
                    logger.record_tabular("episodes", num_episodes)
                    if len(episode_successes) > 0:
                        logger.logkv("success rate",
                                     np.mean(episode_successes[-100:]))
                    logger.record_tabular("mean 100 episode reward",
                                          mean_100ep_reward)
                    logger.record_tabular(
                        "% time spent exploring",
                        int(100 * self.exploration.value(self.num_timesteps)))
                    logger.dump_tabular()

                self.num_timesteps += 1

        return self
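
A hedged usage sketch for DQNUniqueActions (not from the original source). It is constructed like a plain DQN; the environment and policy here are placeholders, and the model's build_act function is assumed to have been modified to return a ranking of actions, as the comments above describe.

from stable_baselines.deepq.policies import MlpPolicy

# `MyPermutationEnv` is a hypothetical environment where repeating an action
# within an episode makes no sense (e.g. choosing each item exactly once).
model = DQNUniqueActions(MlpPolicy, MyPermutationEnv(), verbose=1)
model.learn(total_timesteps=50000)
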
Code example #6
File: DDQNatml.py  Project: abirbhy/AutoML
class DqnAtml(DQN):
    def get_actions_vec(self, actions_prims, actions_inputs, actions_mf):
        with self.sess.as_default():
            self.embedd_matrix = self.step_model.embedding.get_weights()
        invalid_action = np.zeros(self.embedd_matrix[0].shape[1]) - 1
        self.embedd_matrix = np.vstack([self.embedd_matrix[0], invalid_action])

        embedded_steps = self.embedd_matrix[actions_prims.astype(int)]
        actions_inputs = actions_inputs.reshape(len(actions_prims), -1)
        actions_mf = actions_mf.reshape(len(actions_prims), -1)

        concat_actions = np.concatenate(
            (embedded_steps, actions_inputs, actions_mf), axis=1)
        flatten_act = concat_actions.reshape(-1)

        return flatten_act

    def process_state_vec(self, obs, state_info):
        # transform actions representation with embeddings
        with self.sess.as_default():
            self.embedd_matrix = self.step_model.embedding.get_weights()
        ind1 = state_info['grid_prims_size']
        ind2 = ind1 + state_info['relations_size']
        ind3 = ind2 + state_info['ff_state_size']
        ind4 = ind3 + state_info['action_prims']
        ind5 = ind4 + state_info['action_inputs']
        ind6 = ind5 + state_info['action_mf']
        cells_num = state_info['cells_num']

        actions_prims = obs[ind3:ind4]
        actions_inputs = obs[ind4:ind5]
        actions_mf = obs[ind5:]
        flatten_act = self.get_actions_vec(actions_prims, actions_inputs,
                                           actions_mf)
        final_obs = np.concatenate((obs[:ind3], flatten_act))

        return final_obs

    def learn(self,
              total_timesteps,
              callback=None,
              seed=None,
              log_interval=100,
              tb_log_name="DQN",
              reset_num_timesteps=True,
              initial_p=1.0):
        new_tb_log = self._init_num_timesteps(reset_num_timesteps)

        cnt = 0
        ds_rewards = [[0, 0]]
        with SetVerbosity(self.verbose), TensorboardWriter(self.graph, self.tensorboard_log, tb_log_name, new_tb_log) \
                as writer:
            self._setup_learn()

            # Create the replay buffer
            if self.prioritized_replay:
                self.replay_buffer = PrioritizedReplayBuffer(
                    self.buffer_size, alpha=self.prioritized_replay_alpha)
                if self.prioritized_replay_beta_iters is None:
                    prioritized_replay_beta_iters = total_timesteps
                else:
                    prioritized_replay_beta_iters = self.prioritized_replay_beta_iters
                self.beta_schedule = LinearSchedule(
                    prioritized_replay_beta_iters,
                    initial_p=self.prioritized_replay_beta0,
                    final_p=1.0)
            else:
                self.replay_buffer = ReplayBuffer(self.buffer_size)
                self.beta_schedule = None
            # Create the schedule for exploration starting from 1.
            self.exploration = LinearSchedule(
                schedule_timesteps=int(self.exploration_fraction *
                                       total_timesteps),
                initial_p=initial_p,
                final_p=self.exploration_final_eps)

            episode_rewards = [0.0]
            obs = self.env.reset()

            reset = True
            self.episode_reward = np.zeros((1, ))

            for _ in range(total_timesteps):
                if callback is not None:
                    # Only stop training if return value is False, not when it is None. This is for backwards
                    # compatibility with callbacks that have no return statement.
                    if callback(locals(), globals()) is False:
                        break
                # Take action and update exploration to the newest value
                kwargs = {}
                if not self.param_noise:
                    update_eps = self.exploration.value(self.num_timesteps)
                    update_param_noise_threshold = 0.
                else:
                    update_eps = 0.
                    # Compute the threshold such that the KL divergence between perturbed and non-perturbed
                    # policy is comparable to eps-greedy exploration with eps = exploration.value(t).
                    # See Appendix C.1 in Parameter Space Noise for Exploration, Plappert et al., 2017
                    # for detailed explanation.
                    update_param_noise_threshold = \
                        -np.log(1. - self.exploration.value(self.num_timesteps) +
                                self.exploration.value(self.num_timesteps) / float(self.env.action_space.n))
                    kwargs['reset'] = reset
                    kwargs['update_param_noise_threshold'] = update_param_noise_threshold
                    kwargs['update_param_noise_scale'] = True
                ''' Hierarchical Step (Start) '''
                register = False
                while not register:
                    with self.sess.as_default():
                        action = self.predict(np.array(obs)[None])[0][0]
                    env_action = action
                    reset = False
                    new_obs, rew, done, info = self.env.step(env_action)
                    # self.env.render()

                    register = info.get('register')
                    if register:
                        if rew > 0:
                            ds_rewards.append([cnt, rew])
                            cnt += 1
                        with self.sess.as_default():
                            action = self.act(np.array(obs)[None],
                                              update_eps=update_eps,
                                              **kwargs)[0]
                        break
                    obs = new_obs

                self.replay_buffer.add(obs, action, rew, new_obs, float(done))
                obs = new_obs
                ''' Hierarchical Step (End) '''

                if writer is not None:
                    ep_rew = np.array([rew]).reshape((1, -1))
                    ep_done = np.array([done]).reshape((1, -1))
                    self.episode_reward = total_episode_reward_logger(
                        self.episode_reward, ep_rew, ep_done, writer,
                        self.num_timesteps)

                episode_rewards[-1] += rew
                if done:
                    if not isinstance(self.env, VecEnv):
                        obs = self.env.reset()
                    episode_rewards.append(0.0)
                    reset = True

                if self.num_timesteps > self.learning_starts and self.num_timesteps % self.train_freq == 0:
                    # Minimize the error in Bellman's equation on a batch sampled from replay buffer.
                    if self.prioritized_replay:
                        experience = self.replay_buffer.sample(
                            self.batch_size,
                            beta=self.beta_schedule.value(self.num_timesteps))
                        (obses_t, actions, rewards, obses_tp1, dones, weights,
                         batch_idxes) = experience
                    else:
                        obses_t, actions, rewards, obses_tp1, dones = self.replay_buffer.sample(
                            self.batch_size)
                        weights, batch_idxes = np.ones_like(rewards), None

                    if writer is not None:
                        # run loss backprop with summary, but once every 100 steps save the metadata
                        # (memory, compute time, ...)
                        if (1 + self.num_timesteps) % 100 == 0:
                            run_options = tf.RunOptions(
                                trace_level=tf.RunOptions.FULL_TRACE)
                            run_metadata = tf.RunMetadata()
                            summary, td_errors = self._train_step(
                                obses_t,
                                actions,
                                rewards,
                                obses_tp1,
                                obses_tp1,
                                dones,
                                weights,
                                sess=self.sess,
                                options=run_options,
                                run_metadata=run_metadata)
                            writer.add_run_metadata(
                                run_metadata, 'step%d' % self.num_timesteps)
                        else:
                            summary, td_errors = self._train_step(
                                obses_t,
                                actions,
                                rewards,
                                obses_tp1,
                                obses_tp1,
                                dones,
                                weights,
                                sess=self.sess)
                        writer.add_summary(summary, self.num_timesteps)
                    else:
                        _, td_errors = self._train_step(obses_t,
                                                        actions,
                                                        rewards,
                                                        obses_tp1,
                                                        obses_tp1,
                                                        dones,
                                                        weights,
                                                        sess=self.sess)

                    if self.prioritized_replay:
                        new_priorities = np.abs(
                            td_errors) + self.prioritized_replay_eps
                        self.replay_buffer.update_priorities(
                            batch_idxes, new_priorities)

                if self.num_timesteps > self.learning_starts and \
                        self.num_timesteps % self.target_network_update_freq == 0:
                    # Update target network periodically.
                    self.update_target(sess=self.sess)

                if len(episode_rewards[-101:-1]) == 0:
                    mean_100ep_reward = -np.inf
                else:
                    mean_100ep_reward = round(
                        float(np.mean(episode_rewards[-101:-1])), 1)

                num_episodes = len(episode_rewards)
                if self.verbose >= 1 and done and log_interval is not None and len(
                        episode_rewards) % log_interval == 0:
                    logger.record_tabular("steps", self.num_timesteps)
                    logger.record_tabular("episodes", num_episodes)
                    logger.record_tabular("mean 100 episode reward",
                                          mean_100ep_reward)
                    logger.record_tabular(
                        "% time spent exploring",
                        int(100 * self.exploration.value(self.num_timesteps)))
                    logger.dump_tabular()

                self.num_timesteps += 1
        return self, ds_rewards
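
A hedged usage sketch for DqnAtml (the policy and environment names are assumptions; the original AutoML project wiring is not shown in this snippet). Note that this learn() returns a tuple, unlike the base DQN:

# Hypothetical example; CustomDqnPolicy and automl_env are assumed to come
# from the surrounding project.
model = DqnAtml(CustomDqnPolicy, automl_env, verbose=1)
model, ds_rewards = model.learn(total_timesteps=20000)
# ds_rewards accumulates [counter, reward] pairs for positive rewards observed
# at registered hierarchical steps.
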
Code example #7
File: MyDQN.py  Project: Ofrogue/clicker-learning
class MyDQN(DQN):
    def __init__(self, *args, **kwargs):

        super(MyDQN, self).__init__(*args, **kwargs)

    def setup_model(self):

        with SetVerbosity(self.verbose):
            assert not isinstance(self.action_space, gym.spaces.Box), \
                "Error: DQN cannot output a gym.spaces.Box action space."

            # If the policy is wrapped in functools.partial (e.g. to disable dueling),
            # unwrap it to check the class type
            if isinstance(self.policy, partial):
                test_policy = self.policy.func
            else:
                test_policy = self.policy
            assert issubclass(test_policy, DQNPolicy), \
                "Error: the input policy for the DQN model must be a subclass of DQNPolicy."

            self.graph = tf.Graph()
            with self.graph.as_default():
                self.sess = tf_util.make_session(graph=self.graph)

                optimizer = tf.train.AdamOptimizer(
                    learning_rate=self.learning_rate)

                self.act, self._train_step, self.update_target, self.step_model = my_build_train(
                    q_func=partial(self.policy, **self.policy_kwargs),
                    ob_space=self.observation_space,
                    ac_space=self.action_space,
                    optimizer=optimizer,
                    gamma=self.gamma,
                    grad_norm_clipping=10,
                    param_noise=self.param_noise,
                    sess=self.sess,
                    full_tensorboard_log=self.full_tensorboard_log,
                    double_q=self.double_q)
                self.proba_step = self.step_model.proba_step
                self.params = tf_util.get_trainable_vars("deepq")

                # Initialize the parameters and copy them to the target network.
                tf_util.initialize(self.sess)
                self.update_target(sess=self.sess)

                self.summary = tf.summary.merge_all()

    def learn(self,
              total_timesteps,
              callback=None,
              seed=None,
              log_interval=100,
              tb_log_name="DQN",
              reset_num_timesteps=True,
              replay_wrapper=None):
        new_tb_log = self._init_num_timesteps(reset_num_timesteps)

        with SetVerbosity(self.verbose), TensorboardWriter(self.graph, self.tensorboard_log, tb_log_name, new_tb_log) \
                as writer:
            self._setup_learn(seed)

            # Create the replay buffer
            if self.prioritized_replay:
                self.replay_buffer = PrioritizedReplayBuffer(
                    self.buffer_size, alpha=self.prioritized_replay_alpha)
                if self.prioritized_replay_beta_iters is None:
                    prioritized_replay_beta_iters = total_timesteps
                else:
                    prioritized_replay_beta_iters = self.prioritized_replay_beta_iters
                self.beta_schedule = LinearSchedule(
                    prioritized_replay_beta_iters,
                    initial_p=self.prioritized_replay_beta0,
                    final_p=1.0)
            else:
                self.replay_buffer = ReplayBuffer(self.buffer_size)
                self.beta_schedule = None

            if replay_wrapper is not None:
                assert not self.prioritized_replay, "Prioritized replay buffer is not supported by HER"
                self.replay_buffer = replay_wrapper(self.replay_buffer)

            # Create the schedule for exploration starting from 1.
            self.exploration = LinearSchedule(
                schedule_timesteps=int(self.exploration_fraction *
                                       total_timesteps),
                initial_p=1.0,
                final_p=self.exploration_final_eps)

            episode_rewards = [0.0]
            episode_successes = []
            Globals.env = self.env
            obs = self.env.reset()
            reset = True
            self.episode_reward = np.zeros((1, ))
            timesteps_last_log = 0
            avr_ep_len_per_log = None
            sleep = 0.045

            for _ in range(total_timesteps):

                if Globals.loading:
                    Globals.loading = False

                while Globals.pause_game:
                    pass

                if Globals.exit_learning:
                    break

                if callback is not None:
                    # Only stop training if return value is False, not when it is None. This is for backwards
                    # compatibility with callbacks that have no return statement.
                    if callback(locals(), globals()) is False:
                        break
                # Take action and update exploration to the newest value
                kwargs = {}
                if not self.param_noise:
                    update_eps = self.exploration.value(self.num_timesteps)
                    update_param_noise_threshold = 0.
                else:
                    update_eps = 0.
                    # Compute the threshold such that the KL divergence between perturbed and non-perturbed
                    # policy is comparable to eps-greedy exploration with eps = exploration.value(t).
                    # See Appendix C.1 in Parameter Space Noise for Exploration, Plappert et al., 2017
                    # for detailed explanation.
                    update_param_noise_threshold = \
                        -np.log(1. - self.exploration.value(self.num_timesteps) +
                                self.exploration.value(self.num_timesteps) / float(self.env.action_space.n))
                    kwargs['reset'] = reset
                    kwargs['update_param_noise_threshold'] = update_param_noise_threshold
                    kwargs['update_param_noise_scale'] = True
                with self.sess.as_default():
                    action = self.act(np.array(obs)[None],
                                      update_eps=update_eps,
                                      **kwargs)[0]
                env_action = action
                reset = False
                new_obs, rew, done, info = self.env.step(env_action)
                # Store transition in the replay buffer.
                self.replay_buffer.add(obs, action, rew, new_obs, float(done))
                obs = new_obs

                if writer is not None:
                    ep_rew = np.array([rew]).reshape((1, -1))
                    ep_done = np.array([done]).reshape((1, -1))
                    self.episode_reward = total_episode_reward_logger(
                        self.episode_reward, ep_rew, ep_done, writer,
                        self.num_timesteps)

                episode_rewards[-1] += rew
                if done:
                    maybe_is_success = info.get('is_success')
                    if maybe_is_success is not None:
                        episode_successes.append(float(maybe_is_success))
                    if not isinstance(self.env, VecEnv):
                        obs = self.env.reset()
                    episode_rewards.append(0.0)
                    reset = True

                # Do not train if the warmup phase is not over
                # or if there are not enough samples in the replay buffer
                can_sample = self.replay_buffer.can_sample(self.batch_size)

                if can_sample:
                    sleep = 0.035

                time.sleep(sleep)

                if can_sample and self.num_timesteps > self.learning_starts \
                        and self.num_timesteps % self.train_freq == 0:
                    # Minimize the error in Bellman's equation on a batch sampled from replay buffer.
                    if self.prioritized_replay:
                        experience = self.replay_buffer.sample(
                            self.batch_size,
                            beta=self.beta_schedule.value(self.num_timesteps))
                        (obses_t, actions, rewards, obses_tp1, dones, weights,
                         batch_idxes) = experience
                    else:
                        obses_t, actions, rewards, obses_tp1, dones = self.replay_buffer.sample(
                            self.batch_size)
                        weights, batch_idxes = np.ones_like(rewards), None

                    if writer is not None:
                        # run loss backprop with summary, but once every 100 steps save the metadata
                        # (memory, compute time, ...)
                        if (1 + self.num_timesteps) % 100 == 0:
                            run_options = tf.RunOptions(
                                trace_level=tf.RunOptions.FULL_TRACE)
                            run_metadata = tf.RunMetadata()
                            summary, td_errors = self._train_step(
                                obses_t,
                                actions,
                                rewards,
                                obses_tp1,
                                obses_tp1,
                                dones,
                                weights,
                                sess=self.sess,
                                options=run_options,
                                run_metadata=run_metadata)
                            writer.add_run_metadata(
                                run_metadata, 'step%d' % self.num_timesteps)
                        else:
                            summary, td_errors = self._train_step(
                                obses_t,
                                actions,
                                rewards,
                                obses_tp1,
                                obses_tp1,
                                dones,
                                weights,
                                sess=self.sess)
                        writer.add_summary(summary, self.num_timesteps)
                    else:
                        _, td_errors = self._train_step(obses_t,
                                                        actions,
                                                        rewards,
                                                        obses_tp1,
                                                        obses_tp1,
                                                        dones,
                                                        weights,
                                                        sess=self.sess)

                    if self.prioritized_replay:
                        new_priorities = np.abs(
                            td_errors) + self.prioritized_replay_eps
                        self.replay_buffer.update_priorities(
                            batch_idxes, new_priorities)

                if can_sample and self.num_timesteps > self.learning_starts and \
                        self.num_timesteps % self.target_network_update_freq == 0:
                    # Update target network periodically.
                    self.update_target(sess=self.sess)

                if len(episode_rewards[-101:-1]) == 0:
                    mean_100ep_reward = -np.inf
                else:
                    mean_100ep_reward = round(
                        float(np.mean(episode_rewards[-101:-1])), 1)

                if len(episode_rewards) % log_interval == 0:
                    avr_ep_len_per_log = (self.num_timesteps -
                                          timesteps_last_log) / log_interval
                    timesteps_last_log = self.num_timesteps

                num_episodes = len(episode_rewards)
                if self.verbose >= 1 and done and log_interval is not None and len(
                        episode_rewards) % log_interval == 0:
                    logger.record_tabular("steps", self.num_timesteps)
                    logger.record_tabular("episodes", num_episodes)
                    if len(episode_successes) > 0:
                        logger.logkv("success rate",
                                     np.mean(episode_successes[-100:]))
                    logger.record_tabular("mean 100 episode reward",
                                          mean_100ep_reward)
                    logger.record_tabular(
                        "% time spent exploring",
                        int(100 * self.exploration.value(self.num_timesteps)))
                    logger.record_tabular("avr length of last logged ep",
                                          avr_ep_len_per_log)
                    logger.dump_tabular()

                self.num_timesteps += 1
                Globals.steps -= 1

        return self

    def evaluate(self, n_episodes=2):

        logging.basicConfig(level=logging.INFO)

        env_id = 'BreakoutNoFrameskip-v4'
        num_env = 1
        n_stack = 4
        left_lives = 5
        seed = 0
        episodes = 0
        score = 0
        frames = 0
        frames_per_episode = list()
        scores = [list() for i in range(n_episodes)]

        env = make_atari_env(env_id, num_env=num_env, seed=seed)
        env = VecFrameStack(env, n_stack=n_stack)
        obs = env.reset()

        while (n_episodes - episodes) > 0:
            frames += 1
            action, _states = self.predict(obs)
            obs, rewards, dones, info = env.step(action)
            env.render()
            score += rewards[0]
            if dones:
                logging.debug('You died')
                logging.debug(f'Score = {score}')
                scores[episodes].append(score)
                score = 0
                left_lives -= 1
            if not left_lives:
                logging.debug('Episode ended')
                logging.info(f'Scores per life: {scores[episodes]}')
                frames_per_episode.append(frames)
                frames = 0
                episodes += 1
                left_lives = 5

        s = list(map(sum, scores))
        avg_s = int(sum(s) / len(s))
        avg_f = int(sum(frames_per_episode) / len(frames_per_episode))

        logging.info(f'Played {n_episodes} episodes')
        logging.info(f'Scores per episode : {s}')
        logging.info(f'Average score per episode : {avg_s}')
        logging.info(f'Average number of frames per episode : {avg_f}')

        return avg_f, avg_s
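
# Usage sketch (illustrative only): evaluate() plays full Breakout episodes (5 lives each)
# with the current policy, renders them, and returns the average number of frames and the
# average score per episode.
#
#     avg_frames, avg_score = model.evaluate(n_episodes=2)   # `model`: a trained instance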
Code example #8
class DeepQ(BaseRLModel):
    """
    The DQN model class. DQN paper: https://arxiv.org/pdf/1312.5602.pdf

    :param policy: (function (TensorFlow Tensor, int, str, bool): TensorFlow Tensor)
                    the policy that takes the following inputs:
                    - observation_in: (object) the output of observation placeholder
                    - num_actions: (int) number of actions
                    - scope: (str)
                    - reuse: (bool) should be passed to outer variable scope
                    and returns a tensor of shape (batch_size, num_actions) with values of every action.
    :param env: (Gym environment or str) The environment to learn from (if registered in Gym, can be str)
    :param gamma: (float) discount factor
    :param learning_rate: (float) learning rate for adam optimizer
    :param buffer_size: (int) size of the replay buffer
    :param exploration_fraction: (float) fraction of entire training period over which the exploration rate is
            annealed
    :param exploration_final_eps: (float) final value of random action probability
    :param train_freq: (int) update the model every `train_freq` steps.
    :param batch_size: (int) size of a batch sampled from the replay buffer for training
    :param checkpoint_freq: (int) how often to save the model. This is so that the best version is restored at the
            end of the training. If you do not wish to restore the best version
            at the end of the training set this variable to None.
    :param checkpoint_path: (str) replacement path used if you need to log to somewhere else than a temporary
            directory.
    :param learning_starts: (int) how many steps of the model to collect transitions for before learning starts
    :param target_network_update_freq: (int) update the target network every `target_network_update_freq` steps.
    :param prioritized_replay: (bool) if True prioritized replay buffer will be used.
    :param prioritized_replay_alpha: (float) alpha parameter for prioritized replay buffer
    :param prioritized_replay_beta0: (float) initial value of beta for prioritized replay buffer
    :param prioritized_replay_beta_iters: (int) number of iterations over which beta will be annealed from initial
            value to 1.0. If set to None equals to max_timesteps.
    :param prioritized_replay_eps: (float) epsilon to add to the TD errors when updating priorities.
    :param param_noise: (bool) Whether or not to apply noise to the parameters of the policy.
    :param verbose: (int) the verbosity level: 0 none, 1 training information, 2 tensorflow debug
    :param _init_setup_model: (bool) Whether or not to build the network at the creation of the instance
    """
    def __init__(self,
                 policy,
                 env,
                 gamma=0.99,
                 learning_rate=5e-4,
                 buffer_size=50000,
                 exploration_fraction=0.1,
                 exploration_final_eps=0.02,
                 train_freq=1,
                 batch_size=32,
                 checkpoint_freq=10000,
                 checkpoint_path=None,
                 learning_starts=1000,
                 target_network_update_freq=500,
                 prioritized_replay=False,
                 prioritized_replay_alpha=0.6,
                 prioritized_replay_beta0=0.4,
                 prioritized_replay_beta_iters=None,
                 prioritized_replay_eps=1e-6,
                 param_noise=False,
                 verbose=0,
                 _init_setup_model=True):
        super(DeepQ, self).__init__(policy=policy,
                                    env=env,
                                    requires_vec_env=False,
                                    verbose=verbose)

        assert not isinstance(policy, ActorCriticPolicy), \
            "Error: DeepQ does not support the actor critic policies, please use the " \
            "'stable_baselines.deepq.models.mlp' and 'stable_baselines.deepq.models.cnn_to_mlp' " \
            "functions to create your policies."

        self.checkpoint_path = checkpoint_path
        self.param_noise = param_noise
        self.learning_starts = learning_starts
        self.train_freq = train_freq
        self.prioritized_replay = prioritized_replay
        self.prioritized_replay_eps = prioritized_replay_eps
        self.batch_size = batch_size
        self.target_network_update_freq = target_network_update_freq
        self.checkpoint_freq = checkpoint_freq
        self.prioritized_replay_alpha = prioritized_replay_alpha
        self.prioritized_replay_beta0 = prioritized_replay_beta0
        self.prioritized_replay_beta_iters = prioritized_replay_beta_iters
        self.exploration_final_eps = exploration_final_eps
        self.exploration_fraction = exploration_fraction
        self.buffer_size = buffer_size
        self.learning_rate = learning_rate
        self.gamma = gamma

        self.graph = None
        self.sess = None
        self._train_step = None
        self.update_target = None
        self.act = None
        self.replay_buffer = None
        self.beta_schedule = None
        self.exploration = None
        self.params = None

        if _init_setup_model:
            self.setup_model()

    def setup_model(self):
        with SetVerbosity(self.verbose):

            assert isinstance(self.action_space, gym.spaces.Discrete), \
                "Error: DeepQ cannot output a {} action space, only spaces.Discrete is supported."\
                .format(self.action_space)

            self.graph = tf.Graph()
            with self.graph.as_default():
                self.sess = tf_util.make_session(graph=self.graph)

                # capture the shape outside the closure so that the env object is not serialized
                # by cloudpickle when serializing make_obs_ph
                observation_space = self.observation_space

                def make_obs_ph(name):
                    """
                    makes the observation placeholder

                    :param name: (str) the placeholder name
                    :return: (TensorFlow Tensor) the placeholder
                    """
                    return ObservationInput(observation_space, name=name)

                self.act, self._train_step, self.update_target, _ = deepq.build_train(
                    make_obs_ph=make_obs_ph,
                    q_func=self.policy,
                    num_actions=self.action_space.n,
                    optimizer=tf.train.AdamOptimizer(
                        learning_rate=self.learning_rate),
                    gamma=self.gamma,
                    grad_norm_clipping=10,
                    param_noise=self.param_noise)

                self.params = find_trainable_variables("deepq")

                # Initialize the parameters and copy them to the target network.
                tf_util.initialize(self.sess)
                self.update_target(sess=self.sess)

    def learn(self,
              total_timesteps,
              callback=None,
              seed=None,
              log_interval=100):
        with SetVerbosity(self.verbose):
            self._setup_learn(seed)

            # Create the replay buffer
            if self.prioritized_replay:
                self.replay_buffer = PrioritizedReplayBuffer(
                    self.buffer_size, alpha=self.prioritized_replay_alpha)
                if self.prioritized_replay_beta_iters is None:
                    prioritized_replay_beta_iters = total_timesteps
                else:
                    prioritized_replay_beta_iters = self.prioritized_replay_beta_iters
                self.beta_schedule = LinearSchedule(
                    prioritized_replay_beta_iters,
                    initial_p=self.prioritized_replay_beta0,
                    final_p=1.0)
            else:
                self.replay_buffer = ReplayBuffer(self.buffer_size)
                self.beta_schedule = None
            # Create the schedule for exploration starting from 1.
            self.exploration = LinearSchedule(
                schedule_timesteps=int(self.exploration_fraction *
                                       total_timesteps),
                initial_p=1.0,
                final_p=self.exploration_final_eps)
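            # Example: with total_timesteps=100000 and exploration_fraction=0.1, epsilon is
            # annealed linearly from 1.0 down to exploration_final_eps over the first 10000
            # steps and then held constant.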

            episode_rewards = [0.0]
            obs = self.env.reset()
            reset = True

            for step in range(total_timesteps):
                if callback is not None:
                    callback(locals(), globals())
                # Take action and update exploration to the newest value
                kwargs = {}
                if not self.param_noise:
                    update_eps = self.exploration.value(step)
                    update_param_noise_threshold = 0.
                else:
                    update_eps = 0.
                    # Compute the threshold such that the KL divergence between perturbed and non-perturbed
                    # policy is comparable to eps-greedy exploration with eps = exploration.value(t).
                    # See Appendix C.1 in Parameter Space Noise for Exploration, Plappert et al., 2017
                    # for detailed explanation.
                    update_param_noise_threshold = \
                        -np.log(1. - self.exploration.value(step) +
                                self.exploration.value(step) / float(self.env.action_space.n))
                    kwargs['reset'] = reset
                    kwargs[
                        'update_param_noise_threshold'] = update_param_noise_threshold
                    kwargs['update_param_noise_scale'] = True
                with self.sess.as_default():
                    action = self.act(np.array(obs)[None],
                                      update_eps=update_eps,
                                      **kwargs)[0]
                env_action = action
                reset = False
                new_obs, rew, done, _ = self.env.step(env_action)
                # Store transition in the replay buffer.
                self.replay_buffer.add(obs, action, rew, new_obs, float(done))
                obs = new_obs

                episode_rewards[-1] += rew
                if done:
                    if not isinstance(self.env, VecEnv):
                        obs = self.env.reset()
                    episode_rewards.append(0.0)
                    reset = True

                if step > self.learning_starts and step % self.train_freq == 0:
                    # Minimize the error in Bellman's equation on a batch sampled from replay buffer.
                    if self.prioritized_replay:
                        experience = self.replay_buffer.sample(
                            self.batch_size,
                            beta=self.beta_schedule.value(step))
                        (obses_t, actions, rewards, obses_tp1, dones, weights,
                         batch_idxes) = experience
                    else:
                        obses_t, actions, rewards, obses_tp1, dones = self.replay_buffer.sample(
                            self.batch_size)
                        weights, batch_idxes = np.ones_like(rewards), None
                    td_errors = self._train_step(obses_t,
                                                 actions,
                                                 rewards,
                                                 obses_tp1,
                                                 dones,
                                                 weights,
                                                 sess=self.sess)
                    if self.prioritized_replay:
                        new_priorities = np.abs(
                            td_errors) + self.prioritized_replay_eps
                        self.replay_buffer.update_priorities(
                            batch_idxes, new_priorities)

                if step > self.learning_starts and step % self.target_network_update_freq == 0:
                    # Update target network periodically.
                    self.update_target(sess=self.sess)

                if len(episode_rewards[-101:-1]) == 0:
                    mean_100ep_reward = -np.inf
                else:
                    mean_100ep_reward = round(
                        float(np.mean(episode_rewards[-101:-1])), 1)

                num_episodes = len(episode_rewards)
                if self.verbose >= 1 and done and log_interval is not None and len(
                        episode_rewards) % log_interval == 0:
                    logger.record_tabular("steps", step)
                    logger.record_tabular("episodes", num_episodes)
                    logger.record_tabular("mean 100 episode reward",
                                          mean_100ep_reward)
                    logger.record_tabular(
                        "% time spent exploring",
                        int(100 * self.exploration.value(step)))
                    logger.dump_tabular()

        return self

    def predict(self, observation, state=None, mask=None):
        observation = np.array(observation).reshape(
            self.observation_space.shape)

        with self.sess.as_default():
            action = self.act(observation[None])[0]

        if self._vectorize_action:
            return [action], [None]
        else:
            return action, None

    def action_probability(self, observation, state=None, mask=None):
        observation = np.array(observation).reshape(
            (-1, ) + self.observation_space.shape)

        # Get the tensor just before the softmax function in the TensorFlow graph,
        # then execute the graph from the input observation to this tensor.
        tensor = self.graph.get_tensor_by_name(
            'deepq/q_func/fully_connected_2/BiasAdd:0')
        if self._vectorize_action:
            return self._softmax(
                self.sess.run(tensor,
                              feed_dict={'deepq/observation:0': observation}))
        else:
            return self._softmax(
                self.sess.run(tensor,
                              feed_dict={'deepq/observation:0':
                                         observation}))[0]

    def save(self, save_path):
        # params
        data = {
            "checkpoint_path": self.checkpoint_path,
            "param_noise": self.param_noise,
            "learning_starts": self.learning_starts,
            "train_freq": self.train_freq,
            "prioritized_replay": self.prioritized_replay,
            "prioritized_replay_eps": self.prioritized_replay_eps,
            "batch_size": self.batch_size,
            "target_network_update_freq": self.target_network_update_freq,
            "checkpoint_freq": self.checkpoint_freq,
            "prioritized_replay_alpha": self.prioritized_replay_alpha,
            "prioritized_replay_beta0": self.prioritized_replay_beta0,
            "prioritized_replay_beta_iters":
            self.prioritized_replay_beta_iters,
            "exploration_final_eps": self.exploration_final_eps,
            "exploration_fraction": self.exploration_fraction,
            "learning_rate": self.learning_rate,
            "gamma": self.gamma,
            "verbose": self.verbose,
            "observation_space": self.observation_space,
            "action_space": self.action_space,
            "policy": self.policy,
            "n_envs": self.n_envs,
            "_vectorize_action": self._vectorize_action
        }

        params = self.sess.run(self.params)

        self._save_to_file(save_path, data=data, params=params)

    @classmethod
    def load(cls, load_path, env=None, **kwargs):
        data, params = cls._load_from_file(load_path)

        model = cls(policy=data["policy"], env=env, _init_setup_model=False)
        model.__dict__.update(data)
        model.__dict__.update(kwargs)
        model.set_env(env)
        model.setup_model()

        restores = []
        for param, loaded_p in zip(model.params, params):
            restores.append(param.assign(loaded_p))
        model.sess.run(restores)

        return model
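
# A minimal end-to-end sketch for the DeepQ class above (illustrative; assumes a
# Gym-registered discrete-action environment and the `mlp` policy constructor referenced
# in the assertion inside __init__):
import gym
from stable_baselines.deepq.models import mlp

env = gym.make("CartPole-v0")
model = DeepQ(policy=mlp([64]), env=env, verbose=1)
model.learn(total_timesteps=10000)
model.save("deepq_cartpole")
loaded = DeepQ.load("deepq_cartpole", env=env)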
Code example #9
class DQN(OffPolicyRLModel):
    """
    The DQN model class.
    DQN paper: https://arxiv.org/abs/1312.5602
    Dueling DQN: https://arxiv.org/abs/1511.06581
    Double-Q Learning: https://arxiv.org/abs/1509.06461
    Prioritized Experience Replay: https://arxiv.org/abs/1511.05952

    :param policy: (DQNPolicy or str) The policy model to use (MlpPolicy, CnnPolicy, LnMlpPolicy, ...)
    :param env: (Gym environment or str) The environment to learn from (if registered in Gym, can be str)
    :param gamma: (float) discount factor
    :param learning_rate: (float) learning rate for adam optimizer
    :param buffer_size: (int) size of the replay buffer
    :param exploration_fraction: (float) fraction of entire training period over which the exploration rate is
            annealed
    :param exploration_final_eps: (float) final value of random action probability
    :param exploration_initial_eps: (float) initial value of random action probability
    :param train_freq: (int) update the model every `train_freq` steps.
    :param batch_size: (int) size of a batch sampled from the replay buffer for training
    :param double_q: (bool) Whether to enable Double-Q learning or not.
    :param learning_starts: (int) how many steps of the model to collect transitions for before learning starts
    :param target_network_update_freq: (int) update the target network every `target_network_update_freq` steps.
    :param prioritized_replay: (bool) if True prioritized replay buffer will be used.
    :param prioritized_replay_alpha: (float) alpha parameter for prioritized replay buffer.
        It determines how much prioritization is used, with alpha=0 corresponding to the uniform case.
    :param prioritized_replay_beta0: (float) initial value of beta for prioritized replay buffer
    :param prioritized_replay_beta_iters: (int) number of iterations over which beta will be annealed from initial
            value to 1.0. If set to None equals to max_timesteps.
    :param prioritized_replay_eps: (float) epsilon to add to the TD errors when updating priorities.
    :param param_noise: (bool) Whether or not to apply noise to the parameters of the policy.
    :param verbose: (int) the verbosity level: 0 none, 1 training information, 2 tensorflow debug
    :param tensorboard_log: (str) the log location for tensorboard (if None, no logging)
    :param _init_setup_model: (bool) Whether or not to build the network at the creation of the instance
    :param full_tensorboard_log: (bool) enable additional logging when using tensorboard
        WARNING: this logging can take a lot of space quickly
    :param seed: (int) Seed for the pseudo-random generators (python, numpy, tensorflow).
        If None (default), use random seed. Note that if you want completely deterministic
        results, you must set `n_cpu_tf_sess` to 1.
    :param n_cpu_tf_sess: (int) The number of threads for TensorFlow operations
        If None, the number of cpu of the current machine will be used.
    """
    def _create_replay_wrapper(self, env):
        """
        Wrap the environment in a HERGoalEnvWrapper
        if needed and create the replay buffer wrapper.
        """
        if not isinstance(env, HERGoalEnvWrapper):
            env = HERGoalEnvWrapper(env)

        self.env = env
        self.n_sampled_goal = 4
        self.goal_selection_strategy = 'future'
        # NOTE: we cannot do that check directly with VecEnv
        # maybe we can try calling `compute_reward()` ?
        # assert isinstance(self.env, gym.GoalEnv), "HER only supports gym.GoalEnv"

        self.replay_wrapper = functools.partial(
            HindsightExperienceReplayWrapper,
            n_sampled_goal=self.n_sampled_goal,
            goal_selection_strategy=self.goal_selection_strategy,
            wrapped_env=self.env)

    def __init__(self,
                 policy,
                 env,
                 gamma=0.99,
                 learning_rate=5e-4,
                 buffer_size=50000,
                 exploration_fraction=0.1,
                 exploration_final_eps=0.02,
                 exploration_initial_eps=1.0,
                 train_freq=1,
                 batch_size=32,
                 double_q=True,
                 learning_starts=1000,
                 target_network_update_freq=500,
                 prioritized_replay=False,
                 prioritized_replay_alpha=0.6,
                 prioritized_replay_beta0=0.4,
                 prioritized_replay_beta_iters=None,
                 prioritized_replay_eps=1e-6,
                 param_noise=False,
                 n_cpu_tf_sess=None,
                 verbose=0,
                 tensorboard_log=None,
                 _init_setup_model=True,
                 policy_kwargs=None,
                 full_tensorboard_log=False,
                 seed=None,
                 n_actions=2):

        # TODO: replay_buffer refactoring
        super(DQN, self).__init__(policy=policy,
                                  env=env,
                                  replay_buffer=None,
                                  verbose=verbose,
                                  policy_base=DQNPolicy,
                                  requires_vec_env=False,
                                  policy_kwargs=policy_kwargs,
                                  seed=seed,
                                  n_cpu_tf_sess=n_cpu_tf_sess)

        # for HER algorithm
        if self.env is not None and 'Fetch' in self.env.__str__():
            self._create_replay_wrapper(self.env)
            self.observation_space = self.env.observation_space

        self.param_noise = param_noise
        self.learning_starts = learning_starts
        self.train_freq = train_freq
        self.prioritized_replay = prioritized_replay
        self.prioritized_replay_eps = prioritized_replay_eps
        self.batch_size = batch_size
        self.target_network_update_freq = target_network_update_freq
        self.prioritized_replay_alpha = prioritized_replay_alpha
        self.prioritized_replay_beta0 = prioritized_replay_beta0
        self.prioritized_replay_beta_iters = prioritized_replay_beta_iters
        self.exploration_final_eps = exploration_final_eps
        self.exploration_initial_eps = exploration_initial_eps
        self.exploration_fraction = exploration_fraction
        self.buffer_size = buffer_size
        self.learning_rate = learning_rate
        self.gamma = gamma
        self.tensorboard_log = tensorboard_log
        self.full_tensorboard_log = full_tensorboard_log
        self.double_q = double_q

        self.graph = None
        self.sess = None
        self._train_step = None
        self.step_model = None
        self.update_target = None
        self.act = None
        self.proba_step = None
        self.replay_buffer = None
        self.beta_schedule = None
        self.exploration = None
        self.params = None
        self.summary = None

        self.n_actions = n_actions
        self.macro_len = 5
        self.macro_count = 0  # when macro_count % macro_len == 0, resample macro action

        if _init_setup_model:
            self.setup_model()

    def _get_pretrain_placeholders(self):
        policy = self.step_model
        return policy.obs_ph, tf.placeholder(tf.int32, [None]), policy.q_values

    def setup_model(self):

        with SetVerbosity(self.verbose):
            # decision net: produce categorical distribution
            self.action_space = gym.spaces.discrete.Discrete(self.n_actions)

            assert not isinstance(self.action_space, gym.spaces.Box), \
                "Error: DQN cannot output a gym.spaces.Box action space."

            # If the policy is wrapped in functools.partial (e.g. to disable dueling),
            # unwrap it to check the class type
            if isinstance(self.policy, partial):
                test_policy = self.policy.func
            else:
                test_policy = self.policy
            assert issubclass(test_policy, DQNPolicy), "Error: the input policy for the DQN model must be " \
                                                       "an instance of DQNPolicy."

            self.graph = tf.Graph()
            with self.graph.as_default():
                self.set_random_seed(self.seed)
                self.sess = tf_util.make_session(num_cpu=self.n_cpu_tf_sess,
                                                 graph=self.graph)

                optimizer = tf.train.AdamOptimizer(
                    learning_rate=self.learning_rate)

                self.act, self._train_step, self.update_target, self.step_model = build_train(
                    q_func=partial(self.policy, **self.policy_kwargs),
                    ob_space=self.observation_space,
                    ac_space=self.action_space,
                    optimizer=optimizer,
                    gamma=self.gamma,
                    grad_norm_clipping=10,
                    param_noise=self.param_noise,
                    sess=self.sess,
                    full_tensorboard_log=self.full_tensorboard_log,
                    double_q=self.double_q)
                self.proba_step = self.step_model.proba_step
                self.params = tf_util.get_trainable_vars("deepq")

                # Initialize the parameters and copy them to the target network.
                tf_util.initialize(self.sess)
                self.update_target(sess=self.sess)

                self.summary = tf.summary.merge_all()

    def learn(self,
              total_timesteps,
              callback=None,
              log_interval=100,
              tb_log_name="DQN",
              reset_num_timesteps=True,
              replay_wrapper=None,
              distinct_replay_buffer=False):

        new_tb_log = self._init_num_timesteps(reset_num_timesteps)
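        # NOTE: this hierarchical variant assumes `self.sub_models` (one off-policy model per
        # macro action), `self.replay_wrappers` and `self.args` are attached to the instance
        # externally; they are not created in __init__ above.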
        for i, m in enumerate(self.sub_models):
            m.learning_rate = get_schedule_fn(m.learning_rate)
            if len(self.replay_wrappers) != 0:
                m.replay_buffer = self.replay_wrappers[i](m.replay_buffer)
            m._setup_learn()

        with SetVerbosity(self.verbose), TensorboardWriter(self.graph, self.tensorboard_log, tb_log_name, new_tb_log) \
                as writer:
            self._setup_learn()

            # Create the replay buffer
            if self.prioritized_replay:
                self.replay_buffer = PrioritizedReplayBuffer(
                    self.buffer_size, alpha=self.prioritized_replay_alpha)
                if self.prioritized_replay_beta_iters is None:
                    prioritized_replay_beta_iters = total_timesteps
                else:
                    prioritized_replay_beta_iters = self.prioritized_replay_beta_iters
                self.beta_schedule = LinearSchedule(
                    prioritized_replay_beta_iters,
                    initial_p=self.prioritized_replay_beta0,
                    final_p=1.0)
            else:
                self.replay_buffer = ReplayBuffer(self.buffer_size)
                self.beta_schedule = None

            if replay_wrapper is not None:
                assert not self.prioritized_replay, "Prioritized replay buffer is not supported by HER"
                self.replay_buffer = replay_wrapper(self.replay_buffer)

            # Create the schedule for exploration starting from 1.
            self.exploration = LinearSchedule(
                schedule_timesteps=int(self.exploration_fraction *
                                       total_timesteps),
                initial_p=self.exploration_initial_eps,
                final_p=self.exploration_final_eps)

            episode_rewards = [0.0]
            episode_successes = []
            obs = self.env.reset()
            reset = True
            macro_count = 0
            macro_len = self.macro_len
            macro_choices = []
            n_updates = 0
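            # Hierarchical control loop: every `macro_len` environment steps the DQN
            # ("decision net") picks a macro action that selects one of the sub-policies in
            # self.sub_models; the selected sub-model then produces the low-level action.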

            for step in range(total_timesteps):
                if callback is not None:
                    # Only stop training if return value is False, not when it is None. This is for backwards
                    # compatibility with callbacks that have no return statement.
                    if callback(locals(), globals()) is False:
                        break

                # Take action and update exploration to the newest value
                kwargs = {}
                if not self.param_noise:
                    update_eps = self.exploration.value(self.num_timesteps)
                    update_param_noise_threshold = 0.
                else:
                    update_eps = 0.
                    # Compute the threshold such that the KL divergence between perturbed and non-perturbed
                    # policy is comparable to eps-greedy exploration with eps = exploration.value(t).
                    # See Appendix C.1 in Parameter Space Noise for Exploration, Plappert et al., 2017
                    # for detailed explanation.
                    update_param_noise_threshold = \
                        -np.log(1. - self.exploration.value(self.num_timesteps) +
                                self.exploration.value(self.num_timesteps) / float(self.env.action_space.n))
                    kwargs['reset'] = reset
                    kwargs[
                        'update_param_noise_threshold'] = update_param_noise_threshold
                    kwargs['update_param_noise_scale'] = True
                with self.sess.as_default():
                    if reset or macro_count % macro_len == 0:
                        macro_action = self.act(np.array(obs)[None],
                                                update_eps=update_eps,
                                                **kwargs)[0]
                        # macro_action = 1
                        macro_obs = obs
                        reward_in_one_macro = 0
                    macro_count += 1
                    macro_choices.append(macro_action)

                # use sub_model to decide action
                # env_action = self.sub_models[macro_action]
                current_sub = self.sub_models[macro_action]
                if self.num_timesteps < self.learning_starts or np.random.rand(
                ) < current_sub.random_exploration:
                    # actions sampled from the action space lie in the environment's own range,
                    # but the algorithm operates on tanh-squashed actions, so simple scaling is used
                    unscaled_action = self.env.action_space.sample()
                    action = scale_action(self.env.action_space,
                                          unscaled_action)
                else:
                    action = current_sub.policy_tf.step(
                        obs[None], deterministic=False).flatten()
                    # Add noise to the action (improve exploration,
                    # not needed in general)
                    if current_sub.action_noise is not None:
                        action = np.clip(action + current_sub.action_noise(),
                                         -1, 1)
                    # inferred actions need to be transformed to environment action_space before stepping
                    unscaled_action = unscale_action(self.env.action_space,
                                                     action)
                assert action.shape == self.env.action_space.shape

                reset = False
                new_obs, rew, done, info = self.env.step(unscaled_action)
                episode_rewards[-1] += rew
                # rew -= self.args.policy_cost_coef * self.args.sub_policy_costs[macro_action]
                reward_in_one_macro += rew - self.args.policy_cost_coef * self.args.sub_policy_costs[
                    macro_action]
                # Store transition in the replay buffer.
                if macro_count % macro_len == 0 or done:
                    self.replay_buffer.add(macro_obs, macro_action,
                                           reward_in_one_macro, new_obs,
                                           float(done))
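                # Low-level transitions feed the sub-models' own replay buffers (only the
                # chosen sub-model's buffer when distinct_replay_buffer is set).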
                for i, m in enumerate(self.sub_models):
                    if distinct_replay_buffer:
                        if i == macro_action:
                            m.replay_buffer.add(obs, action, rew, new_obs,
                                                float(done))
                    else:
                        m.replay_buffer.add(obs, action, rew, new_obs,
                                            float(done))
                obs = new_obs

                if writer is not None:
                    ep_rew = np.array([rew]).reshape((1, -1))
                    ep_done = np.array([done]).reshape((1, -1))
                    total_episode_reward_logger(self.episode_reward, ep_rew,
                                                ep_done, writer,
                                                self.num_timesteps)

                # print("step: %d, done: %d" % (self.num_timesteps, done))
                if done:
                    maybe_is_success = info.get('is_success')
                    if maybe_is_success is not None:
                        episode_successes.append(float(maybe_is_success))
                    if not isinstance(self.env, VecEnv):
                        obs = self.env.reset()
                    episode_rewards.append(0.0)
                    reset = True
                    macro_action = None
                    macro_count = 0
                    prev_macro_choices = macro_choices
                    macro_choices = []

                # Do not train if the warmup phase is not over
                # or if there are not enough samples in the replay buffer
                can_sample = self.replay_buffer.can_sample(self.batch_size)
                if can_sample and self.num_timesteps > self.learning_starts \
                        and self.num_timesteps % self.train_freq == 0:
                    # Minimize the error in Bellman's equation on a batch sampled from replay buffer.
                    # pytype:disable=bad-unpacking
                    if self.prioritized_replay:
                        assert self.beta_schedule is not None, \
                               "BUG: should be LinearSchedule when self.prioritized_replay True"
                        experience = self.replay_buffer.sample(
                            self.batch_size,
                            beta=self.beta_schedule.value(self.num_timesteps))
                        (obses_t, actions, rewards, obses_tp1, dones, weights,
                         batch_idxes) = experience
                    else:
                        obses_t, actions, rewards, obses_tp1, dones = self.replay_buffer.sample(
                            self.batch_size)
                        weights, batch_idxes = np.ones_like(rewards), None
                    # pytype:enable=bad-unpacking

                    if writer is not None:
                        # run loss backprop with summary, but once every 100 steps save the metadata
                        # (memory, compute time, ...)
                        if (1 + self.num_timesteps) % 100 == 0:
                            run_options = tf.RunOptions(
                                trace_level=tf.RunOptions.FULL_TRACE)
                            run_metadata = tf.RunMetadata()
                            summary, td_errors = self._train_step(
                                obses_t,
                                actions,
                                rewards,
                                obses_tp1,
                                obses_tp1,
                                dones,
                                weights,
                                sess=self.sess,
                                options=run_options,
                                run_metadata=run_metadata)
                            writer.add_run_metadata(
                                run_metadata, 'step%d' % self.num_timesteps)
                        else:
                            summary, td_errors = self._train_step(
                                obses_t,
                                actions,
                                rewards,
                                obses_tp1,
                                obses_tp1,
                                dones,
                                weights,
                                sess=self.sess)
                        writer.add_summary(summary, self.num_timesteps)
                    else:
                        _, td_errors = self._train_step(obses_t,
                                                        actions,
                                                        rewards,
                                                        obses_tp1,
                                                        obses_tp1,
                                                        dones,
                                                        weights,
                                                        sess=self.sess)

                    if self.prioritized_replay:
                        new_priorities = np.abs(
                            td_errors) + self.prioritized_replay_eps
                        assert isinstance(self.replay_buffer,
                                          PrioritizedReplayBuffer)
                        self.replay_buffer.update_priorities(
                            batch_idxes, new_priorities)

                if can_sample and self.num_timesteps > self.learning_starts and \
                        self.num_timesteps % self.target_network_update_freq == 0:
                    # Update target network periodically.
                    self.update_target(sess=self.sess)

                if step % self.sub_models[0].train_freq == 0:
                    mb_infos_vals = []
                    for m in self.sub_models:
                        # Update policy, critics and target networks
                        for grad_step in range(m.gradient_steps):
                            # Break if the warmup phase is not over
                            # or if there are not enough samples in the replay buffer
                            if not m.replay_buffer.can_sample(m.batch_size) \
                               or self.num_timesteps < m.learning_starts:
                                break
                            n_updates += 1
                            # Compute current learning_rate
                            frac = 1.0 - step / total_timesteps
                            current_lr = m.learning_rate(frac)
                            # Update policy and critics (q functions)
                            mb_infos_vals.append(
                                m._train_step(step, writer, current_lr))
                            # Update target network
                            if (step +
                                    grad_step) % m.target_update_interval == 0:
                                # Update target network
                                m.sess.run(m.target_update_op)

                if len(episode_rewards[-101:-1]) == 0:
                    mean_100ep_reward = -np.inf
                else:
                    mean_100ep_reward = round(
                        float(np.mean(episode_rewards[-101:-1])), 1)

                num_episodes = len(episode_rewards)
                # print(done, log_interval, len(episode_rewards), self.num_timesteps)
                if self.verbose >= 1 and done and log_interval is not None and len(
                        episode_rewards) % log_interval == 0:
                    logger.record_tabular("steps", self.num_timesteps)
                    prev_macro_choices = np.array(prev_macro_choices)
                    macro_choices_ratio = [
                        '%.2f' %
                        ((prev_macro_choices[prev_macro_choices == i]).size /
                         prev_macro_choices.size)
                        for i in range(self.n_actions)
                    ]
                    logger.record_tabular("macro choices", macro_choices_ratio)
                    logger.record_tabular("episodes", num_episodes)
                    if len(episode_successes) > 0:
                        logger.logkv("success rate",
                                     np.mean(episode_successes[-100:]))
                    logger.record_tabular("mean 100 episode reward",
                                          mean_100ep_reward)
                    logger.record_tabular(
                        "% time spent exploring",
                        int(100 * self.exploration.value(self.num_timesteps)))
                    logger.logkv("n_updates_of_sub", n_updates)
                    logger.dump_tabular()
                    print("macro choices", prev_macro_choices)

                self.num_timesteps += 1

        return self

    def eval(self,
             total_episodes,
             callback=None,
             log_interval=100,
             tb_log_name="DQN",
             reset_num_timesteps=True,
             replay_wrapper=None):

        new_tb_log = self._init_num_timesteps(reset_num_timesteps)
        with SetVerbosity(self.verbose), TensorboardWriter(self.graph, self.tensorboard_log, tb_log_name, new_tb_log) \
                as writer:
            episode_rewards = [0.0]
            episode_successes = []
            obs = self.env.reset()
            reset = True
            macro_count = 0
            macro_len = self.macro_len
            macro_choices = []
            n_updates = 0
            macro_action = None

            # for step in range(total_timesteps):
            while True:
                with self.sess.as_default():
                    if reset or macro_count % macro_len == 0:
                        # macro_action = self.act(np.array(obs)[None], update_eps=0, **{})[0]
                        macro_actions, _, _ = self.step_model.step(
                            np.array(obs)[None], deterministic=False)
                        macro_action = macro_actions[0]
                        # macro_action = 1
                        macro_obs = obs
                        reward_in_one_macro = 0
                    macro_count += 1
                    macro_choices.append(macro_action)

                current_sub = self.sub_models[macro_action]
                action = current_sub.policy_tf.step(
                    obs[None], deterministic=True).flatten()
                # Add noise to the action (improve exploration,
                # not needed in general)
                if current_sub.action_noise is not None:
                    action = np.clip(action + current_sub.action_noise(), -1,
                                     1)
                # inferred actions need to be transformed to environment action_space before stepping
                unscaled_action = unscale_action(self.env.action_space, action)
                assert action.shape == self.env.action_space.shape

                reset = False
                new_obs, rew, done, info = self.env.step(unscaled_action)
                episode_rewards[-1] += rew
                rew -= self.args.policy_cost_coef * self.args.sub_policy_costs[
                    macro_action]
                reward_in_one_macro += rew
                obs = new_obs

                # print("step: %d, done: %d" % (self.num_timesteps, done))
                if done:
                    maybe_is_success = info.get('is_success')
                    if maybe_is_success is not None:
                        episode_successes.append(float(maybe_is_success))
                    if not isinstance(self.env, VecEnv):
                        obs = self.env.reset()
                    episode_rewards.append(0.0)
                    reset = True
                    macro_action = None
                    macro_count = 0
                    print("=" * 70)
                    print("macro_choices:", macro_choices)
                    print("return:", episode_rewards[-2])
                    print("=" * 70)
                    prev_macro_choices = macro_choices
                    macro_choices = []
                    if len(episode_rewards) - 1 == total_episodes:
                        break

                if len(episode_rewards[-101:-1]) == 0:
                    mean_100ep_reward = -np.inf
                else:
                    mean_100ep_reward = round(
                        float(np.mean(episode_rewards[-101:-1])), 1)

                num_episodes = len(episode_rewards)
                # print(done, log_interval, len(episode_rewards), self.num_timesteps)
                if self.verbose >= 1 and done and log_interval is not None and len(
                        episode_rewards) % log_interval == 0:
                    logger.record_tabular("steps", self.num_timesteps)
                    logger.record_tabular("macro choices",
                                          np.mean(prev_macro_choices))
                    logger.record_tabular("episodes", num_episodes)
                    if len(episode_successes) > 0:
                        logger.logkv("success rate",
                                     np.mean(episode_successes[-100:]))
                    logger.record_tabular("mean 100 episode reward",
                                          mean_100ep_reward)
                    logger.logkv("n_updates_of_sub", n_updates)
                    logger.dump_tabular()
                    print("macro choices", prev_macro_choices)

                self.num_timesteps += 1

        return self

    def predict(self,
                observation,
                state=None,
                mask=None,
                deterministic=True,
                args=None):
        observation = np.array(observation)
        vectorized_env = self._is_vectorized_observation(
            observation, self.observation_space)

        observation = observation.reshape((-1, ) +
                                          self.observation_space.shape)
        with self.sess.as_default():
            if self.macro_count % self.macro_len == 0:
                macro_actions, _, _ = self.step_model.step(
                    observation, deterministic=deterministic)
                # macro_actions = self.act(observation, update_eps=0)
                self.macro_act = macro_actions[
                    0]  # not supporting vectorized_env
                if args.eval_certain_sub is not None:
                    self.macro_act = args.eval_certain_sub
            self.macro_count += 1

        # Sample from sub_policy
        current_sub = self.sub_models[self.macro_act]
        action = current_sub.policy_tf.step(observation,
                                            deterministic=deterministic)
        action = action.reshape(
            (-1, ) +
            self.env.action_space.shape)  # reshape to the correct action shape
        # inferred actions need to be transformed to environment action_space before stepping
        unscaled_action = unscale_action(self.env.action_space, action)

        unscaled_action = unscaled_action[0]  # not supporting vectorized_env

        return self.macro_act, unscaled_action, None

    def action_probability(self,
                           observation,
                           state=None,
                           mask=None,
                           actions=None,
                           logp=False):
        observation = np.array(observation)
        vectorized_env = self._is_vectorized_observation(
            observation, self.observation_space)

        observation = observation.reshape((-1, ) +
                                          self.observation_space.shape)
        actions_proba = self.proba_step(observation, state, mask)

        if actions is not None:  # comparing the action distribution, to given actions
            actions = np.array([actions])
            assert isinstance(self.action_space, gym.spaces.Discrete)
            actions = actions.reshape((-1, ))
            assert observation.shape[0] == actions.shape[
                0], "Error: batch sizes differ for actions and observations."
            actions_proba = actions_proba[np.arange(actions.shape[0]), actions]
            # normalize action proba shape
            actions_proba = actions_proba.reshape((-1, 1))
            if logp:
                actions_proba = np.log(actions_proba)

        if not vectorized_env:
            if state is not None:
                raise ValueError(
                    "Error: The environment must be vectorized when using recurrent policies."
                )
            actions_proba = actions_proba[0]

        return actions_proba

    def get_parameter_list(self):
        return self.params

    def save(self, save_path, cloudpickle=False):
        # params
        data = {
            "double_q": self.double_q,
            "param_noise": self.param_noise,
            "learning_starts": self.learning_starts,
            "train_freq": self.train_freq,
            "prioritized_replay": self.prioritized_replay,
            "prioritized_replay_eps": self.prioritized_replay_eps,
            "batch_size": self.batch_size,
            "target_network_update_freq": self.target_network_update_freq,
            "prioritized_replay_alpha": self.prioritized_replay_alpha,
            "prioritized_replay_beta0": self.prioritized_replay_beta0,
            "prioritized_replay_beta_iters":
            self.prioritized_replay_beta_iters,
            "exploration_final_eps": self.exploration_final_eps,
            "exploration_fraction": self.exploration_fraction,
            "learning_rate": self.learning_rate,
            "gamma": self.gamma,
            "verbose": self.verbose,
            "observation_space": self.observation_space,
            "action_space": self.action_space,
            "policy": self.policy,
            "n_envs": self.n_envs,
            "n_cpu_tf_sess": self.n_cpu_tf_sess,
            "seed": self.seed,
            "_vectorize_action": self._vectorize_action,
            "policy_kwargs": self.policy_kwargs
        }

        params_to_save = self.get_parameters()

        self._save_to_file(save_path,
                           data=data,
                           params=params_to_save,
                           cloudpickle=cloudpickle)
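
# Note: unlike the usual stable-baselines API, predict() above returns a triple
# (macro_action, unscaled_low_level_action, None) and expects an `args` object with an
# `eval_certain_sub` attribute when a specific sub-policy should be forced.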
Code example #10
class TD3(OffPolicyRLModel):
    """
    Twin Delayed DDPG (TD3)
    Addressing Function Approximation Error in Actor-Critic Methods.

    Original implementation: https://github.com/sfujim/TD3
    Paper: https://arxiv.org/pdf/1802.09477.pdf
    Introduction to TD3: https://spinningup.openai.com/en/latest/algorithms/td3.html

    :param policy: (TD3Policy or str) The policy model to use (MlpPolicy, CnnPolicy, LnMlpPolicy, ...)
    :param env: (Gym environment or str) The environment to learn from (if registered in Gym, can be str)
    :param gamma: (float) the discount factor
    :param learning_rate: (float or callable) learning rate for adam optimizer,
        the same learning rate will be used for all networks (Q-Values and Actor networks)
        it can be a function of the current progress (from 1 to 0)
    :param buffer_size: (int) size of the replay buffer
    :param batch_size: (int) Minibatch size for each gradient update
    :param tau: (float) the soft update coefficient ("polyak update" of the target networks, between 0 and 1)
    :param policy_delay: (int) Policy and target networks will only be updated once every policy_delay training
        steps. The Q-values are updated policy_delay times more often (i.e. on every training step).
    :param action_noise: (ActionNoise) the action noise type. Cf DDPG for the different action noise type.
    :param target_policy_noise: (float) Standard deviation of gaussian noise added to target policy
        (smoothing noise)
    :param target_noise_clip: (float) Limit for absolute value of target policy smoothing noise.
    :param train_freq: (int) Update the model every `train_freq` steps.
    :param learning_starts: (int) how many steps of the model to collect transitions for before learning starts
    :param gradient_steps: (int) How many gradient updates to perform after each rollout step
    :param random_exploration: (float) Probability of taking a random action (as in an epsilon-greedy strategy)
        This is not needed for TD3 normally but can help exploring when using HER + TD3.
        This hack was present in the original OpenAI Baselines repo (DDPG + HER)
    :param verbose: (int) the verbosity level: 0 none, 1 training information, 2 tensorflow debug
    :param tensorboard_log: (str) the log location for tensorboard (if None, no logging)
    :param _init_setup_model: (bool) Whether or not to build the network at the creation of the instance
    :param policy_kwargs: (dict) additional arguments to be passed to the policy on creation
    :param full_tensorboard_log: (bool) enable additional logging when using tensorboard
        Note: this has no effect on TD3 logging for now
    """
    def __init__(self,
                 policy,
                 env,
                 gamma=0.99,
                 learning_rate=3e-4,
                 buffer_size=50000,
                 buffer_type=ReplayBuffer,
                 prioritization_starts=0,
                 beta_schedule=None,
                 learning_starts=100,
                 train_freq=100,
                 gradient_steps=100,
                 batch_size=128,
                 tau=0.005,
                 policy_delay=2,
                 action_noise=None,
                 action_l2_scale=0,
                 target_policy_noise=0.2,
                 target_noise_clip=0.5,
                 random_exploration=0.0,
                 verbose=0,
                 write_freq=1,
                 tensorboard_log=None,
                 _init_setup_model=True,
                 policy_kwargs=None,
                 full_tensorboard_log=False,
                 time_aware=False):

        super(TD3, self).__init__(policy=policy,
                                  env=env,
                                  replay_buffer=None,
                                  verbose=verbose,
                                  write_freq=write_freq,
                                  policy_base=TD3Policy,
                                  requires_vec_env=False,
                                  policy_kwargs=policy_kwargs)

        self.prioritization_starts = prioritization_starts
        self.beta_schedule = beta_schedule
        self.buffer_is_prioritized = buffer_type.__name__ in [
            "PrioritizedReplayBuffer", "RankPrioritizedReplayBuffer"
        ]
        self.loss_history = None
        self.buffer_type = buffer_type
        self.buffer_size = buffer_size
        self.learning_rate = learning_rate
        self.learning_starts = learning_starts
        self.train_freq = train_freq
        self.batch_size = batch_size
        self.tau = tau
        self.gradient_steps = gradient_steps
        self.gamma = gamma
        self.action_noise = action_noise
        self.action_l2_scale = action_l2_scale
        self.random_exploration = random_exploration
        self.policy_delay = policy_delay
        self.target_noise_clip = target_noise_clip
        self.target_policy_noise = target_policy_noise

        self.time_aware = time_aware

        self.graph = None
        self.replay_buffer = None
        self.episode_reward = None
        self.sess = None
        self.tensorboard_log = tensorboard_log
        self.verbose = verbose
        self.params = None
        self.summary = None
        self.policy_tf = None
        self.full_tensorboard_log = full_tensorboard_log

        self.obs_target = None
        self.target_policy_tf = None
        self.actions_ph = None
        self.rewards_ph = None
        self.terminals_ph = None
        self.observations_ph = None
        self.action_target = None
        self.next_observations_ph = None
        self.is_weights_ph = None
        self.step_ops = None
        self.target_ops = None
        self.infos_names = None
        self.target_params = None
        self.learning_rate_ph = None
        self.processed_obs_ph = None
        self.processed_next_obs_ph = None
        self.policy_out = None
        self.policy_train_op = None
        self.policy_loss = None

        self.active_sampling = False

        if _init_setup_model:
            self.setup_model()

    def _get_pretrain_placeholders(self):
        policy = self.policy_tf
        # Rescale
        policy_out = self.policy_out * np.abs(self.action_space.low)
        return policy.obs_ph, self.actions_ph, policy_out

    def setup_model(self):
        with SetVerbosity(self.verbose):
            self.graph = tf.Graph()
            with self.graph.as_default():
                n_cpu = multiprocessing.cpu_count()
                if sys.platform == 'darwin':
                    n_cpu //= 2
                self.sess = tf_util.make_session(num_cpu=n_cpu,
                                                 graph=self.graph)

                self.buffer_is_prioritized = self.buffer_type.__name__ in [
                    "PrioritizedReplayBuffer", "RankPrioritizedReplayBuffer"
                ]

                if self.replay_buffer is None:
                    if self.buffer_is_prioritized:
                        if (self.num_timesteps is not None and self.prioritization_starts > self.num_timesteps) or self.prioritization_starts > 0:
                            self.replay_buffer = ReplayBuffer(self.buffer_size)
                        else:
                            buffer_kw = {
                                "size": self.buffer_size,
                                "alpha": 0.7
                            }
                            if self.buffer_type.__name__ == "RankPrioritizedReplayBuffer":
                                buffer_kw.update({
                                    "learning_starts":
                                    self.prioritization_starts,
                                    "batch_size": self.batch_size
                                })
                            self.replay_buffer = self.buffer_type(**buffer_kw)
                    else:
                        self.replay_buffer = self.buffer_type(self.buffer_size)

                #self.replay_buffer = DiscrepancyReplayBuffer(self.buffer_size, scorer=self.policy_tf.get_q_discrepancy)

                with tf.variable_scope("input", reuse=False):
                    # Create policy and target TF objects
                    self.policy_tf = self.policy(self.sess,
                                                 self.observation_space,
                                                 self.action_space,
                                                 **self.policy_kwargs)
                    self.target_policy_tf = self.policy(
                        self.sess, self.observation_space, self.action_space,
                        **self.policy_kwargs)

                    # Initialize Placeholders
                    self.observations_ph = self.policy_tf.obs_ph
                    # Normalized observation for pixels
                    self.processed_obs_ph = self.policy_tf.processed_obs
                    self.next_observations_ph = self.target_policy_tf.obs_ph
                    self.processed_next_obs_ph = self.target_policy_tf.processed_obs
                    self.action_target = self.target_policy_tf.action_ph
                    self.terminals_ph = tf.placeholder(tf.float32,
                                                       shape=(None, 1),
                                                       name='terminals')
                    self.rewards_ph = tf.placeholder(tf.float32,
                                                     shape=(None, 1),
                                                     name='rewards')
                    self.actions_ph = tf.placeholder(tf.float32,
                                                     shape=(None, ) +
                                                     self.action_space.shape,
                                                     name='actions')
                    self.learning_rate_ph = tf.placeholder(
                        tf.float32, [], name="learning_rate_ph")

                with tf.variable_scope("model", reuse=False):
                    # Create the policy
                    self.policy_out = policy_out = self.policy_tf.make_actor(
                        self.processed_obs_ph)
                    self.policy_test = policy_test = self.policy_tf.make_actor(
                        self.processed_obs_ph, scope="pi_t")
                    # Use two Q-functions to improve performance by reducing overestimation bias
                    qf1, qf2 = self.policy_tf.make_critics(
                        self.processed_obs_ph, self.actions_ph)
                    # Q value when following the current policy
                    qf1_pi, qf2_pi = self.policy_tf.make_critics(
                        self.processed_obs_ph, policy_out, reuse=True)
                    qf1_pi_t, _ = self.policy_tf.make_critics(
                        self.processed_obs_ph, policy_test, reuse=True)

                with tf.variable_scope("target", reuse=False):
                    # Create target networks
                    target_policy_out = self.target_policy_tf.make_actor(
                        self.processed_next_obs_ph)
                    # Target policy smoothing, by adding clipped noise to target actions
                    target_noise = tf.random_normal(
                        tf.shape(target_policy_out),
                        stddev=self.target_policy_noise)
                    target_noise = tf.clip_by_value(target_noise,
                                                    -self.target_noise_clip,
                                                    self.target_noise_clip)
                    # Clip the noisy action to remain in the bounds [-1, 1] (output of a tanh)
                    noisy_target_action = tf.clip_by_value(
                        target_policy_out + target_noise, -1, 1)
                    # Q values when following the target policy
                    qf1_target, qf2_target = self.target_policy_tf.make_critics(
                        self.processed_next_obs_ph, noisy_target_action)

                with tf.variable_scope("loss", reuse=False):
                    # Take the min of the two target Q-Values (clipped Double-Q Learning)
                    min_qf_target = tf.minimum(qf1_target, qf2_target)

                    # Targets for Q value regression
                    q_backup = tf.stop_gradient(self.rewards_ph +
                                                (1 - self.terminals_ph) *
                                                self.gamma * min_qf_target)
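                    # i.e. y = r + gamma * (1 - done) * min(Q1_target(s', a'), Q2_target(s', a')),
                    # where a' is the noisy (smoothed) target action built in the "target" scope above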

                    # Compute Q-Function loss
                    if self.buffer_is_prioritized:
                        self.is_weights_ph = tf.placeholder(tf.float32,
                                                            shape=(None, 1),
                                                            name="is_weights")
                        qf1_loss = tf.reduce_mean(self.is_weights_ph *
                                                  (q_backup - qf1)**2)
                        qf2_loss = tf.reduce_mean(self.is_weights_ph *
                                                  (q_backup - qf2)**2)
                    else:
                        qf1_loss = tf.reduce_mean((q_backup - qf1)**2)
                        qf2_loss = tf.reduce_mean((q_backup - qf2)**2)

                    q_discrepancy = tf.abs(qf1_pi - qf2_pi)
                    self.q_disc_strength_ph = tf.placeholder(
                        tf.float32, [], name="q_disc_strength_ph")
                    self.q_disc_strength_schedule = ExponentialSchedule(
                        int(1e5), 30, 0, rate=10)

                    qvalues_losses = qf1_loss + qf2_loss

                    rew_loss = tf.reduce_mean(qf1_pi)
                    q_disc_loss = tf.reduce_mean(
                        q_discrepancy
                    )  #self.q_disc_strength_ph * tf.reduce_mean(q_discrepancy)
                    action_loss = self.action_l2_scale * tf.nn.l2_loss(
                        self.policy_out)

                    # Policy loss: maximise q value
                    self.policy_loss = policy_loss = -(
                        rew_loss + q_disc_loss) + action_loss
                    self.policy_loss_t = policy_loss_t = -tf.reduce_mean(
                        qf1_pi_t)

                    # Policy train op
                    # will be called only every n training steps,
                    # where n is the policy delay
                    policy_optimizer = tf.train.AdamOptimizer(
                        learning_rate=self.learning_rate_ph)
                    policy_train_op = policy_optimizer.minimize(
                        policy_loss, var_list=get_vars('model/pi'))
                    self.policy_train_op = policy_train_op

                    policy_optimizer_t = tf.train.AdamOptimizer(
                        learning_rate=self.learning_rate_ph)
                    policy_train_op_t = policy_optimizer_t.minimize(
                        policy_loss_t, var_list=get_vars('model/pi_t'))
                    self.policy_train_op_t = policy_train_op_t

                    # Q Values optimizer
                    qvalues_optimizer = tf.train.AdamOptimizer(
                        learning_rate=self.learning_rate_ph)
                    qvalues_params = get_vars('model/values_fn/')

                    # Q Values and policy target params
                    source_params = get_vars("model/")
                    target_params = get_vars("target/")

                    source_params = [
                        param for param in source_params
                        if "pi_t" not in param.name
                    ]

                    # Polyak averaging for target variables
                    self.target_ops = [
                        tf.assign(target,
                                  (1 - self.tau) * target + self.tau * source)
                        for target, source in zip(target_params, source_params)
                    ]

                    # Initializing target to match source variables
                    target_init_op = [
                        tf.assign(target, source)
                        for target, source in zip(target_params, source_params)
                    ]

                    train_values_op = qvalues_optimizer.minimize(
                        qvalues_losses, var_list=qvalues_params)

                    self.infos_names = ['qf1_loss', 'qf2_loss']
                    # All ops to call during one training step
                    self.step_ops = [
                        qf1_loss, qf2_loss, qf1, qf2, train_values_op,
                        q_discrepancy
                    ]

                    # Monitor losses in tensorboard
                    tf.summary.scalar("rew_loss", rew_loss)
                    tf.summary.scalar("q_disc_loss", q_disc_loss)
                    tf.summary.scalar("action_loss", action_loss)
                    tf.summary.scalar('policy_loss', policy_loss)
                    tf.summary.scalar("policy_loss_t", policy_loss_t)
                    tf.summary.scalar('qf1_loss', qf1_loss)
                    tf.summary.scalar('qf2_loss', qf2_loss)
                    tf.summary.scalar('learning_rate',
                                      tf.reduce_mean(self.learning_rate_ph))

                # Retrieve parameters that must be saved
                self.params = get_vars("model")
                self.target_params = get_vars("target/")

                # Initialize Variables and target network
                with self.sess.as_default():
                    self.sess.run(tf.global_variables_initializer())
                    self.sess.run(target_init_op)

                self.summary = tf.summary.merge_all()

    def _train_step(self, step, writer, learning_rate, update_policy):
        # Sample a batch from the replay buffer
        if self.buffer_is_prioritized and self.num_timesteps >= self.prioritization_starts:
            batch = self.replay_buffer.sample(self.batch_size,
                                              beta=self.beta_schedule(
                                                  self.num_timesteps))
            batch_obs, batch_actions, batch_rewards, batch_next_obs, batch_dones, batch_weights, batch_idxs = batch
            if len(batch_weights.shape) == 1:
                batch_weights = np.expand_dims(batch_weights, axis=1)
        else:
            batch = self.replay_buffer.sample(self.batch_size)
            batch_obs, batch_actions, batch_rewards, batch_next_obs, batch_dones = batch
            if self.buffer_is_prioritized:
                batch_weights = np.ones(shape=(self.batch_size, 1))

        feed_dict = {
            self.observations_ph:
            batch_obs,
            self.actions_ph:
            batch_actions,
            self.next_observations_ph:
            batch_next_obs,
            self.rewards_ph:
            batch_rewards.reshape(self.batch_size, -1),
            self.terminals_ph:
            batch_dones.reshape(self.batch_size, -1),
            self.learning_rate_ph:
            learning_rate,
            self.q_disc_strength_ph:
            self.q_disc_strength_schedule(self.num_timesteps)
        }

        if self.buffer_is_prioritized:
            feed_dict[self.is_weights_ph] = batch_weights

        step_ops = self.step_ops
        if update_policy:
            # Update policy and target networks
            step_ops = step_ops + [
                self.policy_train_op, self.policy_train_op_t, self.target_ops,
                self.policy_loss, self.policy_loss_t
            ]

        # Do one gradient step
        # and optionally compute log for tensorboard
        if writer is not None:
            out = self.sess.run([self.summary] + step_ops, feed_dict)
            summary = out.pop(0)
            writer.add_summary(summary, step)
        else:
            out = self.sess.run(step_ops, feed_dict)

        # Unpack to monitor losses
        q_discrepancies = out.pop(5)
        qf1_loss, qf2_loss, *_values = out

        if self.buffer_is_prioritized and self.num_timesteps >= self.prioritization_starts:
            if isinstance(self.replay_buffer,
                          HindsightExperienceReplayWrapper):
                self.replay_buffer.replay_buffer.update_priorities(
                    batch_idxs, q_discrepancies)
            else:
                self.replay_buffer.update_priorities(batch_idxs,
                                                     q_discrepancies)

        return qf1_loss, qf2_loss, q_discrepancies

    def learn(self,
              total_timesteps,
              callback=None,
              seed=None,
              log_interval=4,
              tb_log_name="TD3",
              reset_num_timesteps=True,
              replay_wrapper=None):

        new_tb_log = self._init_num_timesteps(reset_num_timesteps)
        last_replay_update = 0

        if replay_wrapper is not None:
            self.replay_buffer = replay_wrapper(self.replay_buffer)

        with SetVerbosity(self.verbose), TensorboardWriter(self.graph, self.tensorboard_log, tb_log_name, new_tb_log) \
                as writer:

            self._setup_learn(seed)

            # Transform to callable if needed
            self.learning_rate = get_schedule_fn(self.learning_rate)
            # Initial learning rate
            current_lr = self.learning_rate(1)

            start_time = time.time()
            episode_rewards = [0.0]
            episode_successes = []
            if self.action_noise is not None:
                self.action_noise.reset()
            obs = self.env.reset()
            self.episode_reward = np.zeros((1, ))
            ep_info_buf = deque(maxlen=100)
            n_updates = 0
            infos_values = []
            self.active_sampling = False
            initial_step = self.num_timesteps

            if self.buffer_is_prioritized and \
                    ((replay_wrapper is not None and type(self.replay_buffer.replay_buffer).__name__ == "ReplayBuffer")
                     or (replay_wrapper is None and type(self.replay_buffer).__name__ == "ReplayBuffer")) \
                    and self.num_timesteps >= self.prioritization_starts:
                self._set_prioritized_buffer()

            for step in range(initial_step, total_timesteps):
                if callback is not None:
                    # Only stop training if return value is False, not when it is None. This is for backwards
                    # compatibility with callbacks that have no return statement.
                    if callback(locals(), globals()) is False:
                        break

                # Before training starts, randomly sample actions
                # from a uniform distribution for better exploration.
                # Afterwards, use the learned policy
                # if random_exploration is set to 0 (normal setting)
                if (self.num_timesteps < self.learning_starts
                        or np.random.rand() < self.random_exploration):
                    # No need to rescale when sampling random action
                    rescaled_action = action = self.env.action_space.sample()
                else:
                    action = self.policy_tf.step(obs[None]).flatten()
                    # Add noise to the action, as the policy
                    # is deterministic, this is required for exploration
                    if self.action_noise is not None:
                        action = np.clip(action + self.action_noise(), -1, 1)
                    # Rescale from [-1, 1] to the correct bounds
                    rescaled_action = action * np.abs(self.action_space.low)

                assert action.shape == self.env.action_space.shape

                new_obs, reward, done, info = self.env.step(rescaled_action)

                # Store transition in the replay buffer.
                self.replay_buffer.add(
                    obs, action, reward, new_obs,
                    float(done if not self.time_aware else done
                          and info["termination"] != "steps"))
                obs = new_obs

                if ((replay_wrapper is not None and type(self.replay_buffer.replay_buffer).__name__ == "RankPrioritizedReplayBuffer")
                        or type(self.replay_buffer).__name__ == "RankPrioritizedReplayBuffer") and \
                        self.num_timesteps % self.buffer_size == 0:
                    self.replay_buffer.rebalance()

                # Retrieve reward and episode length if using Monitor wrapper
                maybe_ep_info = info.get('episode')
                if maybe_ep_info is not None:
                    ep_info_buf.extend([maybe_ep_info])

                if writer is not None:
                    # Write reward per episode to tensorboard
                    ep_reward = np.array([reward]).reshape((1, -1))
                    ep_done = np.array([done]).reshape((1, -1))
                    self.episode_reward = total_episode_reward_logger(
                        self.episode_reward, ep_reward, ep_done, writer,
                        self.num_timesteps)

                if step % self.train_freq == 0:
                    mb_infos_vals = []
                    # Update policy, critics and target networks
                    for grad_step in range(self.gradient_steps):
                        # Break if the warmup phase is not over
                        # or if there are not enough samples in the replay buffer
                        if not self.replay_buffer.can_sample(self.batch_size) \
                                or self.num_timesteps < self.learning_starts:
                            break
                        n_updates += 1
                        # Compute current learning_rate
                        frac = 1.0 - self.num_timesteps / total_timesteps
                        current_lr = self.learning_rate(frac)
                        # Update policy and critics (q functions)
                        # Note: the policy is updated less frequently than the Q functions
                        # this is controlled by the `policy_delay` parameter
                        step_writer = writer if grad_step % self.write_freq == 0 else None
                        mb_infos_vals.append(
                            self._train_step(step, step_writer, current_lr,
                                             (step + grad_step) %
                                             self.policy_delay == 0))

                    # Log losses, useful for monitoring training
                    if len(mb_infos_vals) > 0:
                        infos_values = np.mean(mb_infos_vals, axis=0)

                episode_rewards[-1] += reward
                if done:
                    if isinstance(self.replay_buffer, DiscrepancyReplayBuffer
                                  ) and n_updates - last_replay_update >= 5000:
                        self.replay_buffer.update_priorities()
                        last_replay_update = n_updates
                    if self.action_noise is not None:
                        self.action_noise.reset()
                    if not isinstance(self.env, VecEnv):
                        if self.active_sampling:
                            sample_obs, sample_state = self.env.get_random_initial_states(
                                25)
                            obs_discrepancies = self.policy_tf.get_q_discrepancy(
                                sample_obs)
                            obs = self.env.reset(
                                **sample_state[np.argmax(obs_discrepancies)])
                        else:
                            obs = self.env.reset()
                    episode_rewards.append(0.0)

                    maybe_is_success = info.get('is_success')
                    if maybe_is_success is not None:
                        episode_successes.append(float(maybe_is_success))

                if len(episode_rewards[-101:-1]) == 0:
                    mean_reward = -np.inf
                else:
                    mean_reward = round(
                        float(np.mean(episode_rewards[-101:-1])), 1)

                num_episodes = len(episode_rewards)
                self.num_timesteps += 1

                if self.buffer_is_prioritized and \
                        ((replay_wrapper is not None and type(self.replay_buffer.replay_buffer).__name__ == "ReplayBuffer")
                         or (replay_wrapper is None and type(self.replay_buffer).__name__ == "ReplayBuffer"))\
                        and self.num_timesteps >= self.prioritization_starts:
                    self._set_prioritized_buffer()

                # Display training infos
                if self.verbose >= 1 and done and log_interval is not None and len(
                        episode_rewards) % log_interval == 0:
                    fps = int(step / (time.time() - start_time))
                    logger.logkv("episodes", num_episodes)
                    logger.logkv("mean 100 episode reward", mean_reward)
                    if len(ep_info_buf) > 0 and len(ep_info_buf[0]) > 0:
                        logger.logkv(
                            'ep_rewmean',
                            safe_mean(
                                [ep_info['r'] for ep_info in ep_info_buf]))
                        logger.logkv(
                            'eplenmean',
                            safe_mean(
                                [ep_info['l'] for ep_info in ep_info_buf]))
                    logger.logkv("n_updates", n_updates)
                    logger.logkv("current_lr", current_lr)
                    logger.logkv("fps", fps)
                    logger.logkv('time_elapsed', int(time.time() - start_time))
                    if len(episode_successes) > 0:
                        logger.logkv("success rate",
                                     np.mean(episode_successes[-100:]))
                    if len(infos_values) > 0:
                        for (name, val) in zip(self.infos_names, infos_values):
                            logger.logkv(name, val)
                    logger.logkv("total timesteps", self.num_timesteps)
                    logger.dumpkvs()
                    # Reset infos:
                    infos_values = []
            return self

    def action_probability(self,
                           observation,
                           state=None,
                           mask=None,
                           actions=None,
                           logp=False):
        _ = np.array(observation)

        if actions is not None:
            raise ValueError("Error: TD3 does not have action probabilities.")

        # here there are no action probabilities, as TD3 (like DDPG) uses a deterministic policy
        warnings.warn(
            "Warning: action probability is meaningless for TD3. Returning None"
        )
        return None

    def predict(self, observation, state=None, mask=None, deterministic=True):
        observation = np.array(observation)
        vectorized_env = self._is_vectorized_observation(
            observation, self.observation_space)

        observation = observation.reshape((-1, ) +
                                          self.observation_space.shape)
        actions = self.policy_tf.step(observation, test=deterministic)

        if self.action_noise is not None and not deterministic:
            actions = np.clip(actions + self.action_noise(), -1, 1)

        actions = actions.reshape(
            (-1, ) +
            self.action_space.shape)  # reshape to the correct action shape
        actions = actions * np.abs(
            self.action_space.low)  # scale the output for the prediction

        if not vectorized_env:
            actions = actions[0]

        return actions, None

    def _get_env(self):
        env = self.env
        env = env.env

        return env

    def _set_prioritized_buffer(self):
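        # Swap the plain ReplayBuffer for the configured prioritized buffer type:
        # copy every stored transition over and seed its priority with the policy's
        # Q-discrepancy estimate for that observation.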
        buffer_kw = {"size": self.buffer_size, "alpha": 0.7}
        if self.buffer_type.__name__ == "RankPrioritizedReplayBuffer":
            buffer_kw.update({
                "learning_starts": self.prioritization_starts,
                "batch_size": self.batch_size
            })
        r_buf = self.buffer_type(**buffer_kw)

        for i, transition in enumerate(self.replay_buffer._storage):
            r_buf.add(*transition)
            r_buf.update_priorities([i],
                                    self.policy_tf.get_q_discrepancy(
                                        transition[0])[0])
        if type(r_buf).__name__ == "RankPrioritizedReplayBuffer":
            r_buf.rebalance()
        if isinstance(self.replay_buffer, HindsightExperienceReplayWrapper):
            self.replay_buffer.replay_buffer = r_buf
        else:
            self.replay_buffer = r_buf
        self.learning_rate = get_schedule_fn(
            self.learning_rate(1) / 4)  # TODO: will not work with non-constant
        self.beta_schedule = get_schedule_fn(self.beta_schedule)
        print("Enabled prioritized replay buffer")

    def get_parameter_list(self):
        return (self.params + self.target_params)

    def save(self, save_path):
        data = {
            "learning_rate": self.learning_rate,
            "buffer_size": self.buffer_size,
            "learning_starts": self.learning_starts,
            "train_freq": self.train_freq,
            "batch_size": self.batch_size,
            "tau": self.tau,
            # Should we also store the replay buffer?
            # this may lead to high memory usage
            # with all transition inside
            "replay_buffer": self.replay_buffer,
            "policy_delay": self.policy_delay,
            "target_noise_clip": self.target_noise_clip,
            "target_policy_noise": self.target_policy_noise,
            "gamma": self.gamma,
            "verbose": self.verbose,
            "observation_space": self.observation_space,
            "action_space": self.action_space,
            "policy": self.policy,
            "n_envs": self.n_envs,
            "action_noise": self.action_noise,
            "random_exploration": self.random_exploration,
            "_vectorize_action": self._vectorize_action,
            "policy_kwargs": self.policy_kwargs,
            "num_timesteps": self.num_timesteps
        }

        params_to_save = self.get_parameters()

        self._save_to_file(save_path, data=data, params=params_to_save)
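
A minimal usage sketch for the TD3 variant above (not part of the original source). It assumes a continuous-control Gym environment such as Pendulum-v0, that the "MlpPolicy" string can be resolved through the TD3Policy base, and that NormalActionNoise is importable from stable_baselines.common.noise (the exact import path may differ between stable-baselines versions); hyperparameters are illustrative only.

import gym
import numpy as np
from stable_baselines.common.noise import NormalActionNoise  # assumed import path

env = gym.make("Pendulum-v0")
n_actions = env.action_space.shape[-1]
# Gaussian exploration noise added to the deterministic policy output during rollouts
action_noise = NormalActionNoise(mean=np.zeros(n_actions), sigma=0.1 * np.ones(n_actions))

model = TD3("MlpPolicy", env,
            action_noise=action_noise,
            learning_starts=100,
            train_freq=100,
            gradient_steps=100,
            batch_size=128,
            verbose=1)
model.learn(total_timesteps=10000)
model.save("td3_pendulum")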
Code example #11
0
File: dqn.py Project: porterjenkins/allocation-rl
class DQN(OffPolicyRLModel):
    """
    The DQN model class. DQN paper: https://arxiv.org/pdf/1312.5602.pdf

    :param policy: (DQNPolicy or str) The policy model to use (MlpPolicy, CnnPolicy, LnMlpPolicy, ...)
    :param env: (Gym environment or str) The environment to learn from (if registered in Gym, can be str)
    :param gamma: (float) discount factor
    :param learning_rate: (float) learning rate for adam optimizer
    :param buffer_size: (int) size of the replay buffer
    :param exploration_fraction: (float) fraction of entire training period over which the exploration rate is
            annealed
    :param exploration_final_eps: (float) final value of random action probability
    :param train_freq: (int) update the model every `train_freq` steps.
    :param batch_size: (int) size of a batched sampled from replay buffer for training
    :param checkpoint_freq: (int) how often to save the model. This is so that the best version is restored at the
            end of the training. If you do not wish to restore the best version
            at the end of the training set this variable to None.
    :param checkpoint_path: (str) replacement path used if you need to log to somewhere else than a temporary
            directory.
    :param learning_starts: (int) how many steps of the model to collect transitions for before learning starts
    :param target_network_update_freq: (int) update the target network every `target_network_update_freq` steps.
    :param prioritized_replay: (bool) if True prioritized replay buffer will be used.
    :param prioritized_replay_alpha: (float) alpha parameter for prioritized replay buffer.
        It determines how much prioritization is used, with alpha=0 corresponding to the uniform case.
    :param prioritized_replay_beta0: (float) initial value of beta for prioritized replay buffer
    :param prioritized_replay_beta_iters: (int) number of iterations over which beta will be annealed from initial
            value to 1.0. If set to None equals to max_timesteps.
    :param prioritized_replay_eps: (float) epsilon to add to the TD errors when updating priorities.
    :param param_noise: (bool) Whether or not to apply noise to the parameters of the policy.
    :param verbose: (int) the verbosity level: 0 none, 1 training information, 2 tensorflow debug
    :param tensorboard_log: (str) the log location for tensorboard (if None, no logging)
    :param _init_setup_model: (bool) Whether or not to build the network at the creation of the instance
    :param full_tensorboard_log: (bool) enable additional logging when using tensorboard
        WARNING: this logging can take a lot of space quickly
    """
    def __init__(self,
                 policy,
                 env,
                 gamma=0.99,
                 learning_rate=5e-4,
                 buffer_size=50000,
                 exploration_fraction=0.1,
                 exploration_final_eps=0.02,
                 train_freq=1,
                 batch_size=32,
                 checkpoint_freq=10000,
                 checkpoint_path=None,
                 learning_starts=1000,
                 target_network_update_freq=500,
                 prioritized_replay=False,
                 prioritized_replay_alpha=0.6,
                 prioritized_replay_beta0=0.4,
                 prioritized_replay_beta_iters=None,
                 prioritized_replay_eps=1e-6,
                 param_noise=False,
                 verbose=0,
                 tensorboard_log=None,
                 _init_setup_model=True,
                 policy_kwargs=None,
                 full_tensorboard_log=False):

        # TODO: replay_buffer refactoring
        super(DQN, self).__init__(policy=policy,
                                  env=env,
                                  replay_buffer=None,
                                  verbose=verbose,
                                  policy_base=DQNPolicy,
                                  requires_vec_env=False,
                                  policy_kwargs=policy_kwargs)

        self.checkpoint_path = checkpoint_path
        self.param_noise = param_noise
        self.learning_starts = learning_starts
        self.train_freq = train_freq
        self.prioritized_replay = prioritized_replay
        self.prioritized_replay_eps = prioritized_replay_eps
        self.batch_size = batch_size
        self.target_network_update_freq = target_network_update_freq
        self.checkpoint_freq = checkpoint_freq
        self.prioritized_replay_alpha = prioritized_replay_alpha
        self.prioritized_replay_beta0 = prioritized_replay_beta0
        self.prioritized_replay_beta_iters = prioritized_replay_beta_iters
        self.exploration_final_eps = exploration_final_eps
        self.exploration_fraction = exploration_fraction
        self.buffer_size = buffer_size
        self.learning_rate = learning_rate
        self.gamma = gamma
        self.tensorboard_log = tensorboard_log
        self.full_tensorboard_log = full_tensorboard_log

        self.graph = None
        self.sess = None
        self._train_step = None
        self.step_model = None
        self.update_target = None
        self.act = None
        self.proba_step = None
        self.replay_buffer = None
        self.beta_schedule = None
        self.exploration = None
        self.params = None
        self.summary = None
        self.episode_reward = None

        if _init_setup_model:
            self.setup_model()

    def _get_pretrain_placeholders(self):
        policy = self.step_model
        return policy.obs_ph, tf.placeholder(tf.int32, [None]), policy.q_values

    @staticmethod
    def _is_vectorized_observation(observation, observation_space):
        """
        For every observation type, detects and validates the shape,
        then returns whether or not the observation is vectorized.

        :param observation: (np.ndarray) the input observation to validate
        :param observation_space: (gym.spaces) the observation space
        :return: (bool) whether the given observation is vectorized or not
        """
        if isinstance(observation_space, gym.spaces.Box):
            if observation.shape == observation_space.shape:
                return False
            elif observation.shape[1:] == observation_space.shape:
                return True
            else:
                raise ValueError(
                    "Error: Unexpected observation shape {} for ".format(
                        observation.shape) +
                    "Box environment, please use {} ".format(
                        observation_space.shape) +
                    "or (n_env, {}) for the observation shape.".format(
                        ", ".join(map(str, observation_space.shape))))
        elif isinstance(observation_space, gym.spaces.Discrete):
            if observation.shape == (
            ):  # A numpy array of a number, has shape empty tuple '()'
                return False
            elif len(observation.shape) == 1:
                return True
            else:
                raise ValueError(
                    "Error: Unexpected observation shape {} for ".format(
                        observation.shape) +
                    "Discrete environment, please use (1,) or (n_env, 1) for the observation shape."
                )
        elif isinstance(observation_space, gym.spaces.MultiDiscrete):
            if observation.shape == (len(observation_space.nvec), ):
                return False
            elif len(observation.shape) == 2 and observation.shape[1] == len(
                    observation_space.nvec):
                return True
            else:
                raise ValueError(
                    "Error: Unexpected observation shape {} for MultiDiscrete "
                    .format(observation.shape) +
                    "environment, please use ({},) or ".format(
                        len(observation_space.nvec)) +
                    "(n_env, {}) for the observation shape.".format(
                        len(observation_space.nvec)))
        elif isinstance(observation_space, gym.spaces.MultiBinary):
            if observation.shape == (observation_space.n, ):
                return False
            elif len(observation.shape
                     ) == 2 and observation.shape[1] == observation_space.n:
                return True
            else:
                raise ValueError(
                    "Error: Unexpected observation shape {} for MultiBinary ".
                    format(observation.shape) +
                    "environment, please use ({},) or ".format(
                        observation_space.n) +
                    "(n_env, {}) for the observation shape.".format(
                        observation_space.n))
        elif isinstance(observation_space, gym.spaces.Dict):
            # Dict observations are not validated here; the method implicitly returns None,
            # which callers treat as a non-vectorized observation.
            pass
        else:
            raise ValueError(
                "Error: Cannot determine if the observation is vectorized with the space type {}."
                .format(observation_space))

    def setup_model(self):

        with SetVerbosity(self.verbose):
            assert not isinstance(self.action_space, gym.spaces.Box), \
                "Error: DQN cannot output a gym.spaces.Box action space."

            # If the policy is wrapped in functools.partial (e.g. to disable dueling)
            # unwrap it to check the class type
            if isinstance(self.policy, partial):
                test_policy = self.policy.func
            else:
                test_policy = self.policy
            assert issubclass(test_policy, DQNPolicy), "Error: the input policy for the DQN model must be " \
                                                       "an instance of DQNPolicy."

            self.graph = tf.Graph()
            with self.graph.as_default():
                self.sess = tf_util.make_session(graph=self.graph)

                optimizer = tf.train.AdamOptimizer(
                    learning_rate=self.learning_rate)

                self.act, self._train_step, self.update_target, self.step_model = deepq.build_train(
                    q_func=partial(self.policy, **self.policy_kwargs),
                    ob_space=self.observation_space,
                    ac_space=self.action_space,
                    optimizer=optimizer,
                    gamma=self.gamma,
                    grad_norm_clipping=10,
                    param_noise=self.param_noise,
                    sess=self.sess,
                    full_tensorboard_log=self.full_tensorboard_log)
                self.proba_step = self.step_model.proba_step
                self.params = tf_util.get_trainable_vars("deepq")

                # Initialize the parameters and copy them to the target network.
                tf_util.initialize(self.sess)
                self.update_target(sess=self.sess)

                self.summary = tf.summary.merge_all()

    def learn(self,
              total_timesteps,
              callback=None,
              seed=None,
              log_interval=100,
              tb_log_name="DQN",
              reset_num_timesteps=True,
              replay_wrapper=None,
              learning_curve=False,
              test_t=None):

        new_tb_log = self._init_num_timesteps(reset_num_timesteps)

        with SetVerbosity(self.verbose), TensorboardWriter(self.graph, self.tensorboard_log, tb_log_name, new_tb_log) \
                as writer:
            self._setup_learn(seed)

            # Create the replay buffer
            if self.prioritized_replay:
                self.replay_buffer = PrioritizedReplayBuffer(
                    self.buffer_size, alpha=self.prioritized_replay_alpha)
                if self.prioritized_replay_beta_iters is None:
                    prioritized_replay_beta_iters = total_timesteps
                else:
                    prioritized_replay_beta_iters = self.prioritized_replay_beta_iters
                self.beta_schedule = LinearSchedule(
                    prioritized_replay_beta_iters,
                    initial_p=self.prioritized_replay_beta0,
                    final_p=1.0)
            else:
                self.replay_buffer = ReplayBuffer(self.buffer_size)
                self.beta_schedule = None

            if replay_wrapper is not None:
                assert not self.prioritized_replay, "Prioritized replay buffer is not supported by HER"
                self.replay_buffer = replay_wrapper(self.replay_buffer)

            # Create the schedule for exploration starting from 1.
            self.exploration = LinearSchedule(
                schedule_timesteps=int(self.exploration_fraction *
                                       total_timesteps),
                initial_p=1.0,
                final_p=self.exploration_final_eps)
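            # Example: with total_timesteps=100000 and exploration_fraction=0.1, epsilon
            # decays linearly from 1.0 to exploration_final_eps over the first 10000 steps
            # and then stays constant.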

            episode_rewards = [0.0]
            self.cumul_reward = [0.0]
            episode_successes = []
            obs = self.env.reset()
            reset = True
            self.episode_reward = np.zeros((1, ))

            # variables for test eval (test_t must be provided when learning_curve=True)
            test_step = test_t * 3 if test_t is not None else None
            test_results = {'sum': []}
            test_ts = []

            for _ in range(total_timesteps):

                ## Test eval period ##
                if learning_curve and _ % test_step == 0 and _ > 0:
                    print("--> Simulating test period")
                    self.env.reset()
                    test_r = 0.0
                    for i in range(test_t):
                        feasible_actions = AllocationEnv.get_feasible_actions(
                            obs["board_config"])
                        action_mask = AllocationEnv.get_action_mask(
                            feasible_actions, self.env.action_space.n)
                        action, _states = self.predict(obs, mask=action_mask)
                        action = AllocationEnv.check_action(
                            obs['board_config'], action)
                        obs, rewards, dones, info = self.env.step(action)
                        test_r += rewards

                    test_results["sum"].append(test_r)
                    test_ts.append(_)
                    self.env.reset()

                    # plot test eval progress
                    plt.plot(test_ts, test_results["sum"])
                    # plt.errorbar(iteration_cuts, results["mean"], yerr=results["std"], fmt='.k')
                    plt.xlabel("Iteration count")
                    plt.ylabel("Total (sum) test reward")
                    plt.savefig("figs/rl-learning-curve-{}.pdf".format(
                        cfg.vals['prj_name']))
                    plt.clf()
                    plt.close()

                    # write test eval progress
                    write_results = {}
                    for k, v in test_results.items():
                        write_results[k] = serialize_floats(v)

                    with open(
                            "output/rl-learning-curve-{}.json".format(
                                cfg.vals['prj_name']), 'w') as f:
                        json.dump(write_results, f)

                if callback is not None:
                    # Only stop training if return value is False, not when it is None. This is for backwards
                    # compatibility with callbacks that have no return statement.
                    if callback(locals(), globals()) is False:
                        break
                # Take action and update exploration to the newest value
                kwargs = {}
                if not self.param_noise:
                    update_eps = self.exploration.value(self.num_timesteps)
                    update_param_noise_threshold = 0.
                else:
                    update_eps = 0.
                    # Compute the threshold such that the KL divergence between perturbed and non-perturbed
                    # policy is comparable to eps-greedy exploration with eps = exploration.value(t).
                    # See Appendix C.1 in Parameter Space Noise for Exploration, Plappert et al., 2017
                    # for detailed explanation.
                    update_param_noise_threshold = \
                        -np.log(1. - self.exploration.value(self.num_timesteps) +
                                self.exploration.value(self.num_timesteps) / float(self.env.action_space.n))
                    kwargs['reset'] = reset
                    kwargs[
                        'update_param_noise_threshold'] = update_param_noise_threshold
                    kwargs['update_param_noise_scale'] = True

                feasible_actions = AllocationEnv.get_feasible_actions(
                    obs["board_config"])
                action_mask = AllocationEnv.get_action_mask(
                    feasible_actions, self.action_space.n)
                with self.sess.as_default():
                    action = self.act(State.get_vec_observation(obs)[None],
                                      update_eps=update_eps,
                                      **kwargs,
                                      mask=action_mask)[0]
                reset = False
                # CHECK IF ACTION IS FEASIBLE
                action = AllocationEnv.check_action(obs['board_config'],
                                                    action)
                env_action = action
                new_obs, rew, done, info = self.env.step(env_action)
                print("action: {} - reward: {} - eps: {:.4}".format(
                    action, rew, update_eps))
                print(new_obs['day_vec'])
                print(new_obs['board_config'])
                # Store transition in the replay buffer.
                self.replay_buffer.add(State.get_vec_observation(obs), action,
                                       rew, State.get_vec_observation(new_obs),
                                       float(done))
                obs = new_obs

                if writer is not None:
                    ep_rew = np.array([rew]).reshape((1, -1))
                    ep_done = np.array([done]).reshape((1, -1))
                    self.episode_reward = total_episode_reward_logger(
                        self.episode_reward, ep_rew, ep_done, writer,
                        self.num_timesteps)

                episode_rewards[-1] += rew
                self.cumul_reward.append(self.cumul_reward[-1] + rew)
                if done:
                    maybe_is_success = info.get('is_success')
                    if maybe_is_success is not None:
                        episode_successes.append(float(maybe_is_success))
                    if not isinstance(self.env, VecEnv):
                        obs = self.env.reset()
                    episode_rewards.append(0.0)
                    reset = True

                # Do not train if the warmup phase is not over
                # or if there are not enough samples in the replay buffer
                can_sample = self.replay_buffer.can_sample(self.batch_size)
                if can_sample and self.num_timesteps > self.learning_starts \
                    and self.num_timesteps % self.train_freq == 0:
                    # Minimize the error in Bellman's equation on a batch sampled from replay buffer.
                    if self.prioritized_replay:
                        experience = self.replay_buffer.sample(
                            self.batch_size,
                            beta=self.beta_schedule.value(self.num_timesteps))
                        (obses_t, actions, rewards, obses_tp1, dones, weights,
                         batch_idxes) = experience
                    else:
                        obses_t, actions, rewards, obses_tp1, dones = self.replay_buffer.sample(
                            self.batch_size)
                        weights, batch_idxes = np.ones_like(rewards), None

                    if writer is not None:
                        # run loss backprop with summary, but once every 100 steps save the metadata
                        # (memory, compute time, ...)
                        if (1 + self.num_timesteps) % 100 == 0:
                            run_options = tf.RunOptions(
                                trace_level=tf.RunOptions.FULL_TRACE)
                            run_metadata = tf.RunMetadata()
                            summary, td_errors = self._train_step(
                                obses_t,
                                actions,
                                rewards,
                                obses_tp1,
                                obses_tp1,
                                dones,
                                weights,
                                sess=self.sess,
                                options=run_options,
                                run_metadata=run_metadata)
                            writer.add_run_metadata(
                                run_metadata, 'step%d' % self.num_timesteps)
                        else:
                            summary, td_errors = self._train_step(
                                obses_t,
                                actions,
                                rewards,
                                obses_tp1,
                                obses_tp1,
                                dones,
                                weights,
                                sess=self.sess)
                        writer.add_summary(summary, self.num_timesteps)
                    else:
                        _, td_errors = self._train_step(obses_t,
                                                        actions,
                                                        rewards,
                                                        obses_tp1,
                                                        obses_tp1,
                                                        dones,
                                                        weights,
                                                        sess=self.sess)

                    if self.prioritized_replay:
                        new_priorities = np.abs(
                            td_errors) + self.prioritized_replay_eps
                        self.replay_buffer.update_priorities(
                            batch_idxes, new_priorities)

                if can_sample and self.num_timesteps > self.learning_starts and \
                        self.num_timesteps % self.target_network_update_freq == 0:
                    # Update target network periodically.
                    self.update_target(sess=self.sess)

                if len(episode_rewards[-101:-1]) == 0:
                    mean_100ep_reward = -np.inf
                else:
                    mean_100ep_reward = round(
                        float(np.mean(episode_rewards[-101:-1])), 1)

                num_episodes = len(episode_rewards)
                if self.verbose >= 1 and done and log_interval is not None and len(
                        episode_rewards) % log_interval == 0:
                    logger.record_tabular("steps", self.num_timesteps)
                    logger.record_tabular("episodes", num_episodes)
                    if len(episode_successes) > 0:
                        logger.logkv("success rate",
                                     np.mean(episode_successes[-100:]))
                    logger.record_tabular("mean 100 episode reward",
                                          mean_100ep_reward)
                    logger.record_tabular(
                        "% time spent exploring",
                        int(100 * self.exploration.value(self.num_timesteps)))
                    logger.dump_tabular()
                print('timestamp: {}'.format(self.num_timesteps), end='\r\n')
                self.num_timesteps += 1

        return self

    def predict(self, observation, state=None, mask=None, deterministic=True):
        if isinstance(observation, dict):
            observation = State.get_vec_observation(observation)[None]
        vectorized_env = self._is_vectorized_observation(
            observation, self.observation_space)

        with self.sess.as_default():
            actions, _, _ = self.step_model.step(observation,
                                                 deterministic=deterministic,
                                                 mask=mask)

        if not vectorized_env:
            actions = actions[0]

        return actions, None

    def action_probability(self,
                           observation,
                           state=None,
                           mask=None,
                           actions=None,
                           logp=False):
        observation = np.array(observation)
        vectorized_env = self._is_vectorized_observation(
            observation, self.observation_space)

        observation = observation.reshape((-1, ) +
                                          self.observation_space.shape)
        actions_proba = self.proba_step(observation, state, mask)

        if actions is not None:  # comparing the action distribution, to given actions
            actions = np.array([actions])
            assert isinstance(self.action_space, gym.spaces.Discrete)
            actions = actions.reshape((-1, ))
            assert observation.shape[0] == actions.shape[
                0], "Error: batch sizes differ for actions and observations."
            actions_proba = actions_proba[np.arange(actions.shape[0]), actions]
            # normalize action proba shape
            actions_proba = actions_proba.reshape((-1, 1))
            if logp:
                actions_proba = np.log(actions_proba)

        if not vectorized_env:
            if state is not None:
                raise ValueError(
                    "Error: The environment must be vectorized when using recurrent policies."
                )
            actions_proba = actions_proba[0]

        return actions_proba

    def get_parameter_list(self):
        return self.params

    def save(self, save_path):
        # params
        data = {
            "checkpoint_path": self.checkpoint_path,
            "param_noise": self.param_noise,
            "learning_starts": self.learning_starts,
            "train_freq": self.train_freq,
            "prioritized_replay": self.prioritized_replay,
            "prioritized_replay_eps": self.prioritized_replay_eps,
            "batch_size": self.batch_size,
            "target_network_update_freq": self.target_network_update_freq,
            "checkpoint_freq": self.checkpoint_freq,
            "prioritized_replay_alpha": self.prioritized_replay_alpha,
            "prioritized_replay_beta0": self.prioritized_replay_beta0,
            "prioritized_replay_beta_iters":
            self.prioritized_replay_beta_iters,
            "exploration_final_eps": self.exploration_final_eps,
            "exploration_fraction": self.exploration_fraction,
            "learning_rate": self.learning_rate,
            "gamma": self.gamma,
            "verbose": self.verbose,
            "observation_space": self.observation_space,
            "action_space": self.action_space,
            "policy": self.policy,
            "n_envs": self.n_envs,
            "_vectorize_action": self._vectorize_action,
            "policy_kwargs": self.policy_kwargs
        }

        params_to_save = self.get_parameters()

        self._save_to_file(save_path, data=data, params=params_to_save)

    def update_weights(self, buffer):
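        """Run one training step on a batch sampled from the given replay buffer, using uniform importance weights."""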
        obses_t, actions, rewards, obses_tp1, dones = buffer.sample(
            self.batch_size)
        weights, batch_idxes = np.ones_like(rewards), None
        _, td_errors = self._train_step(obses_t,
                                        actions,
                                        rewards,
                                        obses_tp1,
                                        obses_tp1,  # fed twice: target-network and double-Q next-obs placeholders
                                        dones,
                                        weights,
                                        sess=self.sess)

    def learn_off_policy(self, total_timesteps, buffer):
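        """Train for `total_timesteps` gradient steps purely from the given replay buffer, without environment interaction."""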
        print("Off-policy training")
        for i in tqdm(range(total_timesteps)):
            self.update_weights(buffer)
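
A minimal usage sketch for the off-policy entry point above (the environment id, buffer size, and step count are illustrative; `ReplayBuffer` is assumed to be the project's replay-buffer class and must be filled with transitions beforehand):

# Hypothetical offline-training usage: update the DQN weights purely from logged transitions.
model = DQN("MlpPolicy", "CartPole-v1", verbose=1)
offline_buffer = ReplayBuffer(50000)  # assumed pre-filled with (obs, action, reward, next_obs, done) tuples
# ... populate offline_buffer from a logged dataset ...
model.learn_off_policy(total_timesteps=10000, buffer=offline_buffer)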
Code example #12
class PacmanPWDQN(Agent):
    """
        Creates the Wide Deep Q-Network Agent that iterates with the environment
        In addition, this agent can be set up to purely Linear or DQN Agent
        """

    def __init__(self, args):
        # Load parameters from user-given arguments

        self.params = json_to_dict(args["path"])
        os.environ["CUDA_VISIBLE_DEVICES"] = str(self.params["GPU"])
        self.params['width'] = args['width']
        self.params['height'] = args['height']
        self.params['num_training'] = args['numTraining']
        self.params['num_games'] = args['numGames']
        self.path_extra = ""
        self.params["seed"] = args['seed']
        self.random = np.random.RandomState(self.params["seed"])
        self.beta_schedule = None
        # time started
        self.general_record_time = time.strftime("%a_%d_%b_%Y_%H_%M_%S", time.localtime())
        self.start_time = time.time()
        self.rank_sort = None

        if self.params["prioritized"]: # For using PrioritizedReplayBuffer
            if self.params["ranked"]:
                N_list = [self.params["batch_size"]] + [int(x) for x in np.linspace(100, self.params["mem_size"], 5)]
                save_quantiles(N_list=N_list, k=self.params["batch_size"],
                               alpha=self.params["prioritized_replay_alpha"], name=self.params["save_file"])
                self.replay_buffer = RankBasedReplay(self.params["mem_size"],
                                                     self.params["prioritized_replay_alpha"],
                                                     name=self.params["save_file"])
                if self.params["sort_rank"] == None:  # For sorting rankbased buffer
                    self.rank_sort = int(self.params["mem_size"] * 0.01)
                else:
                    self.rank_sort = self.params["sort_rank"]
            else:
                self.replay_buffer = PrioritizedReplayBuffer(self.params["mem_size"],
                                                             self.params["prioritized_replay_alpha"])
            if self.params["prioritized_replay_beta_iters"] is None:
                prioritized_replay_beta_iters = self.params['num_training']
            else:
                prioritized_replay_beta_iters = self.params['prioritized_replay_beta_iters']

            # Anneal the importance-sampling exponent beta linearly from beta0 to 1.0.
            self.beta_schedule = LinearSchedule(prioritized_replay_beta_iters,
                                                initial_p=self.params['prioritized_replay_beta0'],
                                                final_p=1.0)
        else:
            self.replay_buffer = ReplayBuffer(self.params["mem_size"])
            self.beta_schedule = None

        if self.params["only_dqn"]:
            print("Initialise DQN Agent")
        elif self.params["only_lin"]:
            print("Initialise Linear Approximative Agent")
        else:
            print("Initialise WDQN Agent")

        print(self.params["save_file"])
        if self.params["prioritized"]:
            if self.params["ranked"]:
                print("Using Rank-Based Experience Replay Buffer")
            else:
                print("Using Prioritized Experience Replay Buffer")

        if self.params["model_shift"]:
            print("Using Model Shift")

        print("seed", self.params["seed"])
        print("Starting time:", self.general_record_time)

        # Start Tensorflow session
        tf.reset_default_graph()
        tf.set_random_seed(self.params["seed"])
        self.qnet = WDQN(self.params, "model")  # Q-network
        self.tnet = WDQN(self.params, "target_model")  # Q-target-network
        self.saver = tf.train.Saver()
        self.sess = tf.Session()
        self.qnet.set_session(self.sess)
        self.tnet.set_session(self.sess)
        self.sess.run(tf.global_variables_initializer())

        # Q and cost
        self.Q_global = []

        # Stats
        self.cnt = self.qnet.sess.run(self.qnet.global_step_dqn)
        self.local_cnt = 0
        self.wins = 0
        self.best_int = self.params["shift_best"]

        self.numeps = 0
        self.model_eps = 0
        self.episodeStartTime = time.time()
        self.last_steps = 0

        self.get_direction = lambda k: ['North', 'South', 'East', 'West', 'Stop'][k]
        self.get_value = {'North': 0, 'South': 1, 'East': 2, 'West': 3, 'Stop': 4}

        self.lastWindowAccumRewards = 0.0
        self.Q_accumulative = 0.0
        self.accumTrainRewards = 0.0
        self.sub_dir = str(self.params["save_interval"])

    def registerInitialState(self, state):
        """Inspects the starting state"""
        # Reset reward
        self.last_score = 0
        self.last_reward = 0.

        # Reset state
        self.last_state = None
        self.current_state = state

        # Reset actions
        self.last_action = None

        # Reset vars
        self.terminal = None
        self.won = True
        self.Q_global = []

        # Shift Model between WDQN and DQN during training
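        # Every val_shift episodes (once start_shift is reached) toggle between pure DQN and WDQN,
        # then switch back to WDQN for the final training episode so that testing uses WDQN.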
        if self.params["model_shift"] and (self.numeps + 1) <= self.params['num_training']:
            if (self.numeps + 1) >= self.params["start_shift"] and (self.numeps + 1) % self.params["val_shift"] == 0:
                if self.params["only_dqn"]:
                    self.params["only_dqn"] = False
                    print("Using WDQN Agent starting from eps", (self.numeps + 1))
                else:
                    self.params["only_dqn"] = True
                    print("Using DQN Agent starting from eps", (self.numeps + 1))
            if self.params["model_shift"] and (self.numeps + 1) == self.params['num_training'] and not self.params["only_dqn"]:  # Back to WDQN at the end
                self.params["only_dqn"] = False
                print("Back to WDQN Agent for testing")

        # Load model
        self.load_mod()

        # Next
        self.numeps += 1


    def getQvalues(self, model, dropout):
        """Access Q Values by using the model prediction of WDQN.py"""
        if self.params["only_dqn"]:
            return model.predict_dqn(map_state_mat(self.current_state), dropout)[0]
        elif self.params["only_lin"]:
            return model.predict_lin(mat_features(self.current_state, ftrs=self.params["feat_val"]), dropout)[0]
        else:
            return model.predict_wdqn(map_state_mat(self.current_state),
                                      mat_features(self.current_state, ftrs=self.params["feat_val"]), dropout)[0]

    def getPolicy(self, model, dropout=1.0):
        """Pick up the policy """
        qValues = self.getQvalues(model, dropout)
        qVal = {self.get_value[l]: qValues[self.get_value[l]] for l in self.current_state.getLegalActions(0) if
                not l == "Stop"}
        maxValue = max(qVal.values())
        self.Q_global.append(maxValue)
        return self.get_direction(self.random.choice([k for k in qVal.keys() if qVal[k] == maxValue]))

    def getAction(self, state):
        """Exploit / Explore"""
        if self.random.rand() > self.params['eps']:
            # Exploit action
            move = self.getPolicy(self.qnet)  # dropout deactivated
        else:
            legal = [v for v in state.getLegalActions(0) if not v == "Stop"]
            move = self.random.choice(legal)
        # Save last_action
        self.last_action = self.get_value[move]
        return move

    def observationFunction(self, state):
        """Do observation"""
        self.terminal = False
        self.observation_step(state)
        return state

    def observation_step(self, state):
        """
        Realize the observation step
        Rewards are balanced in this part
        The training occurs in this section
        """
        if self.last_action is not None:
            # Process current experience state
            self.last_state = self.current_state.deepCopy()
            self.current_state = state
            # Process current experience reward
            reward = state.getScore() - self.last_score
            self.last_score = state.getScore()
            # Reward system
            if reward > 20:
                self.last_reward = 50  # 0.1 # Eat ghost
            elif reward > 0:
                self.last_reward = 10  # 0.02 # Eat food
            elif reward < -10:
                self.last_reward = -500.  # -1 # Get eaten
                self.won = False
            elif reward < 0:
                self.last_reward = -1  # -0.002 # Punish time
            if (self.terminal and self.won):
                self.last_reward = 100  # 0.2 # Won

            if self.isInTraining():
                # Copy values to target network
                if self.local_cnt % self.params["target_update_network"] == 0 \
                        and self.local_cnt > self.params['train_start']:
                    self.tnet.rep_network(self.qnet)
                    print("Copied model parameters to target network. total_t = %s, period = %s" % (
                        self.local_cnt, self.params["target_update_network"]))

                # Store last experience into memory
                if self.params["prioritized"] and self.params["ranked"]:
                    self.replay_buffer.add((self.last_state, self.last_action, float(self.last_reward),
                                            self.current_state,
                                            self.terminal))
                else:
                    self.replay_buffer.add(self.last_state, self.last_action, float(self.last_reward),
                                           self.current_state,
                                           self.terminal)
                # Train
                self.train()

                # Next
                self.local_cnt += 1
                if self.local_cnt == self.params['train_start']:
                    print("")
                    print("Memory Replay populated")
                    print("")
                    self.model_eps = self.numeps

                    # with open('data/lin_rb.pickle', 'wb') as handle:
                    #    pickle.dump(self.replay_buffer, handle)
                    #    print("Pickle Saved")
                    #    print(10 + "n")

                # Linear epsilon annealing: decay from 1.0 toward eps_final over eps_step training steps.
                self.params['eps'] = max(self.params['eps_final'],
                                         1.00 - float(self.cnt) / float(self.params['eps_step']))

    def train(self):
        """Train different agents: WDQN, DQN and Linear"""
        if self.local_cnt > self.params['train_start']:
            if self.params["only_dqn"]:
                batch_s_dqn, batch_a, batch_t, qt_dqn, batch_r, batch_idxes, weights = extract_batches_per(self.params,
                                                                                                           self.tnet,
                                                                                                           self.replay_buffer,
                                                                                                           self.beta_schedule,
                                                                                                           (self.numeps-self.model_eps))
                self.cnt, td_errors = self.qnet.train(batch_s_dqn, None, batch_a, batch_t, qt_dqn, None, None, batch_r,
                                                      self.params["dropout"],
                                                      self.params["only_dqn"],
                                                      self.params["only_lin"], weights)

            elif self.params["only_lin"]:
                batch_s_lin, batch_a, batch_t, qt_lin, batch_r, batch_idxes, weights = extract_batches_per(self.params,
                                                                                                           self.tnet,
                                                                                                           self.replay_buffer,
                                                                                                           self.beta_schedule,
                                                                                                           (self.numeps-self.model_eps))
                self.cnt, td_errors = self.qnet.train(None, batch_s_lin, batch_a, batch_t, None, qt_lin, None, batch_r,
                                                      self.params["dropout"],
                                                      self.params["only_dqn"],
                                                      self.params["only_lin"], weights)
            else:
                batch_s_dqn, batch_s_lin, batch_a, batch_t, qt_lin, qt_dqn, qt_wdqn, batch_r, batch_idxes, weights = extract_batches_per(
                    self.params, self.tnet,
                    self.replay_buffer, self.beta_schedule, (self.numeps-self.model_eps))
                self.cnt, td_errors = self.qnet.train(batch_s_dqn, batch_s_lin, batch_a, batch_t, qt_dqn, qt_lin,
                                                      qt_wdqn,
                                                      batch_r,
                                                      self.params["dropout"],
                                                      self.params["only_dqn"],
                                                      self.params["only_lin"], weights)

            if self.params["prioritized"]:
                new_priorities = np.abs(td_errors) + self.params["prioritized_replay_eps"]
                self.replay_buffer.update_priorities(batch_idxes, new_priorities)
                if self.params["ranked"] and self.cnt % self.rank_sort == 0:
                    self.replay_buffer.sort()

    def final(self, state):
        """Inspects the last state"""
        # Do observation
        self.terminal = True
        self.observation_step(state)
        NUM_EPS_UPDATE = 100
        self.lastWindowAccumRewards += state.getScore()  #
        self.accumTrainRewards += state.getScore()
        self.Q_accumulative += max(self.Q_global, default=float('nan'))
        self.wins += self.won

        if self.numeps % NUM_EPS_UPDATE == 0:
            # Print stats
            eps_time = time.time() - self.episodeStartTime
            print('Reinforcement Learning Status:')
            if self.numeps <= self.params['num_training']:
                trainAvg = self.accumTrainRewards / float(self.numeps)
                print('\tCompleted %d out of %d training episodes' % (
                    self.numeps, self.params['num_training']))
                print('\tAverage Rewards over all training: %.2f' % (
                    trainAvg))

            windowAvg = self.lastWindowAccumRewards / float(NUM_EPS_UPDATE)
            windowQavg = self.Q_accumulative / float(NUM_EPS_UPDATE)
            window_steps = (self.cnt - self.last_steps) / float(NUM_EPS_UPDATE)
            print('\tAverage Rewards for last %d episodes: %.2f' % (
                NUM_EPS_UPDATE, windowAvg))
            print('\tEpisode took %.2f seconds' % (eps_time))
            print('\tEpsilon is %.8f' % self.params["eps"])
            print('\tLinearly decayed learning rate is %.8f' % self.sess.run(self.qnet.lr_lin))
            if self.params["save_logs"]:
                log_file = open('logs/' + self.params["save_file"] + "-" + str(self.general_record_time) + '-l-' + str(
                    (self.params["num_training"])) + '.log',
                                'a')
                log_file.write("# %4d |  s: %8d | t: %.2f  |  r: %12f | Q: %10f | won: %r \n" %
                               (self.numeps, window_steps, eps_time, windowAvg,
                                windowQavg, self.wins))

            # Save Best Model
            if windowAvg >= self.params["best_thr"]:
                self.params["best_thr"] = windowAvg
                self.last_steps = self.cnt
                self.save_mod(best_mod=True)
                print("Saving the model with:", self.params["best_thr"])
                self.params["best_thr"] = self.params["best_thr"] + self.best_int

            sys.stdout.flush()
            self.lastWindowAccumRewards = 0
            self.Q_accumulative = 0
            self.last_steps = self.cnt
            self.wins = 0
            self.episodeStartTime = time.time()

        if self.numeps >= self.params['num_training']:
            eps_time = time.time() - self.episodeStartTime
            if self.numeps == self.params['num_training']:
                print("Starting Date of Training:", self.general_record_time)
                print("Ending Date of Training:", time.strftime("%a_%d_%b_%Y_%H_%M_%S", time.localtime()))
                print("Training time duration in minutes:", (time.time() - self.start_time) / 60)
                print('Training Done (turning off epsilon)')
            self.params["eps"] = 0.0  # no exploration

            log_path = ('logs/testedModels/' + self.params["save_file"] + '-s-' + str(self.params["seed"]) +
                        '-n-' + str(self.params['num_games'] - self.params["num_training"]) + '.log')
            with open(log_path, 'a') as log_file:
                log_file.write("# %4d |  s: %8d | t: %.2f  |  r: %12f | Q: %10f | won: %r \n" %
                               (self.numeps, self.cnt - self.last_steps, eps_time, state.getScore(),
                                max(self.Q_global, default=float('nan')),
                                int(self.won)))
            self.last_steps = self.cnt
        # save model
        self.save_mod(best_mod=False)

    def isInTraining(self):
        """Check is if agent is in training"""
        return self.numeps < self.params["num_training"]

    def isInTesting(self):
        """Check is if agent is in testing"""
        return not self.isInTraining()

    def load_mod(self):
        """ Load data and model"""
        if self.params["load"]:
            try:
                print("".join([self.path_extra, "model/", self.params["save_file"], "-", self.params["load_file"]]))

                self.saver.restore(self.sess,
                                   "".join([self.path_extra, "model/", self.params["save_file"], "-",
                                            self.params["load_file"]]))
                if not self.params["load_file"].lower() == "best":
                    print("Model Restored")
                else:
                    print("Best Model Restored")
                try:
                    load_path = "".join(
                        [self.path_extra, "parameters/", "params_", self.params["save_file"], "-",
                         self.params["load_file"].lower(), ".npy"])

                    # Parameters to be preserved in params when loading a checkpoint
                    save, save_interval, num_tr, load_data, best_thr, eps_final, dropout, decay_lr, decay_lr_val, only_dqn, only_lin, load_file, num_games, seed, save_logs = \
                        self.params["save"], self.params["save_interval"], \
                        self.params["num_training"], self.params["load_data"], \
                        self.params["best_thr"], self.params["eps_final"], self.params["dropout"], self.params[
                            "dcy_lrl"], \
                        self.params["dcy_lrl_val"], self.params["only_dqn"], self.params["only_lin"], self.params[
                            "load_file"], self.params["num_games"], self.params["seed"], self.params["save_logs"]

                    # Load saved parameters and hyperparameters of the new starting point
                    self.last_steps, self.accumTrainRewards, self.numeps, self.params, self.local_cnt, self.cnt, self.sub_dir = np.load(
                        load_path, allow_pickle=True)  # allow_pickle is required because the saved array contains Python objects
                    orig_num_training = self.params["num_training"]
                    # Load newest Parameters to params
                    orig_save_int = self.params["save_interval"]

                    self.params["save"], self.params["save_interval"], self.params["num_training"], \
                    self.params["load_data"], self.params["best_thr"], self.params["eps_final"], self.params["dropout"], \
                    self.params["dcy_lrl"], self.params["dcy_lrl_val"], self.params["only_dqn"], self.params[
                        "only_lin"], self.params["load_file"], \
                    self.params["num_games"], self.params["seed"], self.params["save_logs"] = save, save_interval, \
                                                                                              num_tr, load_data, best_thr, eps_final, \
                                                                                              dropout, decay_lr, decay_lr_val, only_dqn, \
                                                                                              only_lin, load_file, num_games, seed, save_logs

                    if self.sub_dir == "best":
                        print("Best Parameters Restored")
                    else:
                        print("Parameters Restored")

                    if self.params["load_data"]:  # Load data and starts with correct data
                        self.params["num_training"] += orig_num_training
                        if not self.params["load_file"].lower() == "best":
                            src = "".join(
                                [self.path_extra, "data/mem_rep_", self.params["save_file"], "/",
                                 self.params["save_file"], "-", str(self.sub_dir), ".pickle"])

                            print("Interval Data to be Restored")
                        else:
                            src = "".join(
                                [self.path_extra, "data/mem_rep_", self.params["save_file"], "/",
                                 self.params["save_file"], "-", self.sub_dir, ".pickle"])
                            print("Best Data to be Restored")
                        with open(src, 'rb') as handle:
                            self.replay_buffer = pickle.load(handle)
                        print("Data Restored")
                except Exception as e:
                    print(e)
                    print("Parameters don't exist or could not be properly loaded")
            except Exception as e:
                print(e)
                print("Model doesn't exist or could not be properly loaded")
            self.params["load"] = False

    def save_mod(self, best_mod=False):
        """
        Saving model and parameters
        Possibility of saving the best model
        """
        if (self.numeps % self.params["save_interval"] == 0 and self.params["save"]) or (
                    best_mod and self.params["save"]):
            self.params["global_step"] = self.cnt
            save_files = [self.last_steps, self.accumTrainRewards, self.numeps, self.params, self.local_cnt, self.cnt]
            try:
                if best_mod:
                    self.saver.save(self.sess,
                                    "".join([self.path_extra, "model/", self.params["save_file"], "-", "best"]))
                    print("Best Model Saved")
                elif not self.sub_dir == "best":
                    self.saver.save(self.sess, "".join(
                        [self.path_extra, "model/", self.params["save_file"], "-", str(self.numeps)]))
                    print("Model Saved")
            except Exception as e:
                print("Model could not be saved")
                print("Error", e)
            try:
                if str(self.numeps) == self.sub_dir:  # Save memory replay and parameters
                    dic = "".join(
                        [self.path_extra, "data/mem_rep_", self.params["save_file"], "/"])
                    f_name = "".join([self.params["save_file"], "-", str(self.sub_dir), ".pickle"])
                    # 1
                    save_files.append(self.sub_dir)

                    np.save("".join(
                        [self.path_extra, "parameters/", "params_", self.params["save_file"], "-", str(self.numeps)]),
                        save_files)
                    print("Pameters Saved")
                    save_rep_buf(self.replay_buffer, dic, f_name)
                    self.sub_dir = str(self.numeps + self.params["save_interval"])
                    print("Memory Replay Saved")
                elif best_mod:  # Save memory replay of best model in directory "best" and parameters
                    dic = "".join(
                        [self.path_extra, "data/mem_rep_", self.params["save_file"], "/"])
                    f_name = "".join([self.params["save_file"], "-", "best", ".pickle"])
                    save_files.append("best")
                    np.save("".join([self.path_extra, "parameters/", "params_", self.params["save_file"], "-", "best"]),
                            save_files)
                    print("Best Pameters Saved")
                    save_rep_buf(self.replay_buffer, dic, f_name)
                    print("Best Memory Replay Saved")
            except Exception as e:
                print("Parameters could not be saved")
                print("Error", e)